/*
 * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

/*
 * Native method support for java.util.zip.CRC32
 */

#include "jni.h"
#include "jni_util.h"
#include <zlib.h>
#include "java_util_zip_CRC32.h"

/* Define CAN_COMPILE_CLMUL as 0 to disable fastcrc32 completely. */
#ifndef CAN_COMPILE_CLMUL
/* Windows is not supported -- different assembly language syntax, and
   though the newer compilers support the intrinsics, the generated code
   is not very good. */
# ifndef _WIN32
#  ifdef __x86_64
#   define CAN_COMPILE_CLMUL 1
#  elif defined(__i386)
#   define CAN_COMPILE_CLMUL 1
#  endif
# endif /* _WIN32 */
#endif

#if CAN_COMPILE_CLMUL
#include <stdint.h>
#include <stdlib.h>

struct crc_by128_K {
    /* The fields in this structure are arranged so that if it is
     * allocated at a 16-byte alignment they can be picked up two at
     * a time with 128-bit loads.
     *
     * Because of the flipped bit order used for this CRC polynomial,
     * the constant for X**N is left-shifted by 1.  This is because a
     * 64 x 64 polynomial multiply produces a 127-bit result, but the
     * highest term is always aligned to bit 0 in the container.
     * Pre-shifting by one fixes this, at the cost of potentially making
     * the 32-bit constant no longer fit in a 32-bit container (thus the
     * use of uint64_t, though this is also the size used by the carry-
     * less multiply instruction).
     *
     * In addition, the flipped bit order and the highest-term-at-least-
     * significant-bit multiply change the constants used.  The 96-bit
     * result will be aligned to the high-term end of the target 128-bit
     * container, not the low-term end; that is, instead of a 512-bit or
     * 576-bit fold, it is a 480 (= 512 - 32) or 544 (= 512 + 64 - 32)
     * bit fold.
     *
     * This causes additional problems in the 128-to-64-bit reduction;
     * see the code for details.  By storing a mask in the otherwise
     * unused half of a 128-bit constant, bits can be cleared before
     * multiplication without storing and reloading.  Note that staying
     * on a 128-bit datapath means that some data is uselessly stored
     * and some unused data is intersected with an irrelevant constant.
     */

    uint64_t mask;   /* low of K_M_64     */
    uint64_t xtt64;  /* high of K_M_64    */
    uint64_t xtt160; /* low of K_160_96   */
    uint64_t xtt96;  /* high of K_160_96  */
    uint64_t xtt544; /* low of K_544_480  */
    uint64_t xtt480; /* high of K_544_480 */
};

struct crc_by128_K * K_struct = 0;

/* X**N mod P(x), bit-reflected and pre-shifted by 1 as described above.
   (A compiled-out sanity check at the end of this file shows one way to
   re-derive these values.) */
static const uint64_t x64  = (uint64_t) 0xb1e6b092U << 1;
static const uint64_t x96  = (uint64_t) 0x6655004fU << 1;
static const uint64_t x160 = (uint64_t) 0xba8ccbe8U << 1;
static const uint64_t x480 = (uint64_t) 0xe3720acbU << 1;
static const uint64_t x544 = (uint64_t) 0xaa2215eaU << 1;

static struct crc_by128_K * init_crc_by128_K() {
    void * y;
    y = malloc(16 + sizeof(struct crc_by128_K)); // posix_memalign not available on all platforms
    if (y == NULL) {
        return (struct crc_by128_K *) NULL;
    }
    /* Round up to the next 16-byte boundary by hand. */
    uint8_t * z = (uint8_t *) y;
    intptr_t p = (intptr_t) z;
    z += (16 - p) & 15;
    struct crc_by128_K * x = (struct crc_by128_K *) z;
    x -> mask   = 0xffffffffUL;
    x -> xtt64  = x64;
    x -> xtt160 = x160;
    x -> xtt96  = x96;
    x -> xtt544 = x544;
    x -> xtt480 = x480;
    return x;
}

uint32_t fastcrc32(jint crc, Bytef * buf, jint len);

/* Flag governing use of the "CLMUL" instruction.  For now, it implies
   little-endian.  Computed dynamically; it incorporates information about
   the current hardware and the compiler used to compile this file. */
static int useClmul = 0;
#else
/* Cannot compile CLMUL; stub out fastcrc32. */
# define fastcrc32 crc32
# define useClmul 0
#endif

/* A local copy of the CRC32 table is used to fill and drain the CLMUL CRC.
   Extra members beyond the first 256-entry row are ignored. */
static const unsigned long FAR * crc_table;

/* Initialize the Java-side table (for small CRCs) to avoid extra startup
   work, and capture the platform-dependent useClmul flag. */
JNIEXPORT jboolean JNICALL
Java_java_util_zip_CRC32_init(JNIEnv *env, jclass cls, jarray b, jboolean use_clmul)
{
    /* Get the CRC table from zlib to initialize the Java side.  Our
       private copy is missing if not compiled for fastcrc32. */
    jint *buf = (*env)->GetPrimitiveArrayCritical(env, b, 0);
    crc_table = get_crc_table();

    if (buf) {
        /* Don't know for sure how big an unsigned long is, therefore
           copy one entry at a time. */
        int i;
        for (i = 0; i < 256; i++)
            buf[i] = (jint) (crc_table[i]);
        (*env)->ReleasePrimitiveArrayCritical(env, b, buf, 0);
    }
#if CAN_COMPILE_CLMUL
    if (use_clmul) {
        K_struct = init_crc_by128_K();
        useClmul = K_struct != 0;
        /* Rather than throw OOME, just do without fast CRC. */
    }
#endif
    return useClmul;
}

JNIEXPORT jint JNICALL
Java_java_util_zip_CRC32_update(JNIEnv *env, jclass cls, jint crc, jint b)
{
    Bytef buf[1];

    buf[0] = (Bytef)b;
    return crc32(crc, buf, 1); // single byte not done quickly by fastcrc32
}

JNIEXPORT jint JNICALL
Java_java_util_zip_CRC32_updateBytes(JNIEnv *env, jclass cls, jint crc,
                                     jarray b, jint off, jint len)
{
    Bytef *buf = (*env)->GetPrimitiveArrayCritical(env, b, 0);
    if (buf) {
        crc = (jint) (useClmul ?
                      fastcrc32(crc, buf + off, len) :
                      crc32(crc, buf + off, len));
        (*env)->ReleasePrimitiveArrayCritical(env, b, buf, 0);
    }
    return crc;
}

JNIEXPORT jint
ZIP_CRC32(jint crc, const jbyte *buf, jint len)
{
    return (jint) (useClmul ?
                   fastcrc32(crc, (Bytef*)buf, len) :
                   crc32(crc, (Bytef*)buf, len));
}

JNIEXPORT jint JNICALL
Java_java_util_zip_CRC32_updateByteBuffer(JNIEnv *env, jclass cls, jint crc,
                                          jlong address, jint off, jint len)
{
    Bytef *buf = (Bytef *)jlong_to_ptr(address);
    if (buf) {
        crc = (jint) (useClmul ?
                      fastcrc32(crc, buf + off, len) :
                      crc32(crc, buf + off, len));
    }
    return crc;
}

#if CAN_COMPILE_CLMUL
#ifndef NO_ASM

/* set up the platform-specific glop surrounding the function body.
*/ # ifdef __x86_64 # ifdef __APPLE__ # define ASM_PREFIX ".text\n\t.align 8\n\t.globl _kernel\n_kernel:\n\t" # define ASM_SUFFIX "" # elif defined(__GNUC__) # define ASM_PREFIX ".text\n\t.align 16\n\t.globl kernel\n\t.type kernel,@function\nkernel:\n\t" # define ASM_SUFFIX "" # elif defined(__SUNPRO_C) # define ASM_PREFIX ".section .text,\"ax\"\n\t.align 16, 0x90\n\t.globl kernel\n\t.type kernel,@function\nkernel:\n\t" # define ASM_SUFFIX ".size kernel,.-kernel" # else /* Perhaps the mystery compiler can handle the intrinsics. */ # define NO_ASM 1 # endif # ifndef NO_ASM __asm__( ASM_PREFIX " pushq %rbp\n\t" " movq %rsp, %rbp\n\t" " movl %edi, %eax\n\t" " .byte 0xc5,0xf9,0x6f,0x06 # vmovdqa(%rsi), %xmm0\n\t" " .byte 0xc4,0xe1,0xf9,0x7e,0xc7 # vmovd %xmm0, %rdi\n\t" " xorq %rax, %rdi\n\t" " .byte 0xc4,0xe3,0xf9,0x22,0xd7,0x00 # vpinsrq$0, %rdi, %xmm0, %xmm2\n\t" " .byte 0xc5,0x79,0x6f,0x01 # vmovdqa(%rcx), %xmm8\n\t" " .byte 0xc5,0x79,0x6f,0x49,0x10 # vmovdqa16(%rcx), %xmm9\n\t" " movl $1, %eax\n\t" " cmpl $4, %edx\n\t" " jl 1f\n\t" " .byte 0xc5,0xf9,0x6f,0x6e,0x10 # vmovdqa16(%rsi), %xmm5\n\t" " .byte 0xc5,0xf9,0x6f,0x66,0x20 # vmovdqa32(%rsi), %xmm4\n\t" " .byte 0xc5,0xf9,0x6f,0x5e,0x30 # vmovdqa48(%rsi), %xmm3\n\t" " leal -3(%rdx), %edi\n\t" " movl $4, %eax\n\t" " cmpl $5, %edi\n\t" " jl 2f\n\t" " .byte 0xc5,0xf9,0x6f,0x71,0x20 # vmovdqa32(%rcx), %xmm6\n\t" " leaq 112(%rsi), %rcx\n\t" " movl $4, %eax\n\t" " .align 4, 0x90\n" "3: .byte 0xc4,0xe3,0x49,0x44,0xc2,0x00 # vpclmulqdq$0, %xmm2, %xmm6, %xmm0\n\t" " .byte 0xc4,0xe3,0x49,0x44,0xcb,0x11 # vpclmulqdq$17, %xmm3, %xmm6, %xmm1\n\t" " .byte 0xc4,0xe3,0x49,0x44,0xdb,0x00 # vpclmulqdq$0, %xmm3, %xmm6, %xmm3\n\t" " .byte 0xc5,0xe1,0xef,0x19 # vpxor (%rcx), %xmm3, %xmm3\n\t" " .byte 0xc4,0xe3,0x49,0x44,0xfd,0x00 # vpclmulqdq$0, %xmm5, %xmm6, %xmm7\n\t" " .byte 0xc5,0xc1,0xef,0x79,0xe0 # vpxor -32(%rcx), %xmm7, %xmm7\n\t" " .byte 0xc5,0xf1,0xef,0xdb # vpxor %xmm3, %xmm1, %xmm3\n\t" " .byte 0xc4,0xe3,0x49,0x44,0xd2,0x11 # vpclmulqdq$17, %xmm2, %xmm6, %xmm2\n\t" " .byte 0xc5,0xf9,0xef,0x41,0xd0 # vpxor -48(%rcx), %xmm0, %xmm0\n\t" " .byte 0xc4,0xe3,0x49,0x44,0xcd,0x11 # vpclmulqdq$17, %xmm5, %xmm6, %xmm1\n\t" " .byte 0xc4,0xe3,0x49,0x44,0xec,0x11 # vpclmulqdq$17, %xmm4, %xmm6, %xmm5\n\t" " .byte 0xc4,0xe3,0x49,0x44,0xe4,0x00 # vpclmulqdq$0, %xmm4, %xmm6, %xmm4\n\t" " .byte 0xc5,0xd9,0xef,0x61,0xf0 # vpxor -16(%rcx), %xmm4, %xmm4\n\t" " .byte 0xc5,0xd1,0xef,0xe4 # vpxor %xmm4, %xmm5, %xmm4\n\t" " .byte 0xc5,0xf1,0xef,0xef # vpxor %xmm7, %xmm1, %xmm5\n\t" " .byte 0xc5,0xe9,0xef,0xd0 # vpxor %xmm0, %xmm2, %xmm2\n\t" " addq $64, %rcx\n\t" " addl $4, %eax\n\t" " cmpl %edi, %eax\n\t" " jl 3b\n" "2: .byte 0xc4,0xe3,0x31,0x44,0xc2,0x11 # vpclmulqdq$17, %xmm2, %xmm9, %xmm0\n\t" " .byte 0xc4,0xe3,0x31,0x44,0xca,0x00 # vpclmulqdq$0, %xmm2, %xmm9, %xmm1\n\t" " .byte 0xc5,0xd1,0xef,0xc9 # vpxor %xmm1, %xmm5, %xmm1\n\t" " .byte 0xc5,0xf1,0xef,0xc8 # vpxor %xmm0, %xmm1, %xmm1\n\t" " .byte 0xc4,0xe3,0x31,0x44,0xc1,0x11 # vpclmulqdq$17, %xmm1, %xmm9, %xmm0\n\t" " .byte 0xc4,0xe3,0x31,0x44,0xc9,0x00 # vpclmulqdq$0, %xmm1, %xmm9, %xmm1\n\t" " .byte 0xc5,0xd9,0xef,0xc9 # vpxor %xmm1, %xmm4, %xmm1\n\t" " .byte 0xc5,0xf1,0xef,0xc8 # vpxor %xmm0, %xmm1, %xmm1\n\t" " .byte 0xc4,0xe3,0x31,0x44,0xc1,0x11 # vpclmulqdq$17, %xmm1, %xmm9, %xmm0\n\t" " .byte 0xc4,0xe3,0x31,0x44,0xc9,0x00 # vpclmulqdq$0, %xmm1, %xmm9, %xmm1\n\t" " .byte 0xc5,0xe1,0xef,0xc9 # vpxor %xmm1, %xmm3, %xmm1\n\t" " .byte 0xc5,0xf1,0xef,0xd0 # vpxor %xmm0, %xmm1, %xmm2\n" "1: cmpl %edx, %eax\n\t" " 
jge 4f\n\t" " subl %eax, %edx\n\t" " movslq %eax, %rax\n\t" " shlq $4, %rax\n\t" " addq %rax, %rsi\n\t" " .align 4, 0x90\n" "5: .byte 0xc4,0xe3,0x31,0x44,0xc2,0x11 # vpclmulqdq$17, %xmm2, %xmm9, %xmm0\n\t" " .byte 0xc4,0xe3,0x31,0x44,0xca,0x00 # vpclmulqdq$0, %xmm2, %xmm9, %xmm1\n\t" " .byte 0xc5,0xf1,0xef,0x0e # vpxor (%rsi), %xmm1, %xmm1\n\t" " .byte 0xc5,0xf1,0xef,0xd0 # vpxor %xmm0, %xmm1, %xmm2\n\t" " addq $16, %rsi\n\t" " decl %edx\n\t" " jne 5b\n" "4: .byte 0xc4,0xe3,0x39,0x44,0xc2,0x01 # vpclmulqdq$1, %xmm2, %xmm8, %xmm0\n\t" " .byte 0xc4,0xe1,0xf9,0x7e,0xc0 # vmovd %xmm0, %rax\n\t" " .byte 0xc4,0xe3,0xf9,0x16,0xc1,0x01 # vpextrq$1, %xmm0, %rcx\n\t" " shldq $32, %rax, %rcx\n\t" " .byte 0xc5,0xb9,0xdb,0xc0 # vpand %xmm0, %xmm8, %xmm0\n\t" " .byte 0xc4,0xe3,0x39,0x44,0xc0,0x01 # vpclmulqdq$1, %xmm0, %xmm8, %xmm0\n\t" " .byte 0xc4,0xe1,0xf9,0x7e,0xc2 # vmovd %xmm0, %rdx\n\t" " .byte 0xc4,0xe3,0xf9,0x16,0xd0,0x01 # vpextrq$1, %xmm2, %rax\n\t" " xorq %rdx, %rax\n\t" " xorq %rcx, %rax\n\t" " popq %rbp\n\t" " ret\n" ASM_SUFFIX ); # endif # elif defined(__i386) /* set up the platform-specific glop surrounding the function body. */ # ifdef __APPLE__ # define ASM_PREFIX ".text\n\t.align 16\n\t.globl _kernel\n_kernel:\n\t" # define ASM_SUFFIX "" # elif defined(__GNUC__) # define ASM_PREFIX ".text\n\t.align 16\n\t.globl kernel\n\t.type kernel,@function\nkernel:\n\t" # define ASM_SUFFIX "" # elif defined(__SUNPRO_C) # define ASM_PREFIX ".section .text,\"ax\"\n\t.align 16, 0x90\n\t.globl kernel\n\t.type kernel,@function\nkernel:\n\t" # define ASM_SUFFIX ".size kernel,.-kernel" # else /* Perhaps the mystery compiler can handle the intrinsics. */ # define NO_ASM 1 # endif # ifndef NO_ASM __asm__( ASM_PREFIX " pushl %ebp\n\t" " movl %esp, %ebp\n\t" " pushl %edi\n\t" " pushl %esi\n\t" " movl 12(%ebp), %eax\n\t" " .byte 0xc5,0xf9,0x28,0x00 # vmovapd(%eax), %xmm0\n\t" " .byte 0xc5,0xf9,0x7e,0xc1 # vmovd %xmm0, %ecx\n\t" " xorl 8(%ebp), %ecx\n\t" " .byte 0xc4,0xe3,0x79,0x22,0xc9,0x00 # vpinsrd$0, %ecx, %xmm0, %xmm1\n\t" " .byte 0xc4,0xe3,0x79,0x16,0xc1,0x01 # vpextrd$1, %xmm0, %ecx\n\t" " .byte 0xc4,0xe3,0x71,0x22,0xc9,0x01 # vpinsrd$1, %ecx, %xmm1, %xmm1\n\t" " movl 20(%ebp), %edi\n\t" " .byte 0xc5,0xf9,0x6f,0x07 # vmovdqa(%edi), %xmm0\n\t" " .byte 0xc5,0xf9,0x6f,0x57,0x10 # vmovdqa16(%edi), %xmm2\n\t" " movl $1, %edx\n\t" " movl 16(%ebp), %ecx\n\t" " cmpl $4, %ecx\n\t" " jl 1f\n\t" " .byte 0xc5,0xf9,0x6f,0x58,0x30 # vmovdqa48(%eax), %xmm3\n\t" " .byte 0xc5,0xf9,0x6f,0x68,0x10 # vmovdqa16(%eax), %xmm5\n\t" " .byte 0xc5,0xf9,0x6f,0x60,0x20 # vmovdqa32(%eax), %xmm4\n\t" " leal -3(%ecx), %esi\n\t" " movl $4, %edx\n\t" " cmpl $5, %esi\n\t" " jl 2f\n\t" " .byte 0xc5,0xf9,0x6f,0x77,0x20 # vmovdqa32(%edi), %xmm6\n\t" " leal 112(%eax), %edi\n\t" " movl $4, %edx\n\t" " .align 4, 0x90\n" "3: .byte 0xc4,0xe3,0x49,0x44,0xfb,0x11 # vpclmulqdq$17, %xmm3, %xmm6, %xmm7\n\t" " .byte 0xc4,0xe3,0x49,0x44,0xdb,0x00 # vpclmulqdq$0, %xmm3, %xmm6, %xmm3\n\t" " .byte 0xc5,0xe1,0xef,0x1f # vpxor (%edi), %xmm3, %xmm3\n\t" " .byte 0xc5,0xc1,0xef,0xdb # vpxor %xmm3, %xmm7, %xmm3\n\t" " .byte 0xc4,0xe3,0x49,0x44,0xfc,0x11 # vpclmulqdq$17, %xmm4, %xmm6, %xmm7\n\t" " .byte 0xc4,0xe3,0x49,0x44,0xe4,0x00 # vpclmulqdq$0, %xmm4, %xmm6, %xmm4\n\t" " .byte 0xc5,0xd9,0xef,0x67,0xf0 # vpxor -16(%edi), %xmm4, %xmm4\n\t" " .byte 0xc5,0xc1,0xef,0xe4 # vpxor %xmm4, %xmm7, %xmm4\n\t" " .byte 0xc4,0xe3,0x49,0x44,0xfd,0x11 # vpclmulqdq$17, %xmm5, %xmm6, %xmm7\n\t" " .byte 0xc4,0xe3,0x49,0x44,0xed,0x00 # vpclmulqdq$0, %xmm5, %xmm6, %xmm5\n\t" " 
.byte 0xc5,0xd1,0xef,0x6f,0xe0 # vpxor -32(%edi), %xmm5, %xmm5\n\t" " .byte 0xc5,0xc1,0xef,0xed # vpxor %xmm5, %xmm7, %xmm5\n\t" " .byte 0xc4,0xe3,0x49,0x44,0xf9,0x11 # vpclmulqdq$17, %xmm1, %xmm6, %xmm7\n\t" " .byte 0xc4,0xe3,0x49,0x44,0xc9,0x00 # vpclmulqdq$0, %xmm1, %xmm6, %xmm1\n\t" " .byte 0xc5,0xf1,0xef,0x4f,0xd0 # vpxor -48(%edi), %xmm1, %xmm1\n\t" " .byte 0xc5,0xc1,0xef,0xc9 # vpxor %xmm1, %xmm7, %xmm1\n\t" " addl $64, %edi\n\t" " addl $4, %edx\n\t" " cmpl %esi, %edx\n\t" " jl 3b\n" "2: .byte 0xc4,0xe3,0x69,0x44,0xf1,0x11 # vpclmulqdq$17, %xmm1, %xmm2, %xmm6\n\t" " .byte 0xc4,0xe3,0x69,0x44,0xc9,0x00 # vpclmulqdq$0, %xmm1, %xmm2, %xmm1\n\t" " .byte 0xc5,0xd1,0xef,0xc9 # vpxor %xmm1, %xmm5, %xmm1\n\t" " .byte 0xc5,0xf1,0xef,0xee # vpxor %xmm6, %xmm1, %xmm5\n\t" " .byte 0xc4,0xe3,0x69,0x44,0xcd,0x11 # vpclmulqdq$17, %xmm5, %xmm2, %xmm1\n\t" " .byte 0xc4,0xe3,0x69,0x44,0xed,0x00 # vpclmulqdq$0, %xmm5, %xmm2, %xmm5\n\t" " .byte 0xc5,0xd9,0xef,0xe5 # vpxor %xmm5, %xmm4, %xmm4\n\t" " .byte 0xc5,0xd9,0xef,0xe1 # vpxor %xmm1, %xmm4, %xmm4\n\t" " .byte 0xc4,0xe3,0x69,0x44,0xcc,0x11 # vpclmulqdq$17, %xmm4, %xmm2, %xmm1\n\t" " .byte 0xc4,0xe3,0x69,0x44,0xe4,0x00 # vpclmulqdq$0, %xmm4, %xmm2, %xmm4\n\t" " .byte 0xc5,0xe1,0xef,0xdc # vpxor %xmm4, %xmm3, %xmm3\n\t" " .byte 0xc5,0xe1,0xef,0xc9 # vpxor %xmm1, %xmm3, %xmm1\n" "1: cmpl %ecx, %edx\n\t" " jge 4f\n\t" " subl %edx, %ecx\n\t" " shll $4, %edx\n\t" " addl %edx, %eax\n\t" " .align 4, 0x90\n" "5: .byte 0xc4,0xe3,0x69,0x44,0xd9,0x11 # vpclmulqdq$17, %xmm1, %xmm2, %xmm3\n\t" " .byte 0xc4,0xe3,0x69,0x44,0xc9,0x00 # vpclmulqdq$0, %xmm1, %xmm2, %xmm1\n\t" " .byte 0xc5,0xf1,0xef,0x08 # vpxor (%eax), %xmm1, %xmm1\n\t" " .byte 0xc5,0xf1,0xef,0xcb # vpxor %xmm3, %xmm1, %xmm1\n\t" " addl $16, %eax\n\t" " decl %ecx\n\t" " jne 5b\n" "4: .byte 0xc4,0xe3,0x79,0x44,0xd1,0x01 # vpclmulqdq$1, %xmm1, %xmm0, %xmm2\n\t" " .byte 0xc5,0xf9,0xdb,0xda # vpand %xmm2, %xmm0, %xmm3\n\t" " .byte 0xc4,0xe3,0x79,0x44,0xc3,0x01 # vpclmulqdq$1, %xmm3, %xmm0, %xmm0\n\t" " .byte 0xc5,0xf9,0x7e,0xc0 # vmovd %xmm0, %eax\n\t" " .byte 0xc4,0xe3,0x79,0x16,0xc9,0x02 # vpextrd$2, %xmm1, %ecx\n\t" " xorl %eax, %ecx\n\t" " .byte 0xc4,0xe3,0x79,0x16,0xd0,0x01 # vpextrd$1, %xmm2, %eax\n\t" " xorl %ecx, %eax\n\t" " .byte 0xc4,0xe3,0x79,0x16,0xc2,0x01 # vpextrd$1, %xmm0, %edx\n\t" " .byte 0xc4,0xe3,0x79,0x16,0xc9,0x03 # vpextrd$3, %xmm1, %ecx\n\t" " xorl %edx, %ecx\n\t" " .byte 0xc4,0xe3,0x79,0x16,0xd2,0x02 # vpextrd$2, %xmm2, %edx\n\t" " xorl %ecx, %edx\n\t" " popl %esi\n\t" " popl %edi\n\t" " popl %ebp\n\t" " ret\n" ASM_SUFFIX ); # endif # else /* architecture type */ /* Not intel, not that the C intrinsics will compile anywhere else, * but it will be a slightly better error message. */ # define NO_ASM 1 # endif #endif /* NO_ASM */ #ifndef NO_ASM /* Declaration for use below. */ uint64_t kernel(uint32_t c, unsigned char * buf, int len_128bit, struct crc_by128_K * K); #else #pragma message("Compiling 'kernel' from C source with intrinsics") #include #include union u { __m128i v; struct { uint64_t lo; uint64_t hi; }; }; /** * Assume c is existing crc, * buf is 16-byte-aligned, * len is a multiple of 16 greater than zero. */ uint64_t kernel(uint32_t c, unsigned char * buf, int len_128bit, struct crc_by128_K * K) { __m128i * b = (__m128i *) buf; int i = 0; /* 128 bit constants and variables. */ __m128i K_544_480, K_160_96, K_M_64, x0, x1, x2, x3, x0a, x1a, x2a, x3a, x0b, x1b, x2b, x3b; /* Use these to move data between xmm registers and "normal" registers. 
     */
    union u ut0, ut1, ut2, ut3;

    K_544_480 = * (__m128i *) & (K -> xtt544);
    K_160_96  = * (__m128i *) & (K -> xtt160);
    K_M_64    = * (__m128i *) & (K -> mask);

    /* Incorporate the existing CRC into the first item. */
    ut0.v = b[0];
    ut0.lo ^= c;
    x0 = ut0.v;

    if (len_128bit >= 4) {
        /* Written as a slightly pipelined loop. */

        x1 = b[1];
        x2 = b[2];
        x3 = b[3];

        /* Iterate once if len_128bit is between 8 and 11
         *  4 < 8-3 < 11 - 3
         *  8 !< 11 - 3 < 12 - 3.
         *
         *  0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12
         */
        for (i = 4; i < len_128bit - 3; i += 4) {
            /* Each iteration of this loop folds the 512 bits of polynomial
             * in x0-x3 with the data in b[i]..b[i+3].
             */
            x0a = b[i];
            x1a = b[i+1];
            x2a = b[i+2];
            x3a = b[i+3];

            x0b = _mm_clmulepi64_si128(K_544_480, x0, 0x00);
            x0  = _mm_clmulepi64_si128(K_544_480, x0, 0x11);
            x1b = _mm_clmulepi64_si128(K_544_480, x1, 0x00);
            x1  = _mm_clmulepi64_si128(K_544_480, x1, 0x11);
            x2b = _mm_clmulepi64_si128(K_544_480, x2, 0x00);
            x2  = _mm_clmulepi64_si128(K_544_480, x2, 0x11);
            x3b = _mm_clmulepi64_si128(K_544_480, x3, 0x00);
            x3  = _mm_clmulepi64_si128(K_544_480, x3, 0x11);

            // x0 ^= x0a ^ x0b;
            x0 = _mm_xor_si128(x0, x0a);
            x0 = _mm_xor_si128(x0, x0b);
            // x1 ^= x1a ^ x1b;
            x1 = _mm_xor_si128(x1, x1a);
            x1 = _mm_xor_si128(x1, x1b);
            // x2 ^= x2a ^ x2b;
            x2 = _mm_xor_si128(x2, x2a);
            x2 = _mm_xor_si128(x2, x2b);
            // x3 ^= x3a ^ x3b;
            x3 = _mm_xor_si128(x3, x3a);
            x3 = _mm_xor_si128(x3, x3b);
        }
        /* x0 - x3 contain 4 x 128 bits of accumulated result.
         * 0-3 128-bit chunks potentially remain in the [i, len_128bit) entries.
         * Assume trailing bytes beyond that are handled by our caller.
         */
        x0a = _mm_clmulepi64_si128(K_160_96, x0, 0x00);
        x0b = _mm_clmulepi64_si128(K_160_96, x0, 0x11);
        x1 = _mm_xor_si128(x1, x0a);
        x1 = _mm_xor_si128(x1, x0b);
        x0a = _mm_clmulepi64_si128(K_160_96, x1, 0x00);
        x0b = _mm_clmulepi64_si128(K_160_96, x1, 0x11);
        x2 = _mm_xor_si128(x2, x0a);
        x2 = _mm_xor_si128(x2, x0b);
        x0a = _mm_clmulepi64_si128(K_160_96, x2, 0x00);
        x0b = _mm_clmulepi64_si128(K_160_96, x2, 0x11);
        x3 = _mm_xor_si128(x3, x0a);
        x3 = _mm_xor_si128(x3, x0b);
    } else {
        /* Loaded 128 bits already into x0. */
        x3 = x0;
        i = 1;
    }
    /* x3 now holds the 128-bit result.
     * Fold the 0-3 remaining 128-bit chunks into x3.
     */
    for (; i < len_128bit; i++) {
        x0 = b[i]; // data to fold
        // fold x3 down by 128 to align with the data.
        x0a = _mm_clmulepi64_si128(K_160_96, x3, 0x00);
        x0b = _mm_clmulepi64_si128(K_160_96, x3, 0x11);
        x3 = _mm_xor_si128(x0, x0a);
        x3 = _mm_xor_si128(x3, x0b);
        // x3 is now aligned with the data we just loaded.
    }
    /*
     * No more 128-bit chunks remain.
     * Fold x3 down into 32 bits.
     */
    {
        uint64_t w;
        uint64_t y;
        ut0.v = x3;
        y = ut0.hi; // 64 low-order terms of the polynomial into y.

        /* polynomial term order:
         *   high -> low
         * bit number order
         *   0 -> 127
         *
         * input, from which y was just extracted.
         * w0 w1 y0 y1
         * w0:w1 * x64 yields 96 bits.
         * p0:p1:p2:__ (aligned wrong, store to extract p1 and p2)
         * p0:p1:__:__ & ff:00:__:__ (mask to get rid of p1)
         * p0:00:__:__
         * p0:00 * x64 (times x64 yields 64 bits)
         * r0:r1 store and xor.
         */
        x0 = _mm_clmulepi64_si128(K_M_64, x3, 0x01);
        ut1.v = x0;
        w = (ut1.lo >> 32) + (ut1.hi << 32); // extract the low-poly 64 bits.
        x0 = _mm_and_si128(K_M_64, x0); // mask away what we just extracted.
        x0 = _mm_clmulepi64_si128(K_M_64, x0, 0x01);
        w ^= y;
        ut2.v = x0;
        w ^= ut2.lo;
        return w;
    }
}
#endif /* NO_ASM */
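
/*
 * Illustrative sketch only (never compiled, not part of the original
 * sources): a bit-at-a-time model of the 64 x 64 -> 128-bit carry-less
 * multiply that one lane of PCLMULQDQ / _mm_clmulepi64_si128 performs,
 * which is the primitive every fold above is built from.  The names
 * clmul128_ref and clmul64_ref are hypothetical and used nowhere else.
 */
#if 0
typedef struct { uint64_t lo; uint64_t hi; } clmul128_ref;

static clmul128_ref clmul64_ref(uint64_t a, uint64_t b) {
    clmul128_ref r = { 0, 0 };
    int i;
    for (i = 0; i < 64; i++) {
        if ((b >> i) & 1) {
            /* Carry-less multiply is the XOR of shifted copies of a. */
            r.lo ^= a << i;
            if (i > 0)
                r.hi ^= a >> (64 - i);
        }
    }
    return r;
}
#endif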

uint32_t fastcrc32(jint crc, Bytef * buf, jint len) {
    const unsigned long FAR * timesXtoThe32 = crc_table;
    intptr_t ibuf = (intptr_t) buf;
    int log_align = 4;
    int align = 1 << log_align;
    int mask = align - 1;
    int islop = (align - ibuf) & mask;
    uint32_t c = ~crc;
    int i = 0;

    if (len - islop >= align) {
        jint len_128bit;

        /* Handle bytes preceding 16-byte alignment. */
        for (i = 0; i < islop; i++) {
            uint32_t x0 = buf[i];
            x0 = timesXtoThe32[(x0 ^ c) & 0xFF];
            c = x0 ^ (c >> 8);
        }
        buf += i;
        len -= i;

        len_128bit = len >> log_align;
        if (len_128bit > 0) {
            uint64_t w = kernel(c, buf, len_128bit, K_struct);
            /*
             * 8 8-bit folds to compute 32-bit CRC.
             */
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            c = (uint32_t) w;
            i = len_128bit << log_align;
        } else {
            i = 0;
        }
    }
    /* Handle short CRC and tail of long CRC */
    for (; i < len; i++) {
        uint32_t x0 = buf[i];
        x0 = timesXtoThe32[(x0 ^ c) & 0xFF];
        c = x0 ^ (c >> 8);
    }
    return ~c;
}
#endif
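
/*
 * Sanity-check sketch for the folding constants defined near the top of
 * this file (illustrative only, never compiled, not part of the original
 * sources).  Assuming the standard CRC-32 generator polynomial
 * P(x) = 0x04C11DB7 (x**32 term implicit), x64/x96/x160/x480/x544 should
 * each be the bit-reflection of x**N mod P(x); the pre-shift by 1 is
 * applied where those constants are defined.  The helper names below
 * (xpow_mod_p, reflect32, check_fold_constants) are hypothetical.
 * fastcrc32 itself can be cross-checked against zlib's crc32 on
 * arbitrary buffers.
 */
#if 0
#include <assert.h>
#include <stdint.h>

/* x**n mod P(x) in conventional (non-reflected) bit order,
   with bit i holding the coefficient of x**i. */
static uint32_t xpow_mod_p(int n) {
    const uint32_t poly = 0x04C11DB7U;   /* x**32 term is implicit */
    uint32_t r = 1;                      /* the polynomial "1" */
    while (n-- > 0) {
        uint32_t carry = r & 0x80000000U;
        r <<= 1;
        if (carry)
            r ^= poly;                   /* reduce when degree reaches 32 */
    }
    return r;
}

static uint32_t reflect32(uint32_t v) {
    uint32_t r = 0;
    int i;
    for (i = 0; i < 32; i++)
        r |= ((v >> i) & 1U) << (31 - i);
    return r;
}

static void check_fold_constants(void) {
    assert(reflect32(xpow_mod_p(64))  == 0xb1e6b092U);
    assert(reflect32(xpow_mod_p(96))  == 0x6655004fU);
    assert(reflect32(xpow_mod_p(160)) == 0xba8ccbe8U);
    assert(reflect32(xpow_mod_p(480)) == 0xe3720acbU);
    assert(reflect32(xpow_mod_p(544)) == 0xaa2215eaU);
}
#endif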