/*
 * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

/*
 * Native method support for java.util.zip.CRC32
 */

#include "jni.h"
#include "jni_util.h"
#include <zlib.h>

#include "java_util_zip_CRC32.h"

/* define CAN_COMPILE_CLMUL 0 to disable fastcrc32 completely. */

#ifndef CAN_COMPILE_CLMUL
   /* Windows not supported -- different assembly language syntax,
      and though the newer compilers support the intrinsics, the code
      is not very good. */
#  ifndef _WIN32
#    ifdef __x86_64
#      define CAN_COMPILE_CLMUL 1
#    elif defined(__i386)
#      define CAN_COMPILE_CLMUL 1
#    endif
#  endif /* _WIN32 */
#endif
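
/* For example, a build could pass -DCAN_COMPILE_CLMUL=0 on the compiler
   command line (the exact mechanism depends on the build system) to force
   the plain zlib crc32 path even on x86. */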

#if CAN_COMPILE_CLMUL
#include <stdint.h>
#include <stdlib.h>

struct crc_by128_K {
    /* The fields in this structure are arranged so that if it is
     * allocated at a 16-byte alignment they can be picked up two at
     * a time with 128-bit loads.
     *
     * Because of the flipped bit order for this CRC polynomial,
     * the constant for X**N is left-shifted by 1.  This is because
     * a 64 x 64 polynomial multiply produces a 127-bit result
     * but the highest term is always aligned to bit 0 in the container.
     * Pre-shifting by one fixes this, at the cost of potentially making
     * the 32-bit constant no longer fit in a 32-bit container (thus the
     * use of uint64_t, though this is also the size used by the carry-
     * less multiply instruction).
     *
     * In addition, the flipped bit order and highest-term-at-lowest-bit
     * multiply change the constants used.  The 96-bit result will be
     * aligned to the high-term end of the target 128-bit container,
     * not the low-term end; that is, instead of a 512-bit or 576-bit fold,
     * it is a 480 (= 512 - 32) or 544 (= 512 + 64 - 32) bit fold.
     *
     * This causes additional problems in the 128-to-64-bit reduction; see the
     * code for details.  By storing a mask in the otherwise unused half of
     * a 128-bit constant, bits can be cleared before multiplication without
     * storing and reloading.  Note that staying on a 128-bit datapath means
     * that some data is uselessly stored and some unused data is intersected
     * with an irrelevant constant.
     */

    uint64_t mask; /* low of K_M_64 */
    uint64_t xtt64; /* high of K_M_64 */
    uint64_t xtt160; /* low of K_160_96 */
    uint64_t xtt96; /* high of K_160_96 */
    uint64_t xtt544; /* low of K_544_480 */
    uint64_t xtt480; /* high of K_544_480 */
};

struct crc_by128_K * K_struct = 0;

static const uint64_t x64 = (uint64_t) 0xb1e6b092U << 1;
static const uint64_t x96 = (uint64_t) 0x6655004fU << 1;
static const uint64_t x160 = (uint64_t) 0xba8ccbe8U << 1;
static const uint64_t x480 = (uint64_t) 0xe3720acbU << 1;
static const uint64_t x544 = (uint64_t) 0xaa2215eaU << 1;
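
/* A sketch of how these constants are used (following the comment above and
 * the kernel below): each value is x**N mod P(x) for the reflected CRC-32
 * polynomial P(x), pre-shifted left by 1.  The x160/x96 pair folds one
 * 128-bit chunk onto the next (low 64-bit half times x**160, high half
 * times x**96, i.e. 128+64-32 and 128-32 bit folds); the x544/x480 pair
 * does the same across four chunks (a 512-bit step); x64 together with the
 * mask drives the final 128-to-64-bit reduction.
 */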

static struct crc_by128_K * init_crc_by128_K() {
    void * y;
    y = malloc(16 + sizeof(struct crc_by128_K));
    // posix_memalign not available on all platforms
    if (y == NULL) {
        return (struct crc_by128_K *) NULL;
    }
    uint8_t * z = (uint8_t *) y;
    long p = (long) z;
    z += (16 - p) & 15;
    struct crc_by128_K * x = (struct crc_by128_K *) z;
    x -> mask = 0xffffffffUL;
    x -> xtt64 = x64;
    x -> xtt160 = x160;
    x -> xtt96 = x96;
    x -> xtt544 = x544;
    x -> xtt480 = x480;
    return x;
}

uint32_t fastcrc32(jint crc, Bytef * buf, jint len);

/* Flag governing use of "CLMUL" instruction.
   For now, implies little-endian.
   Computed dynamically, incorporates information about
   the current hardware and the compiler used to compile
   this file. */
static int useClmul = 0;
#else /* Cannot compile CLMUL */
/* Stub out fastcrc32 */
# define fastcrc32 crc32
# define useClmul 0
#endif
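
/* With the stubs above, expressions of the form
 *     useClmul ? fastcrc32(crc, buf, len) : crc32(crc, buf, len)
 * in the functions below reduce to a plain crc32() call when CLMUL
 * support cannot be compiled in.
 */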

/* Local copy of CRC32 table is used to fill and drain CLMUL CRC.
   Rows beyond the first 256-entry row are ignored. */
static const unsigned long FAR * crc_table;

/* Initialize the Java-side table (for small CRCs) to avoid extra startup
   work, and capture the platform-dependent useClmul flag.
*/
JNIEXPORT jboolean JNICALL
Java_java_util_zip_CRC32_init(JNIEnv *env, jclass cls, jarray b, jboolean use_clmul)
{
    /* Get the CRC table from zlib to initialize the JNI side.  Our private
       copy is missing if not compiled for fastcrc32. */
    jint *buf = (*env)->GetPrimitiveArrayCritical(env, b, 0);
    crc_table = get_crc_table();
    if (buf) {
        /* Don't know for sure how big an unsigned long is, therefore
           copy one at a time. */
        int i;
        for (i = 0; i < 256; i++) buf[i] = (jint) (crc_table[i]);
        (*env)->ReleasePrimitiveArrayCritical(env, b, buf, 0);
    }
#if CAN_COMPILE_CLMUL
    if (use_clmul) {
        K_struct = init_crc_by128_K();
        useClmul = K_struct != 0;
        /* Rather than throw OOME, just do without fast CRC. */
    }
#endif
    return useClmul;
}

JNIEXPORT jint JNICALL
Java_java_util_zip_CRC32_update(JNIEnv *env, jclass cls, jint crc, jint b)
{
    Bytef buf[1];

    buf[0] = (Bytef)b;
    return crc32(crc, buf, 1); // single byte not done quickly by fastcrc32
}

JNIEXPORT jint JNICALL
Java_java_util_zip_CRC32_updateBytes(JNIEnv *env, jclass cls, jint crc,
                                     jarray b, jint off, jint len)
{
    Bytef *buf = (*env)->GetPrimitiveArrayCritical(env, b, 0);
    if (buf) {
        crc = (jint) (useClmul ? fastcrc32(crc, buf + off, len) :
                                     crc32(crc, buf + off, len));
        (*env)->ReleasePrimitiveArrayCritical(env, b, buf, 0);
    }
    return crc;
}

JNIEXPORT jint ZIP_CRC32(jint crc, const jbyte *buf, jint len)
{
    return (jint) (useClmul ? fastcrc32(crc, (Bytef*)buf, len) :
                                  crc32(crc, (Bytef*)buf, len));
}

JNIEXPORT jint JNICALL
Java_java_util_zip_CRC32_updateByteBuffer(JNIEnv *env, jclass cls, jint crc,
                                          jlong address, jint off, jint len)
{
    Bytef *buf = (Bytef *)jlong_to_ptr(address);
    if (buf) {
        crc = (jint) (useClmul ? fastcrc32(crc, buf + off, len) :
                                     crc32(crc, buf + off, len));
    }
    return crc;
}

#if CAN_COMPILE_CLMUL
#ifndef NO_ASM

/* set up the platform-specific glop surrounding the function body. */
#  ifdef __x86_64
#    ifdef __APPLE__
#      define ASM_PREFIX ".text\n\t.align 8\n\t.globl _kernel\n_kernel:\n\t"
#      define ASM_SUFFIX ""
#    elif defined(__GNUC__)
#      define ASM_PREFIX ".text\n\t.align 16\n\t.globl kernel\n\t.type kernel,@function\nkernel:\n\t"
#      define ASM_SUFFIX ""
#    elif defined(__SUNPRO_C)
#      define ASM_PREFIX ".section .text,\"ax\"\n\t.align 16, 0x90\n\t.globl kernel\n\t.type kernel,@function\nkernel:\n\t"
#      define ASM_SUFFIX ".size kernel,.-kernel"
#    else
       /* Perhaps the mystery compiler can handle the intrinsics. */
#      define NO_ASM 1
#    endif

#    ifndef NO_ASM
__asm__(
        ASM_PREFIX
        " pushq  %rbp\n\t"
        " movq   %rsp, %rbp\n\t"
        " movl   %edi, %eax\n\t"
        " .byte  0xc5,0xf9,0x6f,0x06  # vmovdqa(%rsi), %xmm0\n\t"
        " .byte  0xc4,0xe1,0xf9,0x7e,0xc7  # vmovd  %xmm0, %rdi\n\t"
        " xorq   %rax, %rdi\n\t"
        " .byte  0xc4,0xe3,0xf9,0x22,0xd7,0x00  # vpinsrq$0, %rdi, %xmm0, %xmm2\n\t"
        " .byte  0xc5,0x79,0x6f,0x01  # vmovdqa(%rcx), %xmm8\n\t"
        " .byte  0xc5,0x79,0x6f,0x49,0x10  # vmovdqa16(%rcx), %xmm9\n\t"
        " movl   $1, %eax\n\t"
        " cmpl   $4, %edx\n\t"
        " jl     1f\n\t"
        " .byte  0xc5,0xf9,0x6f,0x6e,0x10  # vmovdqa16(%rsi), %xmm5\n\t"
        " .byte  0xc5,0xf9,0x6f,0x66,0x20  # vmovdqa32(%rsi), %xmm4\n\t"
        " .byte  0xc5,0xf9,0x6f,0x5e,0x30  # vmovdqa48(%rsi), %xmm3\n\t"
        " leal   -3(%rdx), %edi\n\t"
        " movl   $4, %eax\n\t"
        " cmpl   $5, %edi\n\t"
        " jl     2f\n\t"
        " .byte  0xc5,0xf9,0x6f,0x71,0x20  # vmovdqa32(%rcx), %xmm6\n\t"
        " leaq   112(%rsi), %rcx\n\t"
        " movl   $4, %eax\n\t"
        " .align  4, 0x90\n"
        "3: .byte  0xc4,0xe3,0x49,0x44,0xc2,0x00  # vpclmulqdq$0, %xmm2, %xmm6, %xmm0\n\t"
        " .byte  0xc4,0xe3,0x49,0x44,0xcb,0x11  # vpclmulqdq$17, %xmm3, %xmm6, %xmm1\n\t"
        " .byte  0xc4,0xe3,0x49,0x44,0xdb,0x00  # vpclmulqdq$0, %xmm3, %xmm6, %xmm3\n\t"
        " .byte  0xc5,0xe1,0xef,0x19  # vpxor  (%rcx), %xmm3, %xmm3\n\t"
        " .byte  0xc4,0xe3,0x49,0x44,0xfd,0x00  # vpclmulqdq$0, %xmm5, %xmm6, %xmm7\n\t"
        " .byte  0xc5,0xc1,0xef,0x79,0xe0  # vpxor  -32(%rcx), %xmm7, %xmm7\n\t"
        " .byte  0xc5,0xf1,0xef,0xdb  # vpxor  %xmm3, %xmm1, %xmm3\n\t"
        " .byte  0xc4,0xe3,0x49,0x44,0xd2,0x11  # vpclmulqdq$17, %xmm2, %xmm6, %xmm2\n\t"
        " .byte  0xc5,0xf9,0xef,0x41,0xd0  # vpxor  -48(%rcx), %xmm0, %xmm0\n\t"
        " .byte  0xc4,0xe3,0x49,0x44,0xcd,0x11  # vpclmulqdq$17, %xmm5, %xmm6, %xmm1\n\t"
        " .byte  0xc4,0xe3,0x49,0x44,0xec,0x11  # vpclmulqdq$17, %xmm4, %xmm6, %xmm5\n\t"
        " .byte  0xc4,0xe3,0x49,0x44,0xe4,0x00  # vpclmulqdq$0, %xmm4, %xmm6, %xmm4\n\t"
        " .byte  0xc5,0xd9,0xef,0x61,0xf0  # vpxor  -16(%rcx), %xmm4, %xmm4\n\t"
        " .byte  0xc5,0xd1,0xef,0xe4  # vpxor  %xmm4, %xmm5, %xmm4\n\t"
        " .byte  0xc5,0xf1,0xef,0xef  # vpxor  %xmm7, %xmm1, %xmm5\n\t"
        " .byte  0xc5,0xe9,0xef,0xd0  # vpxor  %xmm0, %xmm2, %xmm2\n\t"
        " addq   $64, %rcx\n\t"
        " addl   $4, %eax\n\t"
        " cmpl   %edi, %eax\n\t"
        " jl     3b\n"
        "2: .byte  0xc4,0xe3,0x31,0x44,0xc2,0x11  # vpclmulqdq$17, %xmm2, %xmm9, %xmm0\n\t"
        " .byte  0xc4,0xe3,0x31,0x44,0xca,0x00  # vpclmulqdq$0, %xmm2, %xmm9, %xmm1\n\t"
        " .byte  0xc5,0xd1,0xef,0xc9  # vpxor  %xmm1, %xmm5, %xmm1\n\t"
        " .byte  0xc5,0xf1,0xef,0xc8  # vpxor  %xmm0, %xmm1, %xmm1\n\t"
        " .byte  0xc4,0xe3,0x31,0x44,0xc1,0x11  # vpclmulqdq$17, %xmm1, %xmm9, %xmm0\n\t"
        " .byte  0xc4,0xe3,0x31,0x44,0xc9,0x00  # vpclmulqdq$0, %xmm1, %xmm9, %xmm1\n\t"
        " .byte  0xc5,0xd9,0xef,0xc9  # vpxor  %xmm1, %xmm4, %xmm1\n\t"
        " .byte  0xc5,0xf1,0xef,0xc8  # vpxor  %xmm0, %xmm1, %xmm1\n\t"
        " .byte  0xc4,0xe3,0x31,0x44,0xc1,0x11  # vpclmulqdq$17, %xmm1, %xmm9, %xmm0\n\t"
        " .byte  0xc4,0xe3,0x31,0x44,0xc9,0x00  # vpclmulqdq$0, %xmm1, %xmm9, %xmm1\n\t"
        " .byte  0xc5,0xe1,0xef,0xc9  # vpxor  %xmm1, %xmm3, %xmm1\n\t"
        " .byte  0xc5,0xf1,0xef,0xd0  # vpxor  %xmm0, %xmm1, %xmm2\n"
        "1: cmpl   %edx, %eax\n\t"
        " jge    4f\n\t"
        " subl   %eax, %edx\n\t"
        " movslq %eax, %rax\n\t"
        " shlq   $4, %rax\n\t"
        " addq   %rax, %rsi\n\t"
        " .align  4, 0x90\n"
        "5: .byte  0xc4,0xe3,0x31,0x44,0xc2,0x11  # vpclmulqdq$17, %xmm2, %xmm9, %xmm0\n\t"
        " .byte  0xc4,0xe3,0x31,0x44,0xca,0x00  # vpclmulqdq$0, %xmm2, %xmm9, %xmm1\n\t"
        " .byte  0xc5,0xf1,0xef,0x0e  # vpxor  (%rsi), %xmm1, %xmm1\n\t"
        " .byte  0xc5,0xf1,0xef,0xd0  # vpxor  %xmm0, %xmm1, %xmm2\n\t"
        " addq   $16, %rsi\n\t"
        " decl   %edx\n\t"
        " jne    5b\n"
        "4: .byte  0xc4,0xe3,0x39,0x44,0xc2,0x01  # vpclmulqdq$1, %xmm2, %xmm8, %xmm0\n\t"
        " .byte  0xc4,0xe1,0xf9,0x7e,0xc0  # vmovd  %xmm0, %rax\n\t"
        " .byte  0xc4,0xe3,0xf9,0x16,0xc1,0x01  # vpextrq$1, %xmm0, %rcx\n\t"
        " shldq  $32, %rax, %rcx\n\t"
        " .byte  0xc5,0xb9,0xdb,0xc0  # vpand  %xmm0, %xmm8, %xmm0\n\t"
        " .byte  0xc4,0xe3,0x39,0x44,0xc0,0x01  # vpclmulqdq$1, %xmm0, %xmm8, %xmm0\n\t"
        " .byte  0xc4,0xe1,0xf9,0x7e,0xc2  # vmovd  %xmm0, %rdx\n\t"
        " .byte  0xc4,0xe3,0xf9,0x16,0xd0,0x01  # vpextrq$1, %xmm2, %rax\n\t"
        " xorq   %rdx, %rax\n\t"
        " xorq   %rcx, %rax\n\t"
        " popq   %rbp\n\t"
        " ret\n"
        ASM_SUFFIX
        );
#    endif
#  elif defined(__i386)

/* set up the platform-specific glop surrounding the function body. */
#    ifdef __APPLE__
#      define ASM_PREFIX ".text\n\t.align 16\n\t.globl _kernel\n_kernel:\n\t"
#      define ASM_SUFFIX ""
#    elif defined(__GNUC__)
#      define ASM_PREFIX ".text\n\t.align 16\n\t.globl kernel\n\t.type kernel,@function\nkernel:\n\t"
#      define ASM_SUFFIX ""
#    elif defined(__SUNPRO_C)
#      define ASM_PREFIX ".section .text,\"ax\"\n\t.align 16, 0x90\n\t.globl kernel\n\t.type kernel,@function\nkernel:\n\t"
#      define ASM_SUFFIX ".size kernel,.-kernel"
#    else
       /* Perhaps the mystery compiler can handle the intrinsics. */
#      define NO_ASM 1
#    endif

#    ifndef NO_ASM
__asm__(
        ASM_PREFIX
        " pushl  %ebp\n\t"
        " movl   %esp, %ebp\n\t"
        " pushl  %edi\n\t"
        " pushl  %esi\n\t"
        " movl   12(%ebp), %eax\n\t"
        " .byte  0xc5,0xf9,0x28,0x00  # vmovapd(%eax), %xmm0\n\t"
        " .byte  0xc5,0xf9,0x7e,0xc1  # vmovd  %xmm0, %ecx\n\t"
        " xorl   8(%ebp), %ecx\n\t"
        " .byte  0xc4,0xe3,0x79,0x22,0xc9,0x00  # vpinsrd$0, %ecx, %xmm0, %xmm1\n\t"
        " .byte  0xc4,0xe3,0x79,0x16,0xc1,0x01  # vpextrd$1, %xmm0, %ecx\n\t"
        " .byte  0xc4,0xe3,0x71,0x22,0xc9,0x01  # vpinsrd$1, %ecx, %xmm1, %xmm1\n\t"
        " movl   20(%ebp), %edi\n\t"
        " .byte  0xc5,0xf9,0x6f,0x07  # vmovdqa(%edi), %xmm0\n\t"
        " .byte  0xc5,0xf9,0x6f,0x57,0x10  # vmovdqa16(%edi), %xmm2\n\t"
        " movl   $1, %edx\n\t"
        " movl   16(%ebp), %ecx\n\t"
        " cmpl   $4, %ecx\n\t"
        " jl     1f\n\t"
        " .byte  0xc5,0xf9,0x6f,0x58,0x30  # vmovdqa48(%eax), %xmm3\n\t"
        " .byte  0xc5,0xf9,0x6f,0x68,0x10  # vmovdqa16(%eax), %xmm5\n\t"
        " .byte  0xc5,0xf9,0x6f,0x60,0x20  # vmovdqa32(%eax), %xmm4\n\t"
        " leal   -3(%ecx), %esi\n\t"
        " movl   $4, %edx\n\t"
        " cmpl   $5, %esi\n\t"
        " jl     2f\n\t"
        " .byte  0xc5,0xf9,0x6f,0x77,0x20  # vmovdqa32(%edi), %xmm6\n\t"
        " leal   112(%eax), %edi\n\t"
        " movl   $4, %edx\n\t"
        " .align  4, 0x90\n"
        "3: .byte  0xc4,0xe3,0x49,0x44,0xfb,0x11  # vpclmulqdq$17, %xmm3, %xmm6, %xmm7\n\t"
        " .byte  0xc4,0xe3,0x49,0x44,0xdb,0x00  # vpclmulqdq$0, %xmm3, %xmm6, %xmm3\n\t"
        " .byte  0xc5,0xe1,0xef,0x1f  # vpxor  (%edi), %xmm3, %xmm3\n\t"
        " .byte  0xc5,0xc1,0xef,0xdb  # vpxor  %xmm3, %xmm7, %xmm3\n\t"
        " .byte  0xc4,0xe3,0x49,0x44,0xfc,0x11  # vpclmulqdq$17, %xmm4, %xmm6, %xmm7\n\t"
        " .byte  0xc4,0xe3,0x49,0x44,0xe4,0x00  # vpclmulqdq$0, %xmm4, %xmm6, %xmm4\n\t"
        " .byte  0xc5,0xd9,0xef,0x67,0xf0  # vpxor  -16(%edi), %xmm4, %xmm4\n\t"
        " .byte  0xc5,0xc1,0xef,0xe4  # vpxor  %xmm4, %xmm7, %xmm4\n\t"
        " .byte  0xc4,0xe3,0x49,0x44,0xfd,0x11  # vpclmulqdq$17, %xmm5, %xmm6, %xmm7\n\t"
        " .byte  0xc4,0xe3,0x49,0x44,0xed,0x00  # vpclmulqdq$0, %xmm5, %xmm6, %xmm5\n\t"
        " .byte  0xc5,0xd1,0xef,0x6f,0xe0  # vpxor  -32(%edi), %xmm5, %xmm5\n\t"
        " .byte  0xc5,0xc1,0xef,0xed  # vpxor  %xmm5, %xmm7, %xmm5\n\t"
        " .byte  0xc4,0xe3,0x49,0x44,0xf9,0x11  # vpclmulqdq$17, %xmm1, %xmm6, %xmm7\n\t"
        " .byte  0xc4,0xe3,0x49,0x44,0xc9,0x00  # vpclmulqdq$0, %xmm1, %xmm6, %xmm1\n\t"
        " .byte  0xc5,0xf1,0xef,0x4f,0xd0  # vpxor  -48(%edi), %xmm1, %xmm1\n\t"
        " .byte  0xc5,0xc1,0xef,0xc9  # vpxor  %xmm1, %xmm7, %xmm1\n\t"
        " addl   $64, %edi\n\t"
        " addl   $4, %edx\n\t"
        " cmpl   %esi, %edx\n\t"
        " jl     3b\n"
        "2: .byte  0xc4,0xe3,0x69,0x44,0xf1,0x11  # vpclmulqdq$17, %xmm1, %xmm2, %xmm6\n\t"
        " .byte  0xc4,0xe3,0x69,0x44,0xc9,0x00  # vpclmulqdq$0, %xmm1, %xmm2, %xmm1\n\t"
        " .byte  0xc5,0xd1,0xef,0xc9  # vpxor  %xmm1, %xmm5, %xmm1\n\t"
        " .byte  0xc5,0xf1,0xef,0xee  # vpxor  %xmm6, %xmm1, %xmm5\n\t"
        " .byte  0xc4,0xe3,0x69,0x44,0xcd,0x11  # vpclmulqdq$17, %xmm5, %xmm2, %xmm1\n\t"
        " .byte  0xc4,0xe3,0x69,0x44,0xed,0x00  # vpclmulqdq$0, %xmm5, %xmm2, %xmm5\n\t"
        " .byte  0xc5,0xd9,0xef,0xe5  # vpxor  %xmm5, %xmm4, %xmm4\n\t"
        " .byte  0xc5,0xd9,0xef,0xe1  # vpxor  %xmm1, %xmm4, %xmm4\n\t"
        " .byte  0xc4,0xe3,0x69,0x44,0xcc,0x11  # vpclmulqdq$17, %xmm4, %xmm2, %xmm1\n\t"
        " .byte  0xc4,0xe3,0x69,0x44,0xe4,0x00  # vpclmulqdq$0, %xmm4, %xmm2, %xmm4\n\t"
        " .byte  0xc5,0xe1,0xef,0xdc  # vpxor  %xmm4, %xmm3, %xmm3\n\t"
        " .byte  0xc5,0xe1,0xef,0xc9  # vpxor  %xmm1, %xmm3, %xmm1\n"
        "1: cmpl   %ecx, %edx\n\t"
        " jge    4f\n\t"
        " subl   %edx, %ecx\n\t"
        " shll   $4, %edx\n\t"
        " addl   %edx, %eax\n\t"
        " .align  4, 0x90\n"
        "5: .byte  0xc4,0xe3,0x69,0x44,0xd9,0x11  # vpclmulqdq$17, %xmm1, %xmm2, %xmm3\n\t"
        " .byte  0xc4,0xe3,0x69,0x44,0xc9,0x00  # vpclmulqdq$0, %xmm1, %xmm2, %xmm1\n\t"
        " .byte  0xc5,0xf1,0xef,0x08  # vpxor  (%eax), %xmm1, %xmm1\n\t"
        " .byte  0xc5,0xf1,0xef,0xcb  # vpxor  %xmm3, %xmm1, %xmm1\n\t"
        " addl   $16, %eax\n\t"
        " decl   %ecx\n\t"
        " jne    5b\n"
        "4: .byte  0xc4,0xe3,0x79,0x44,0xd1,0x01  # vpclmulqdq$1, %xmm1, %xmm0, %xmm2\n\t"
        " .byte  0xc5,0xf9,0xdb,0xda  # vpand  %xmm2, %xmm0, %xmm3\n\t"
        " .byte  0xc4,0xe3,0x79,0x44,0xc3,0x01  # vpclmulqdq$1, %xmm3, %xmm0, %xmm0\n\t"
        " .byte  0xc5,0xf9,0x7e,0xc0  # vmovd  %xmm0, %eax\n\t"
        " .byte  0xc4,0xe3,0x79,0x16,0xc9,0x02  # vpextrd$2, %xmm1, %ecx\n\t"
        " xorl   %eax, %ecx\n\t"
        " .byte  0xc4,0xe3,0x79,0x16,0xd0,0x01  # vpextrd$1, %xmm2, %eax\n\t"
        " xorl   %ecx, %eax\n\t"
        " .byte  0xc4,0xe3,0x79,0x16,0xc2,0x01  # vpextrd$1, %xmm0, %edx\n\t"
        " .byte  0xc4,0xe3,0x79,0x16,0xc9,0x03  # vpextrd$3, %xmm1, %ecx\n\t"
        " xorl   %edx, %ecx\n\t"
        " .byte  0xc4,0xe3,0x79,0x16,0xd2,0x02  # vpextrd$2, %xmm2, %edx\n\t"
        " xorl   %ecx, %edx\n\t"
        " popl   %esi\n\t"
        " popl   %edi\n\t"
        " popl   %ebp\n\t"
        " ret\n"
        ASM_SUFFIX
        );
#    endif
#  else /* architecture type */
/* Not Intel; not that the C intrinsics will compile anywhere else,
 * but this gives a slightly better error message.
 */
#    define NO_ASM 1
#  endif
#endif /* NO_ASM */

#ifndef NO_ASM
/* Declaration for use below. */
uint64_t kernel(uint32_t c, unsigned char * buf, int len_128bit, struct crc_by128_K * K);
#else
#pragma message("Compiling 'kernel' from C source with intrinsics")
#include <wmmintrin.h>
#include <emmintrin.h>

union u {
    __m128i v;
    struct {
        uint64_t lo;
        uint64_t hi;
    };
};

/**
 * Assume c is the existing CRC,
 * buf is 16-byte-aligned,
 * and len_128bit is the number of 16-byte blocks to process, greater than zero.
 */
uint64_t kernel(uint32_t c, unsigned char * buf, int len_128bit,
        struct crc_by128_K * K) {

    __m128i * b = (__m128i *) buf;
    int i = 0;

    /* 128 bit constants and variables. */
    __m128i K_544_480, K_160_96, K_M_64,
    x0, x1, x2, x3,
    x0a, x1a, x2a, x3a,
    x0b, x1b, x2b, x3b;

    /* Use these to move data between xmm registers and "normal" registers. */
    union u ut0, ut1, ut2, ut3;

    K_544_480 = * (__m128i *) & (K -> xtt544);
    K_160_96 = * (__m128i *) & (K -> xtt160);
    K_M_64 = * (__m128i *) & (K -> mask);

    /* Incorporate existing CRC into first item */
    ut0.v = b[0];
    ut0.lo ^= c;
    x0 = ut0.v;

    if (len_128bit >= 4) {
        /* Written as a slightly pipelined loop. */

        x1 = b[1];
        x2 = b[2];
        x3 = b[3];

        /* Iterate once if len_128bit is between 8 and 11
         * 4 < 8-3 < 11 - 3
         * 8 !< 11 - 3 < 12 - 3.
         *
         * 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12
         *
         */
        for (i = 4; i < len_128bit - 3 ; i+= 4) {
            /* Each iteration of this loop folds the 512 bits of polynomial
             * in x0-x3 with the data in b[i]..b[i+3].
             */
            x0a = b[i];
            x1a = b[i+1];
            x2a = b[i+2];
            x3a = b[i+3];

            x0b = _mm_clmulepi64_si128(K_544_480, x0, 0x00);
            x0 = _mm_clmulepi64_si128(K_544_480, x0, 0x11);
            x1b = _mm_clmulepi64_si128(K_544_480, x1, 0x00);
            x1 = _mm_clmulepi64_si128(K_544_480, x1, 0x11);

            x2b = _mm_clmulepi64_si128(K_544_480, x2, 0x00);
            x2 = _mm_clmulepi64_si128(K_544_480, x2, 0x11);
            x3b = _mm_clmulepi64_si128(K_544_480, x3, 0x00);
            x3 = _mm_clmulepi64_si128(K_544_480, x3, 0x11);

            // x0 ^= x0a ^ x0b;
            x0 = _mm_xor_si128(x0, x0a);
            x0 = _mm_xor_si128(x0, x0b);
            // x1 ^= x1a ^ x1b;
            x1 = _mm_xor_si128(x1, x1a);
            x1 = _mm_xor_si128(x1, x1b);
            // x2 ^= x2a ^ x2b;
            x2 = _mm_xor_si128(x2, x2a);
            x2 = _mm_xor_si128(x2, x2b);
            // x3 ^= x3a ^ x3b;
            x3 = _mm_xor_si128(x3, x3a);
            x3 = _mm_xor_si128(x3, x3b);
        }
        /* x0 - x3 contains 4 x 128 bits of accumulated result.
         * 0-3 hexads potentially remain in [i,len_128bit) entries.
         * Assume trailing bytes beyond that are handled by our caller.
         */
        x0a = _mm_clmulepi64_si128(K_160_96, x0, 0x00);
        x0b = _mm_clmulepi64_si128(K_160_96, x0, 0x11);
        x1 = _mm_xor_si128(x1, x0a);
        x1 = _mm_xor_si128(x1, x0b);
        x0a = _mm_clmulepi64_si128(K_160_96, x1, 0x00);
        x0b = _mm_clmulepi64_si128(K_160_96, x1, 0x11);
        x2 = _mm_xor_si128(x2, x0a);
        x2 = _mm_xor_si128(x2, x0b);
        x0a = _mm_clmulepi64_si128(K_160_96, x2, 0x00);
        x0b = _mm_clmulepi64_si128(K_160_96, x2, 0x11);
        x3 = _mm_xor_si128(x3, x0a);
        x3 = _mm_xor_si128(x3, x0b);
    } else {
        /* Loaded 128 bits already into x0.
         */
        x3 = x0;
        i = 1;
    }

    /* x3 is now 128-bit result.
     * Fold 0-3 128-bit chunks into x3.
     */
    for (; i < len_128bit; i++) {
        x0 = b[i]; // data to fold
        // fold x3 down by 128 to align with data.
        x0a = _mm_clmulepi64_si128(K_160_96, x3, 0x00);
        x0b = _mm_clmulepi64_si128(K_160_96, x3, 0x11);
        x3 = _mm_xor_si128(x0, x0a);
        x3 = _mm_xor_si128(x3, x0b);
        // x3 is now aligned with data we just loaded.
    }

    /*
     * No more 128bits remain.
     * Fold x3 down into 32 bits.
     */
    {
        uint64_t w;
        uint64_t y;
        ut0.v = x3;
        y = ut0.hi; // 64 low-order terms of polynomial into y.

        /* polynomial term order:
         * high -> low
         * bit number order
         * 0 -> 127
         *
         * input, from which y was just extracted.
         * w0 w1 y0 y1
         * w0:w1 * x64 yields 96 bits.
         * p0:p1:p2:__ (aligned wrong, store to extract p1 and p2)
         * p0:p1:__:__ & ff:00:__:__ (mask to get rid of p1)
         * p0:00:__:__
         * p0:00 * x64 (times x64 yields 64 bits)
         * r0:r1 store and xor.
         */

        x0 = _mm_clmulepi64_si128(K_M_64, x3, 0x01);
        ut1.v = x0;
        w = (ut1.lo >> 32) + (ut1.hi << 32); // extract low-poly 64 bits.
        x0 = _mm_and_si128(K_M_64, x0); // mask away what we just extracted.
        x0 = _mm_clmulepi64_si128(K_M_64, x0, 0x01);
        w ^= y;
        ut2.v = x0;
        w ^= ut2.lo;

        return w;
    }
}
#endif /* NO_ASM */

uint32_t fastcrc32(jint crc, Bytef * buf, jint len) {
    const unsigned long FAR * timesXtoThe32 = crc_table;
    intptr_t ibuf = (intptr_t) buf;
    int log_align = 4;
    int align = 1 << log_align;
    int mask = align - 1;
    int islop = (align - ibuf) & mask;
    uint32_t c = ~crc;
    int i = 0;

    if (len - islop >= align) {
        jint len_128bit;
        /* Handle bytes preceding 16-byte alignment. */
        for (i = 0; i < islop; i++ ) {
            uint32_t x0 = buf[i];
            x0 = timesXtoThe32[(x0 ^ c) & 0xFF];
            c = x0 ^ (c >> 8);
        }
        buf += i;
        len -= i;

        len_128bit = len >> log_align;

        if (len_128bit > 0) {
            uint64_t w = kernel(c, buf, len_128bit, K_struct);
            /*
             * 8 8-bit folds to compute 32-bit CRC.
             */
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            c = (uint32_t) w;
            i = len_128bit << log_align;
        } else {
            i = 0;
        }
    }
    /* Handle short CRC and tail of long CRC */
    for (; i < len; i++) {
        uint32_t x0 = buf[i];
        x0 = timesXtoThe32[(x0 ^ c) & 0xFF];
        c = x0 ^ (c >> 8);
    }
    return ~c;
}
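
/* Usage sketch (illustrative; this mirrors how the JNI wrappers above call it):
 *
 *     uint32_t c = 0;
 *     c = fastcrc32((jint) c, (Bytef *) data, (jint) data_len);
 *
 * For the same input this yields the same value as zlib's crc32().
 */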
#endif /* CAN_COMPILE_CLMUL */