1 /*
   2  * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 /*
  27  * Native method support for java.util.zip.CRC32
  28  */
  29 
  30 #include "jni.h"
  31 #include "jni_util.h"
  32 #include <zlib.h>
  33 
  34 #include "java_util_zip_CRC32.h"
  35 
  36 /* define CAN_COMPILE_CLMUL 0 to disable fastcrc32 completely. */
  37 
  38 #ifndef CAN_COMPILE_CLMUL
  39 #  ifdef __x86_64
  40 #    define CAN_COMPILE_CLMUL 1
  41 #  elif defined(__i386)
  42 #    define CAN_COMPILE_CLMUL 1
  43 #  endif
  44 #endif
  45 
  46 #if CAN_COMPILE_CLMUL
  47 #include <stdint.h>
  48 #include <stdlib.h>
  49 
  50 struct crc_by128_K {
  51     /* The fields in this structure are arranged so that if it is
  52      * allocated at a 16-byte alignment they can be picked up two at
  53      * a time with 128-bit loads.
  54      *
  55      * Because of flipped bit order for this CRC polynomials
  56      * the constant for X**N is left-shifted by 1.  This is because
  57      * a 64 x 64 polynomial multiply produces a 127-bit result
  58      * but the highest term is always aligned to bit 0 in the container.
  59      * Pre-shifting by one fixes this, at the cost of potentially making
  60      * the 32-bit constant no longer fit in a 32-bit container (thus the
  61      * use of uint64_t, though this is also the size used by the carry-
  62      * less multiply instruction.
  63      *
  64      * In addition, the flipped bit order and highest-term-at-least-bit
  65      * multiply changes the constants used.  The 96-bit result will be
  66      * aligned to the high-term end of the target 128-bit container,
  67      * not the low-term end; that is, instead of a 512-bit or 576-bit fold,
  68      * instead it is a 480 (=512-32) or 544 (=512+64-32) bit fold.
  69      *
  70      * This cause additional problems in the 128-to-64-bit reduction; see the
  71      * code for details.  By storing a mask in the otherwise unused half of
  72      * a 128-bit constant, bits can be cleared before multiplication without
  73      * storing and reloading.  Note that staying on a 128-bit datapath means
  74      * that some data is uselessly stored and some unused data is intersected
  75      * with an irrelevant constant.
  76      */
  77 
  78     uint64_t mask; /* low of K_M_64 */
  79     uint64_t xtt64; /* high of K_M_64 */
  80     uint64_t xtt160; /* low of K_160_96 */
  81     uint64_t xtt96; /* high of K_160_96 */
  82     uint64_t xtt544; /* low of K_544_480 */
  83     uint64_t xtt480; /* high of K_544_480 */
  84 };
  85 
  86 struct crc_by128_K * K_struct = 0;
  87 
  88 static const uint64_t x64 = (uint64_t) 0xb1e6b092U << 1;
  89 static const uint64_t x96 = (uint64_t) 0x6655004fU << 1;
  90 static const uint64_t x160 = (uint64_t) 0xba8ccbe8U << 1;
  91 static const uint64_t x480 = (uint64_t) 0xe3720acbU << 1;
  92 static const uint64_t x544 = (uint64_t) 0xaa2215eaU << 1;
  93 
  94 static struct crc_by128_K * init_crc_by128_K() {
  95     void * y;
  96     int rc = posix_memalign( & y, 16, sizeof(struct crc_by128_K));
  97     if (rc) {
  98         return (struct crc_by128_K *) NULL;
  99     } else {
 100         struct crc_by128_K * x = y;
 101         x -> mask = 0xffffffffUL;
 102         x -> xtt64 = x64;
 103         x -> xtt160 = x160;
 104         x -> xtt96 = x96;
 105         x -> xtt544 = x544;
 106         x -> xtt480 = x480;
 107         return x;
 108     }
 109 }
 110 
 111 uint32_t fastcrc32(jint crc, Bytef * buf, jint len);
 112 
 113 /* Flag governing use of "CLMUL" instruction.
 114    For now, implies little-endian.
 115    Computed dynamically, incorporates information about
 116    the current hardware and the compiler used to compile
 117    this file. */
 118 static int useClmul = 0;
 119 #else
 120 /* Stub out fastcrc32 */
 121 # define fastcrc32 crc32
 122 # define useClmul 0
 123 #endif
 124 
 125 
 126 /* Local copy of CRC32 table is used to fill and drain CLMUL CRC.
 127    Extra members beyond the first 256-entry row are ignored. */
 128 static const unsigned long FAR * crc_table;
 129 
 130 /* Initialize java-side table (for small CRCs) to avoid extra startup work,
 131    and capture the platform-dependent useClmul flag.
 132 */
 133 JNIEXPORT jboolean JNICALL
 134 Java_java_util_zip_CRC32_init(JNIEnv *env, jclass cls, jarray b, jboolean use_clmul)
 135 {
 136   /* Get the CRC table from zip to initialize JNI.  Our private copy
 137      is missing if not compiled for fastcrc32. */
 138   crc_table = get_crc_table();
 139   jint *buf = (*env)->GetPrimitiveArrayCritical(env, b, 0);
 140   if (buf) {
 141         /* Don't know for sure how big an unsigned long is, therefore
 142            copy one at a time. */
 143         int i;
 144         for (i = 0; i < 256; i++) buf[i] = (jint) (crc_table[i]);
 145         (*env)->ReleasePrimitiveArrayCritical(env, b, buf, 0);
 146   }
 147 #if CAN_COMPILE_CLMUL
 148  if (use_clmul) {
 149       K_struct = init_crc_by128_K();
 150       useClmul = K_struct != 0;
 151       /* Rather than throw OOME, just do without fast CRC. */
 152   }
 153 #endif
 154   return useClmul;
 155 }
 156 
 157 JNIEXPORT jint JNICALL
 158 Java_java_util_zip_CRC32_update(JNIEnv *env, jclass cls, jint crc, jint b)
 159 {
 160     Bytef buf[1];
 161 
 162     buf[0] = (Bytef)b;
 163     return crc32(crc, buf, 1); // single byte not done quickly by fastcrc32
 164 }
 165 
 166 JNIEXPORT jint JNICALL
 167 Java_java_util_zip_CRC32_updateBytes(JNIEnv *env, jclass cls, jint crc,
 168                                      jarray b, jint off, jint len)
 169 {
 170     Bytef *buf = (*env)->GetPrimitiveArrayCritical(env, b, 0);
 171     if (buf) {
 172         crc = (jint) (useClmul ? fastcrc32(crc, buf + off, len) :
 173                                      crc32(crc, buf + off, len));
 174         (*env)->ReleasePrimitiveArrayCritical(env, b, buf, 0);
 175     }
 176     return crc;
 177 }
 178 
 179 JNIEXPORT jint ZIP_CRC32(jint crc, const jbyte *buf, jint len)
 180 {
 181     return (jint) (useClmul ? fastcrc32(crc, (Bytef*)buf, len) :
 182                                   crc32(crc, (Bytef*)buf, len));
 183 }
 184 
 185 JNIEXPORT jint JNICALL
 186 Java_java_util_zip_CRC32_updateByteBuffer(JNIEnv *env, jclass cls, jint crc,
 187                                           jlong address, jint off, jint len)
 188 {
 189     Bytef *buf = (Bytef *)jlong_to_ptr(address);
 190     if (buf) {
 191         crc = (jint) (useClmul ? fastcrc32(crc, buf + off, len) :
 192                                      crc32(crc, buf + off, len));
 193     }
 194     return crc;
 195 }
 196 
 197 #if CAN_COMPILE_CLMUL
 198 #ifndef NO_ASM
 199 
 200 /* set up the platform-specific glop surrounding the function body. */
 201 #  ifdef __x86_64
 202 #    ifdef __APPLE__
 203 #      define ASM_PREFIX ".text\n\t.align 8\n\t.globl _kernel\n_kernel:\n\t"
 204 #      define ASM_SUFFIX ""
 205 #    elif defined(__GNUC__)
 206 #      define ASM_PREFIX ".text\n\t.align 16\n\t.globl kernel\n\t.type kernel,@function\nkernel:\n\t"
 207 #      define ASM_SUFFIX ""
 208 #    elif defined(__SUNPRO_C)
 209 #      define ASM_PREFIX ".section .text,\"ax\"\n\t.align 16, 0x90\n\t.globl kernel\n\t.type kernel,@function\nkernel:\n\t"
 210 #      define ASM_SUFFIX ".size kernel,.-kernel"
 211 #    else
 212        /* Perhaps the mystery compiler can handle the intrinsics. */
 213 #      define NO_ASM 1
 214 #    endif
 215 
 216 #    ifndef NO_ASM
 217 __asm__(
 218         ASM_PREFIX
 219         " pushq  %rbp\n\t"
 220         " movq   %rsp, %rbp\n\t"
 221         " movl   %edi, %eax\n\t"
 222         " .byte  0xc5,0xf9,0x6f,0x06  # vmovdqa(%rsi), %xmm0\n\t"
 223         " .byte  0xc4,0xe1,0xf9,0x7e,0xc7  # vmovd  %xmm0, %rdi\n\t"
 224         " xorq   %rax, %rdi\n\t"
 225         " .byte  0xc4,0xe3,0xf9,0x22,0xd7,0x00  # vpinsrq$0, %rdi, %xmm0, %xmm2\n\t"
 226         " .byte  0xc5,0x79,0x6f,0x01  # vmovdqa(%rcx), %xmm8\n\t"
 227         " .byte  0xc5,0x79,0x6f,0x49,0x10  # vmovdqa16(%rcx), %xmm9\n\t"
 228         " movl   $1, %eax\n\t"
 229         " cmpl   $4, %edx\n\t"
 230         " jl     1f\n\t"
 231         " .byte  0xc5,0xf9,0x6f,0x6e,0x10  # vmovdqa16(%rsi), %xmm5\n\t"
 232         " .byte  0xc5,0xf9,0x6f,0x66,0x20  # vmovdqa32(%rsi), %xmm4\n\t"
 233         " .byte  0xc5,0xf9,0x6f,0x5e,0x30  # vmovdqa48(%rsi), %xmm3\n\t"
 234         " leal   -3(%rdx), %edi\n\t"
 235         " movl   $4, %eax\n\t"
 236         " cmpl   $5, %edi\n\t"
 237         " jl     2f\n\t"
 238         " .byte  0xc5,0xf9,0x6f,0x71,0x20  # vmovdqa32(%rcx), %xmm6\n\t"
 239         " leaq   112(%rsi), %rcx\n\t"
 240         " movl   $4, %eax\n\t"
 241         " .align  4, 0x90\n"
 242         "3: .byte  0xc4,0xe3,0x49,0x44,0xc2,0x00  # vpclmulqdq$0, %xmm2, %xmm6, %xmm0\n\t"
 243         " .byte  0xc4,0xe3,0x49,0x44,0xcb,0x11  # vpclmulqdq$17, %xmm3, %xmm6, %xmm1\n\t"
 244         " .byte  0xc4,0xe3,0x49,0x44,0xdb,0x00  # vpclmulqdq$0, %xmm3, %xmm6, %xmm3\n\t"
 245         " .byte  0xc5,0xe1,0xef,0x19  # vpxor  (%rcx), %xmm3, %xmm3\n\t"
 246         " .byte  0xc4,0xe3,0x49,0x44,0xfd,0x00  # vpclmulqdq$0, %xmm5, %xmm6, %xmm7\n\t"
 247         " .byte  0xc5,0xc1,0xef,0x79,0xe0  # vpxor  -32(%rcx), %xmm7, %xmm7\n\t"
 248         " .byte  0xc5,0xf1,0xef,0xdb  # vpxor  %xmm3, %xmm1, %xmm3\n\t"
 249         " .byte  0xc4,0xe3,0x49,0x44,0xd2,0x11  # vpclmulqdq$17, %xmm2, %xmm6, %xmm2\n\t"
 250         " .byte  0xc5,0xf9,0xef,0x41,0xd0  # vpxor  -48(%rcx), %xmm0, %xmm0\n\t"
 251         " .byte  0xc4,0xe3,0x49,0x44,0xcd,0x11  # vpclmulqdq$17, %xmm5, %xmm6, %xmm1\n\t"
 252         " .byte  0xc4,0xe3,0x49,0x44,0xec,0x11  # vpclmulqdq$17, %xmm4, %xmm6, %xmm5\n\t"
 253         " .byte  0xc4,0xe3,0x49,0x44,0xe4,0x00  # vpclmulqdq$0, %xmm4, %xmm6, %xmm4\n\t"
 254         " .byte  0xc5,0xd9,0xef,0x61,0xf0  # vpxor  -16(%rcx), %xmm4, %xmm4\n\t"
 255         " .byte  0xc5,0xd1,0xef,0xe4  # vpxor  %xmm4, %xmm5, %xmm4\n\t"
 256         " .byte  0xc5,0xf1,0xef,0xef  # vpxor  %xmm7, %xmm1, %xmm5\n\t"
 257         " .byte  0xc5,0xe9,0xef,0xd0  # vpxor  %xmm0, %xmm2, %xmm2\n\t"
 258         " addq   $64, %rcx\n\t"
 259         " addl   $4, %eax\n\t"
 260         " cmpl   %edi, %eax\n\t"
 261         " jl     3b\n"
 262         "2: .byte  0xc4,0xe3,0x31,0x44,0xc2,0x11  # vpclmulqdq$17, %xmm2, %xmm9, %xmm0\n\t"
 263         " .byte  0xc4,0xe3,0x31,0x44,0xca,0x00  # vpclmulqdq$0, %xmm2, %xmm9, %xmm1\n\t"
 264         " .byte  0xc5,0xd1,0xef,0xc9  # vpxor  %xmm1, %xmm5, %xmm1\n\t"
 265         " .byte  0xc5,0xf1,0xef,0xc8  # vpxor  %xmm0, %xmm1, %xmm1\n\t"
 266         " .byte  0xc4,0xe3,0x31,0x44,0xc1,0x11  # vpclmulqdq$17, %xmm1, %xmm9, %xmm0\n\t"
 267         " .byte  0xc4,0xe3,0x31,0x44,0xc9,0x00  # vpclmulqdq$0, %xmm1, %xmm9, %xmm1\n\t"
 268         " .byte  0xc5,0xd9,0xef,0xc9  # vpxor  %xmm1, %xmm4, %xmm1\n\t"
 269         " .byte  0xc5,0xf1,0xef,0xc8  # vpxor  %xmm0, %xmm1, %xmm1\n\t"
 270         " .byte  0xc4,0xe3,0x31,0x44,0xc1,0x11  # vpclmulqdq$17, %xmm1, %xmm9, %xmm0\n\t"
 271         " .byte  0xc4,0xe3,0x31,0x44,0xc9,0x00  # vpclmulqdq$0, %xmm1, %xmm9, %xmm1\n\t"
 272         " .byte  0xc5,0xe1,0xef,0xc9  # vpxor  %xmm1, %xmm3, %xmm1\n\t"
 273         " .byte  0xc5,0xf1,0xef,0xd0  # vpxor  %xmm0, %xmm1, %xmm2\n"
 274         "1: cmpl   %edx, %eax\n\t"
 275         " jge    4f\n\t"
 276         " subl   %eax, %edx\n\t"
 277         " movslq %eax, %rax\n\t"
 278         " shlq   $4, %rax\n\t"
 279         " addq   %rax, %rsi\n\t"
 280         " .align  4, 0x90\n"
 281         "5: .byte  0xc4,0xe3,0x31,0x44,0xc2,0x11  # vpclmulqdq$17, %xmm2, %xmm9, %xmm0\n\t"
 282         " .byte  0xc4,0xe3,0x31,0x44,0xca,0x00  # vpclmulqdq$0, %xmm2, %xmm9, %xmm1\n\t"
 283         " .byte  0xc5,0xf1,0xef,0x0e  # vpxor  (%rsi), %xmm1, %xmm1\n\t"
 284         " .byte  0xc5,0xf1,0xef,0xd0  # vpxor  %xmm0, %xmm1, %xmm2\n\t"
 285         " addq   $16, %rsi\n\t"
 286         " decl   %edx\n\t"
 287         " jne    5b\n"
 288         "4: .byte  0xc4,0xe3,0x39,0x44,0xc2,0x01  # vpclmulqdq$1, %xmm2, %xmm8, %xmm0\n\t"
 289         " .byte  0xc4,0xe1,0xf9,0x7e,0xc0  # vmovd  %xmm0, %rax\n\t"
 290         " .byte  0xc4,0xe3,0xf9,0x16,0xc1,0x01  # vpextrq$1, %xmm0, %rcx\n\t"
 291         " shldq  $32, %rax, %rcx\n\t"
 292         " .byte  0xc5,0xb9,0xdb,0xc0  # vpand  %xmm0, %xmm8, %xmm0\n\t"
 293         " .byte  0xc4,0xe3,0x39,0x44,0xc0,0x01  # vpclmulqdq$1, %xmm0, %xmm8, %xmm0\n\t"
 294         " .byte  0xc4,0xe1,0xf9,0x7e,0xc2  # vmovd  %xmm0, %rdx\n\t"
 295         " .byte  0xc4,0xe3,0xf9,0x16,0xd0,0x01  # vpextrq$1, %xmm2, %rax\n\t"
 296         " xorq   %rdx, %rax\n\t"
 297         " xorq   %rcx, %rax\n\t"
 298         " popq   %rbp\n\t"
 299         " ret\n"
 300         ASM_SUFFIX
 301         );
 302 #    endif
 303 #  elif defined(__i386)
 304 
 305 /* set up the platform-specific glop surrounding the function body. */
 306 #    ifdef __APPLE__
 307 #      define ASM_PREFIX ".text\n\t.align 16\n\t.globl _kernel\n_kernel:\n\t"
 308 #      define ASM_SUFFIX ""
 309 #    elif defined(__GNUC__)
 310 #      define ASM_PREFIX ".text\n\t.align 16\n\t.globl kernel\n\t.type kernel,@function\nkernel:\n\t"
 311 #      define ASM_SUFFIX ""
 312 #    elif defined(__SUNPRO_C)
 313 #      define ASM_PREFIX ".section .text,\"ax\"\n\t.align 16, 0x90\n\t.globl kernel\n\t.type kernel,@function\nkernel:\n\t"
 314 #      define ASM_SUFFIX ".size kernel,.-kernel"
 315 #    else
 316        /* Perhaps the mystery compiler can handle the intrinsics. */
 317 #      define NO_ASM 1
 318 #    endif
 319 
 320 #    ifndef NO_ASM
 321 __asm__(
 322         ASM_PREFIX
 323         " pushl  %ebp\n\t"
 324         " movl   %esp, %ebp\n\t"
 325         " pushl  %edi\n\t"
 326         " pushl  %esi\n\t"
 327         " movl   12(%ebp), %eax\n\t"
 328         " .byte  0xc5,0xf9,0x28,0x00  # vmovapd(%eax), %xmm0\n\t"
 329         " .byte  0xc5,0xf9,0x7e,0xc1  # vmovd  %xmm0, %ecx\n\t"
 330         " xorl   8(%ebp), %ecx\n\t"
 331         " .byte  0xc4,0xe3,0x79,0x22,0xc9,0x00  # vpinsrd$0, %ecx, %xmm0, %xmm1\n\t"
 332         " .byte  0xc4,0xe3,0x79,0x16,0xc1,0x01  # vpextrd$1, %xmm0, %ecx\n\t"
 333         " .byte  0xc4,0xe3,0x71,0x22,0xc9,0x01  # vpinsrd$1, %ecx, %xmm1, %xmm1\n\t"
 334         " movl   20(%ebp), %edi\n\t"
 335         " .byte  0xc5,0xf9,0x6f,0x07  # vmovdqa(%edi), %xmm0\n\t"
 336         " .byte  0xc5,0xf9,0x6f,0x57,0x10  # vmovdqa16(%edi), %xmm2\n\t"
 337         " movl   $1, %edx\n\t"
 338         " movl   16(%ebp), %ecx\n\t"
 339         " cmpl   $4, %ecx\n\t"
 340         " jl     1f\n\t"
 341         " .byte  0xc5,0xf9,0x6f,0x58,0x30  # vmovdqa48(%eax), %xmm3\n\t"
 342         " .byte  0xc5,0xf9,0x6f,0x68,0x10  # vmovdqa16(%eax), %xmm5\n\t"
 343         " .byte  0xc5,0xf9,0x6f,0x60,0x20  # vmovdqa32(%eax), %xmm4\n\t"
 344         " leal   -3(%ecx), %esi\n\t"
 345         " movl   $4, %edx\n\t"
 346         " cmpl   $5, %esi\n\t"
 347         " jl     2f\n\t"
 348         " .byte  0xc5,0xf9,0x6f,0x77,0x20  # vmovdqa32(%edi), %xmm6\n\t"
 349         " leal   112(%eax), %edi\n\t"
 350         " movl   $4, %edx\n\t"
 351         " .align  4, 0x90\n"
 352         "3: .byte  0xc4,0xe3,0x49,0x44,0xfb,0x11  # vpclmulqdq$17, %xmm3, %xmm6, %xmm7\n\t"
 353         " .byte  0xc4,0xe3,0x49,0x44,0xdb,0x00  # vpclmulqdq$0, %xmm3, %xmm6, %xmm3\n\t"
 354         " .byte  0xc5,0xe1,0xef,0x1f  # vpxor  (%edi), %xmm3, %xmm3\n\t"
 355         " .byte  0xc5,0xc1,0xef,0xdb  # vpxor  %xmm3, %xmm7, %xmm3\n\t"
 356         " .byte  0xc4,0xe3,0x49,0x44,0xfc,0x11  # vpclmulqdq$17, %xmm4, %xmm6, %xmm7\n\t"
 357         " .byte  0xc4,0xe3,0x49,0x44,0xe4,0x00  # vpclmulqdq$0, %xmm4, %xmm6, %xmm4\n\t"
 358         " .byte  0xc5,0xd9,0xef,0x67,0xf0  # vpxor  -16(%edi), %xmm4, %xmm4\n\t"
 359         " .byte  0xc5,0xc1,0xef,0xe4  # vpxor  %xmm4, %xmm7, %xmm4\n\t"
 360         " .byte  0xc4,0xe3,0x49,0x44,0xfd,0x11  # vpclmulqdq$17, %xmm5, %xmm6, %xmm7\n\t"
 361         " .byte  0xc4,0xe3,0x49,0x44,0xed,0x00  # vpclmulqdq$0, %xmm5, %xmm6, %xmm5\n\t"
 362         " .byte  0xc5,0xd1,0xef,0x6f,0xe0  # vpxor  -32(%edi), %xmm5, %xmm5\n\t"
 363         " .byte  0xc5,0xc1,0xef,0xed  # vpxor  %xmm5, %xmm7, %xmm5\n\t"
 364         " .byte  0xc4,0xe3,0x49,0x44,0xf9,0x11  # vpclmulqdq$17, %xmm1, %xmm6, %xmm7\n\t"
 365         " .byte  0xc4,0xe3,0x49,0x44,0xc9,0x00  # vpclmulqdq$0, %xmm1, %xmm6, %xmm1\n\t"
 366         " .byte  0xc5,0xf1,0xef,0x4f,0xd0  # vpxor  -48(%edi), %xmm1, %xmm1\n\t"
 367         " .byte  0xc5,0xc1,0xef,0xc9  # vpxor  %xmm1, %xmm7, %xmm1\n\t"
 368         " addl   $64, %edi\n\t"
 369         " addl   $4, %edx\n\t"
 370         " cmpl   %esi, %edx\n\t"
 371         " jl     3b\n"
 372         "2: .byte  0xc4,0xe3,0x69,0x44,0xf1,0x11  # vpclmulqdq$17, %xmm1, %xmm2, %xmm6\n\t"
 373         " .byte  0xc4,0xe3,0x69,0x44,0xc9,0x00  # vpclmulqdq$0, %xmm1, %xmm2, %xmm1\n\t"
 374         " .byte  0xc5,0xd1,0xef,0xc9  # vpxor  %xmm1, %xmm5, %xmm1\n\t"
 375         " .byte  0xc5,0xf1,0xef,0xee  # vpxor  %xmm6, %xmm1, %xmm5\n\t"
 376         " .byte  0xc4,0xe3,0x69,0x44,0xcd,0x11  # vpclmulqdq$17, %xmm5, %xmm2, %xmm1\n\t"
 377         " .byte  0xc4,0xe3,0x69,0x44,0xed,0x00  # vpclmulqdq$0, %xmm5, %xmm2, %xmm5\n\t"
 378         " .byte  0xc5,0xd9,0xef,0xe5  # vpxor  %xmm5, %xmm4, %xmm4\n\t"
 379         " .byte  0xc5,0xd9,0xef,0xe1  # vpxor  %xmm1, %xmm4, %xmm4\n\t"
 380         " .byte  0xc4,0xe3,0x69,0x44,0xcc,0x11  # vpclmulqdq$17, %xmm4, %xmm2, %xmm1\n\t"
 381         " .byte  0xc4,0xe3,0x69,0x44,0xe4,0x00  # vpclmulqdq$0, %xmm4, %xmm2, %xmm4\n\t"
 382         " .byte  0xc5,0xe1,0xef,0xdc  # vpxor  %xmm4, %xmm3, %xmm3\n\t"
 383         " .byte  0xc5,0xe1,0xef,0xc9  # vpxor  %xmm1, %xmm3, %xmm1\n"
 384         "1: cmpl   %ecx, %edx\n\t"
 385         " jge    4f\n\t"
 386         " subl   %edx, %ecx\n\t"
 387         " shll   $4, %edx\n\t"
 388         " addl   %edx, %eax\n\t"
 389         " .align  4, 0x90\n"
 390         "5: .byte  0xc4,0xe3,0x69,0x44,0xd9,0x11  # vpclmulqdq$17, %xmm1, %xmm2, %xmm3\n\t"
 391         " .byte  0xc4,0xe3,0x69,0x44,0xc9,0x00  # vpclmulqdq$0, %xmm1, %xmm2, %xmm1\n\t"
 392         " .byte  0xc5,0xf1,0xef,0x08  # vpxor  (%eax), %xmm1, %xmm1\n\t"
 393         " .byte  0xc5,0xf1,0xef,0xcb  # vpxor  %xmm3, %xmm1, %xmm1\n\t"
 394         " addl   $16, %eax\n\t"
 395         " decl   %ecx\n\t"
 396         " jne    5b\n"
 397         "4: .byte  0xc4,0xe3,0x79,0x44,0xd1,0x01  # vpclmulqdq$1, %xmm1, %xmm0, %xmm2\n\t"
 398         " .byte  0xc5,0xf9,0xdb,0xda  # vpand  %xmm2, %xmm0, %xmm3\n\t"
 399         " .byte  0xc4,0xe3,0x79,0x44,0xc3,0x01  # vpclmulqdq$1, %xmm3, %xmm0, %xmm0\n\t"
 400         " .byte  0xc5,0xf9,0x7e,0xc0  # vmovd  %xmm0, %eax\n\t"
 401         " .byte  0xc4,0xe3,0x79,0x16,0xc9,0x02  # vpextrd$2, %xmm1, %ecx\n\t"
 402         " xorl   %eax, %ecx\n\t"
 403         " .byte  0xc4,0xe3,0x79,0x16,0xd0,0x01  # vpextrd$1, %xmm2, %eax\n\t"
 404         " xorl   %ecx, %eax\n\t"
 405         " .byte  0xc4,0xe3,0x79,0x16,0xc2,0x01  # vpextrd$1, %xmm0, %edx\n\t"
 406         " .byte  0xc4,0xe3,0x79,0x16,0xc9,0x03  # vpextrd$3, %xmm1, %ecx\n\t"
 407         " xorl   %edx, %ecx\n\t"
 408         " .byte  0xc4,0xe3,0x79,0x16,0xd2,0x02  # vpextrd$2, %xmm2, %edx\n\t"
 409         " xorl   %ecx, %edx\n\t"
 410         " popl   %esi\n\t"
 411         " popl   %edi\n\t"
 412         " popl   %ebp\n\t"
 413         " ret\n"
 414         ASM_SUFFIX
 415         );
 416 #    endif
 417 #  else /* architecture type */
 418 /* Not intel, not that the C intrinsics will compile anywhere else,
 419  * but it will be a slightly better error message.
 420  */
 421 #    define NO_ASM 1
 422 #  endif
 423 #endif /* NO_ASM */
 424 
 425 #ifndef NO_ASM
 426 /* Declaration for use below. */
 427 uint64_t kernel(uint32_t c, unsigned char * buf, int len_128bit, struct crc_by128_K * K);
 428 #else
 429 #pragma message("Compiling 'kernel' from C source with intrinsics")
 430 #include <wmmintrin.h>
 431 #include <emmintrin.h>
 432 
 433 union u {
 434     __m128i v;
 435     struct {
 436         uint64_t lo;
 437         uint64_t hi;
 438     };
 439 };
 440 
 441 /**
 442  * Assume c is existing crc,
 443  * buf is 16-byte-aligned,
 444  * len is a multiple of 16 greater than zero.
 445  */
 446 uint64_t kernel(uint32_t c, unsigned char * buf, int len_128bit,
 447         struct crc_by128_K * K) {
 448 
 449     __m128i * b = (__m128i *) buf;
 450     int i = 0;
 451 
 452     /* 128 bit constants and variables. */
 453     __m128i K_544_480, K_160_96, K_M_64,
 454     x0, x1, x2, x3,
 455     x0a, x1a, x2a, x3a,
 456     x0b, x1b, x2b, x3b;
 457 
 458     /* Use these to move data between xmm registers and "normal" registers. */
 459     union u ut0, ut1, ut2, ut3;
 460 
 461     K_544_480 = * (__m128i *) & (K -> xtt544);
 462     K_160_96 = * (__m128i *) & (K -> xtt160);
 463     K_M_64 = * (__m128i *) & (K -> mask);
 464 
 465     /* Incorporate existing CRC into first item */
 466     ut0.v = b[0];
 467     ut0.lo ^= c;
 468     x0 = ut0.v;
 469 
 470     if (len_128bit >= 4) {
 471         /* Written as a slightly pipelined loop. */
 472 
 473         x1 = b[1];
 474         x2 = b[2];
 475         x3 = b[3];
 476 
 477         /* Iterate once if len_128bit is between 8 and 11
 478          * 4 < 8-3 < 11 - 3
 479          * 8 !< 11 - 3 < 12 - 3.
 480          *
 481          * 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12
 482          *
 483          */
 484         for (i = 4; i < len_128bit - 3 ; i+= 4) {
 485             /* Each iteration of this loop folds the 512 bits of polynomial
 486              * in x0-x3 with the data in b[i]..b[i+3].
 487              */
 488             x0a = b[i];
 489             x1a = b[i+1];
 490             x2a = b[i+2];
 491             x3a = b[i+3];
 492 
 493             x0b = _mm_clmulepi64_si128(K_544_480, x0, 0x00);
 494             x0 = _mm_clmulepi64_si128(K_544_480, x0, 0x11);
 495             x1b = _mm_clmulepi64_si128(K_544_480, x1, 0x00);
 496             x1 = _mm_clmulepi64_si128(K_544_480, x1, 0x11);
 497 
 498             x2b = _mm_clmulepi64_si128(K_544_480, x2, 0x00);
 499             x2 = _mm_clmulepi64_si128(K_544_480, x2, 0x11);
 500             x3b = _mm_clmulepi64_si128(K_544_480, x3, 0x00);
 501             x3 = _mm_clmulepi64_si128(K_544_480, x3, 0x11);
 502 
 503             // x0 ^= x0a ^ x0b;
 504             x0 = _mm_xor_si128(x0, x0a);
 505             x0 = _mm_xor_si128(x0, x0b);
 506             // x1 ^= x1a ^ x1b;
 507             x1 = _mm_xor_si128(x1, x1a);
 508             x1 = _mm_xor_si128(x1, x1b);
 509             // x2 ^= x2a ^ x2b;
 510             x2 = _mm_xor_si128(x2, x2a);
 511             x2 = _mm_xor_si128(x2, x2b);
 512             // x3 ^= x3a ^ x3b;
 513             x3 = _mm_xor_si128(x3, x3a);
 514             x3 = _mm_xor_si128(x3, x3b);
 515         }
 516         /* x0 - x3 contains 4 x 128 bits of accumulated result.
 517          * 0-3 hexads potentially remain in [i,len_128bit) entries.
 518          * Assume trailing bytes beyond that are handled by our caller.
 519          */
 520         x0a = _mm_clmulepi64_si128(K_160_96, x0, 0x00);
 521         x0b = _mm_clmulepi64_si128(K_160_96, x0, 0x11);
 522         x1 = _mm_xor_si128(x1, x0a);
 523         x1 = _mm_xor_si128(x1, x0b);
 524         x0a = _mm_clmulepi64_si128(K_160_96, x1, 0x00);
 525         x0b = _mm_clmulepi64_si128(K_160_96, x1, 0x11);
 526         x2 = _mm_xor_si128(x2, x0a);
 527         x2 = _mm_xor_si128(x2, x0b);
 528         x0a = _mm_clmulepi64_si128(K_160_96, x2, 0x00);
 529         x0b = _mm_clmulepi64_si128(K_160_96, x2, 0x11);
 530         x3 = _mm_xor_si128(x3, x0a);
 531         x3 = _mm_xor_si128(x3, x0b);
 532     } else {
 533         /* Loaded 128 bits already into x0.
 534          */
 535         x3 = x0;
 536         i = 1;
 537     }
 538 
 539     /* x3 is now 128-bit result.
 540      * Fold 0-3 128-bit chunks into x3.
 541      */
 542     for (; i < len_128bit; i++) {
 543         x0 = b[i]; // data to fold
 544         // fold x3 down by 128 to align with data.
 545         x0a = _mm_clmulepi64_si128(K_160_96, x3, 0x00);
 546         x0b = _mm_clmulepi64_si128(K_160_96, x3, 0x11);
 547         x3 = _mm_xor_si128(x0, x0a);
 548         x3 = _mm_xor_si128(x3, x0b);
 549         // x3 is now aligned with data we just loaded.
 550     }
 551 
 552     /*
 553      * No more 128bits remain.
 554      * Fold x3 down into 32 bits.
 555      */
 556     {
 557         ut0.v = x3;
 558         uint64_t w;
 559         uint64_t y = ut0.hi; // 64 low-order terms of polynomial into y.
 560 
 561         /* polynomial term order:
 562          * high -> low
 563          * bit number order
 564          * 0 -> 127
 565          *
 566          * input, from which y was just extracted.
 567          * w0 w1 y0 y1
 568          * w0:w1 * x64 yields 96 bits.
 569          * p0:p1:p2:__ (aligned wrong, store to extract p1 and p2)
 570          * p0:p1:__:__ & ff:00:__:__ (mask to get rid of p1)
 571          * p0:00:__:__
 572          * p0:00 * x64 (times x64 yields 64 bits)
 573          * r0:r1 store and xor.
 574          */
 575 
 576         x0 = _mm_clmulepi64_si128(K_M_64, x3, 0x01);
 577         ut1.v = x0;
 578         w = (ut1.lo >> 32) + (ut1.hi << 32); // extract low-poly 64 bits.
 579         x0 = _mm_and_si128(K_M_64, x0); // mask away what we just extracted..
 580         x0 = _mm_clmulepi64_si128(K_M_64, x0, 0x01);
 581         w ^= y;
 582         ut2.v = x0;
 583         w ^= ut2.lo;
 584 
 585         return w;
 586     }
 587 }
 588 #endif /* NO_ASM */
 589 
 590 uint32_t fastcrc32(jint crc, Bytef * buf, jint len) {
 591     const unsigned long FAR * timesXtoThe32 = crc_table;
 592     intptr_t ibuf = (intptr_t) buf;
 593     int log_align = 4;
 594     int align = 1 << log_align;
 595     int mask = align - 1;
 596     int islop = (align - ibuf) & mask;
 597     uint32_t c = ~crc;
 598     int i = 0;
 599 
 600     if (len - islop >= align) {
 601         /* Handle bytes preceding 16-byte alignment. */
 602         for (i = 0; i < islop; i++ ) {
 603             uint32_t x0 = buf[i];
 604             x0 = timesXtoThe32[(x0 ^ c) & 0xFF];
 605             c = x0 ^ (c >> 8);
 606         }
 607         buf += i;
 608         len -= i;
 609 
 610         jint len_128bit = len >> log_align;
 611 
 612         if (len_128bit > 0) {
 613             uint64_t w = kernel(c, buf, len_128bit, K_struct);
 614             /*
 615              * 8 8-bit folds to compute 32-bit CRC.
 616              */
 617             w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
 618             w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
 619             w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
 620             w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
 621             w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
 622             w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
 623             w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
 624             w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
 625             c = (uint32_t) w;
 626             i = len_128bit << log_align;
 627         } else {
 628             i = 0;
 629         }
 630     }
 631     /* Handle short CRC and tail of long CRC */
 632     for (; i < len; i++) {
 633         uint32_t x0 = buf[i];
 634         x0 = timesXtoThe32[(x0 ^ c) & 0xFF];
 635         c = x0 ^ (c >> 8);
 636     }
 637     return ~c;
 638 }
 639 #endif