/*
 * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

/*
 * Native method support for java.util.zip.CRC32
 */

#include "jni.h"
#include "jni_util.h"
#include <zlib.h>
#include "java_util_zip_CRC32.h"

/* Define CAN_COMPILE_CLMUL as 0 to disable fastcrc32 completely. */
#ifndef CAN_COMPILE_CLMUL
/* Windows is not supported -- different assembly language syntax, and
   though the newer compilers support the intrinsics, the generated code
   is not very good. */
# ifndef _WIN32
#  ifdef __x86_64
#   define CAN_COMPILE_CLMUL 1
#  elif defined(__i386)
#   define CAN_COMPILE_CLMUL 1
#  endif
# endif /* _WIN32 */
#endif

#if CAN_COMPILE_CLMUL
#include <stdint.h>
#include <stdlib.h>

struct crc_by128_K {
    /* The fields in this structure are arranged so that if it is
     * allocated at a 16-byte alignment they can be picked up two at
     * a time with 128-bit loads.
     *
     * Because of the flipped bit order used for this CRC polynomial,
     * the constant for X**N is left-shifted by 1.  This is because a
     * 64 x 64 polynomial multiply produces a 127-bit result, but the
     * highest term is always aligned to bit 0 in the container.
     * Pre-shifting by one fixes this, at the cost of potentially making
     * the 32-bit constant no longer fit in a 32-bit container (thus the
     * use of uint64_t, though this is also the size used by the carry-
     * less multiply instruction).
     *
     * In addition, the flipped bit order and the highest-term-at-least-
     * significant-bit multiply change the constants used.  The 96-bit
     * result will be aligned to the high-term end of the target 128-bit
     * container, not the low-term end; that is, instead of a 512-bit or
     * 576-bit fold, it is a 480 (= 512 - 32) or 544 (= 512 + 64 - 32)
     * bit fold.
     *
     * This causes additional problems in the 128-to-64-bit reduction;
     * see the code for details.  By storing a mask in the otherwise
     * unused half of a 128-bit constant, bits can be cleared before
     * multiplication without storing and reloading.  Note that staying
     * on a 128-bit datapath means that some data is uselessly stored
     * and some unused data is intersected with an irrelevant constant.
     */

    uint64_t mask;   /* low of K_M_64     */
    uint64_t xtt64;  /* high of K_M_64    */
    uint64_t xtt160; /* low of K_160_96   */
    uint64_t xtt96;  /* high of K_160_96  */
    uint64_t xtt544; /* low of K_544_480  */
    uint64_t xtt480; /* high of K_544_480 */
};

struct crc_by128_K * K_struct = 0;

/* X**N mod P(x), bit-reflected and pre-shifted by 1 as described above.
   (A compiled-out sanity check at the end of this file shows one way to
   re-derive these values.) */
static const uint64_t x64  = (uint64_t) 0xb1e6b092U << 1;
static const uint64_t x96  = (uint64_t) 0x6655004fU << 1;
static const uint64_t x160 = (uint64_t) 0xba8ccbe8U << 1;
static const uint64_t x480 = (uint64_t) 0xe3720acbU << 1;
static const uint64_t x544 = (uint64_t) 0xaa2215eaU << 1;

static struct crc_by128_K * init_crc_by128_K() {
    void * y;
    y = malloc(16 + sizeof(struct crc_by128_K)); // posix_memalign not available on all platforms
    if (y == NULL) {
        return (struct crc_by128_K *) NULL;
    }
    /* Round up to the next 16-byte boundary by hand. */
    uint8_t * z = (uint8_t *) y;
    intptr_t p = (intptr_t) z;
    z += (16 - p) & 15;
    struct crc_by128_K * x = (struct crc_by128_K *) z;
    x -> mask   = 0xffffffffUL;
    x -> xtt64  = x64;
    x -> xtt160 = x160;
    x -> xtt96  = x96;
    x -> xtt544 = x544;
    x -> xtt480 = x480;
    return x;
}

uint32_t fastcrc32(jint crc, Bytef * buf, jint len);

/* Flag governing use of the "CLMUL" instruction.  For now, it implies
   little-endian.  Computed dynamically; it incorporates information about
   the current hardware and the compiler used to compile this file. */
static int useClmul = 0;
#else
/* Cannot compile CLMUL; stub out fastcrc32. */
# define fastcrc32 crc32
# define useClmul 0
#endif

/* A local copy of the CRC32 table is used to fill and drain the CLMUL CRC.
   Extra members beyond the first 256-entry row are ignored. */
static const unsigned long FAR * crc_table;

/* Initialize the Java-side table (for small CRCs) to avoid extra startup
   work, and capture the platform-dependent useClmul flag. */
JNIEXPORT jboolean JNICALL
Java_java_util_zip_CRC32_init(JNIEnv *env, jclass cls, jarray b, jboolean use_clmul)
{
    /* Get the CRC table from zlib to initialize the Java side.  Our
       private copy is missing if not compiled for fastcrc32. */
    jint *buf = (*env)->GetPrimitiveArrayCritical(env, b, 0);
    crc_table = get_crc_table();

    if (buf) {
        /* Don't know for sure how big an unsigned long is, therefore
           copy one entry at a time. */
        int i;
        for (i = 0; i < 256; i++)
            buf[i] = (jint) (crc_table[i]);
        (*env)->ReleasePrimitiveArrayCritical(env, b, buf, 0);
    }
#if CAN_COMPILE_CLMUL
    if (use_clmul) {
        K_struct = init_crc_by128_K();
        useClmul = K_struct != 0;
        /* Rather than throw OOME, just do without fast CRC. */
    }
#endif
    return useClmul;
}

JNIEXPORT jint JNICALL
Java_java_util_zip_CRC32_update(JNIEnv *env, jclass cls, jint crc, jint b)
{
    Bytef buf[1];

    buf[0] = (Bytef)b;
    return crc32(crc, buf, 1); // single byte not done quickly by fastcrc32
}

JNIEXPORT jint JNICALL
Java_java_util_zip_CRC32_updateBytes(JNIEnv *env, jclass cls, jint crc,
                                     jarray b, jint off, jint len)
{
    Bytef *buf = (*env)->GetPrimitiveArrayCritical(env, b, 0);
    if (buf) {
        crc = (jint) (useClmul ?
                      fastcrc32(crc, buf + off, len) :
                      crc32(crc, buf + off, len));
        (*env)->ReleasePrimitiveArrayCritical(env, b, buf, 0);
    }
    return crc;
}

JNIEXPORT jint
ZIP_CRC32(jint crc, const jbyte *buf, jint len)
{
    return (jint) (useClmul ?
                   fastcrc32(crc, (Bytef*)buf, len) :
                   crc32(crc, (Bytef*)buf, len));
}

JNIEXPORT jint JNICALL
Java_java_util_zip_CRC32_updateByteBuffer(JNIEnv *env, jclass cls, jint crc,
                                          jlong address, jint off, jint len)
{
    Bytef *buf = (Bytef *)jlong_to_ptr(address);
    if (buf) {
        crc = (jint) (useClmul ?
                      fastcrc32(crc, buf + off, len) :
                      crc32(crc, buf + off, len));
    }
    return crc;
}

#if CAN_COMPILE_CLMUL
#ifndef NO_ASM

/* set up the platform-specific glop surrounding the function body.
*/ # ifdef __x86_64 # ifdef __APPLE__ # define ASM_PREFIX ".text\n\t.align 8\n\t.globl _kernel\n_kernel:\n\t" # define ASM_SUFFIX "" # elif defined(__GNUC__) # define ASM_PREFIX ".text\n\t.align 16\n\t.globl kernel\n\t.type kernel,@function\nkernel:\n\t" # define ASM_SUFFIX "" # elif defined(__SUNPRO_C) # define ASM_PREFIX ".section .text,\"ax\"\n\t.align 16, 0x90\n\t.globl kernel\n\t.type kernel,@function\nkernel:\n\t" # define ASM_SUFFIX ".size kernel,.-kernel" # else /* Perhaps the mystery compiler can handle the intrinsics. */ # define NO_ASM 1 # endif # ifndef NO_ASM __asm__( ASM_PREFIX " pushq %rbp\n\t" " movq %rsp, %rbp\n\t" " movl %edi, %eax\n\t" " .byte 0xc5,0xf9,0x6f,0x06 # vmovdqa(%rsi), %xmm0\n\t" " .byte 0xc4,0xe1,0xf9,0x7e,0xc7 # vmovd %xmm0, %rdi\n\t" " xorq %rax, %rdi\n\t" " .byte 0xc4,0xe3,0xf9,0x22,0xd7,0x00 # vpinsrq$0, %rdi, %xmm0, %xmm2\n\t" " .byte 0xc5,0x79,0x6f,0x01 # vmovdqa(%rcx), %xmm8\n\t" " .byte 0xc5,0x79,0x6f,0x49,0x10 # vmovdqa16(%rcx), %xmm9\n\t" " movl $1, %eax\n\t" " cmpl $4, %edx\n\t" " jl 1f\n\t" " .byte 0xc5,0xf9,0x6f,0x6e,0x10 # vmovdqa16(%rsi), %xmm5\n\t" " .byte 0xc5,0xf9,0x6f,0x66,0x20 # vmovdqa32(%rsi), %xmm4\n\t" " .byte 0xc5,0xf9,0x6f,0x5e,0x30 # vmovdqa48(%rsi), %xmm3\n\t" " leal -3(%rdx), %edi\n\t" " movl $4, %eax\n\t" " cmpl $5, %edi\n\t" " jl 2f\n\t" " .byte 0xc5,0xf9,0x6f,0x71,0x20 # vmovdqa32(%rcx), %xmm6\n\t" " leaq 112(%rsi), %rcx\n\t" " movl $4, %eax\n\t" " .align 4, 0x90\n" "3: .byte 0xc4,0xe3,0x49,0x44,0xc2,0x00 # vpclmulqdq$0, %xmm2, %xmm6, %xmm0\n\t" " .byte 0xc4,0xe3,0x49,0x44,0xcb,0x11 # vpclmulqdq$17, %xmm3, %xmm6, %xmm1\n\t" " .byte 0xc4,0xe3,0x49,0x44,0xdb,0x00 # vpclmulqdq$0, %xmm3, %xmm6, %xmm3\n\t" " .byte 0xc5,0xe1,0xef,0x19 # vpxor (%rcx), %xmm3, %xmm3\n\t" " .byte 0xc4,0xe3,0x49,0x44,0xfd,0x00 # vpclmulqdq$0, %xmm5, %xmm6, %xmm7\n\t" " .byte 0xc5,0xc1,0xef,0x79,0xe0 # vpxor -32(%rcx), %xmm7, %xmm7\n\t" " .byte 0xc5,0xf1,0xef,0xdb # vpxor %xmm3, %xmm1, %xmm3\n\t" " .byte 0xc4,0xe3,0x49,0x44,0xd2,0x11 # vpclmulqdq$17, %xmm2, %xmm6, %xmm2\n\t" " .byte 0xc5,0xf9,0xef,0x41,0xd0 # vpxor -48(%rcx), %xmm0, %xmm0\n\t" " .byte 0xc4,0xe3,0x49,0x44,0xcd,0x11 # vpclmulqdq$17, %xmm5, %xmm6, %xmm1\n\t" " .byte 0xc4,0xe3,0x49,0x44,0xec,0x11 # vpclmulqdq$17, %xmm4, %xmm6, %xmm5\n\t" " .byte 0xc4,0xe3,0x49,0x44,0xe4,0x00 # vpclmulqdq$0, %xmm4, %xmm6, %xmm4\n\t" " .byte 0xc5,0xd9,0xef,0x61,0xf0 # vpxor -16(%rcx), %xmm4, %xmm4\n\t" " .byte 0xc5,0xd1,0xef,0xe4 # vpxor %xmm4, %xmm5, %xmm4\n\t" " .byte 0xc5,0xf1,0xef,0xef # vpxor %xmm7, %xmm1, %xmm5\n\t" " .byte 0xc5,0xe9,0xef,0xd0 # vpxor %xmm0, %xmm2, %xmm2\n\t" " addq $64, %rcx\n\t" " addl $4, %eax\n\t" " cmpl %edi, %eax\n\t" " jl 3b\n" "2: .byte 0xc4,0xe3,0x31,0x44,0xc2,0x11 # vpclmulqdq$17, %xmm2, %xmm9, %xmm0\n\t" " .byte 0xc4,0xe3,0x31,0x44,0xca,0x00 # vpclmulqdq$0, %xmm2, %xmm9, %xmm1\n\t" " .byte 0xc5,0xd1,0xef,0xc9 # vpxor %xmm1, %xmm5, %xmm1\n\t" " .byte 0xc5,0xf1,0xef,0xc8 # vpxor %xmm0, %xmm1, %xmm1\n\t" " .byte 0xc4,0xe3,0x31,0x44,0xc1,0x11 # vpclmulqdq$17, %xmm1, %xmm9, %xmm0\n\t" " .byte 0xc4,0xe3,0x31,0x44,0xc9,0x00 # vpclmulqdq$0, %xmm1, %xmm9, %xmm1\n\t" " .byte 0xc5,0xd9,0xef,0xc9 # vpxor %xmm1, %xmm4, %xmm1\n\t" " .byte 0xc5,0xf1,0xef,0xc8 # vpxor %xmm0, %xmm1, %xmm1\n\t" " .byte 0xc4,0xe3,0x31,0x44,0xc1,0x11 # vpclmulqdq$17, %xmm1, %xmm9, %xmm0\n\t" " .byte 0xc4,0xe3,0x31,0x44,0xc9,0x00 # vpclmulqdq$0, %xmm1, %xmm9, %xmm1\n\t" " .byte 0xc5,0xe1,0xef,0xc9 # vpxor %xmm1, %xmm3, %xmm1\n\t" " .byte 0xc5,0xf1,0xef,0xd0 # vpxor %xmm0, %xmm1, %xmm2\n" "1: cmpl %edx, %eax\n\t" " 
jge 4f\n\t" " subl %eax, %edx\n\t" " movslq %eax, %rax\n\t" " shlq $4, %rax\n\t" " addq %rax, %rsi\n\t" " .align 4, 0x90\n" "5: .byte 0xc4,0xe3,0x31,0x44,0xc2,0x11 # vpclmulqdq$17, %xmm2, %xmm9, %xmm0\n\t" " .byte 0xc4,0xe3,0x31,0x44,0xca,0x00 # vpclmulqdq$0, %xmm2, %xmm9, %xmm1\n\t" " .byte 0xc5,0xf1,0xef,0x0e # vpxor (%rsi), %xmm1, %xmm1\n\t" " .byte 0xc5,0xf1,0xef,0xd0 # vpxor %xmm0, %xmm1, %xmm2\n\t" " addq $16, %rsi\n\t" " decl %edx\n\t" " jne 5b\n" "4: .byte 0xc4,0xe3,0x39,0x44,0xc2,0x01 # vpclmulqdq$1, %xmm2, %xmm8, %xmm0\n\t" " .byte 0xc4,0xe1,0xf9,0x7e,0xc0 # vmovd %xmm0, %rax\n\t" " .byte 0xc4,0xe3,0xf9,0x16,0xc1,0x01 # vpextrq$1, %xmm0, %rcx\n\t" " shldq $32, %rax, %rcx\n\t" " .byte 0xc5,0xb9,0xdb,0xc0 # vpand %xmm0, %xmm8, %xmm0\n\t" " .byte 0xc4,0xe3,0x39,0x44,0xc0,0x01 # vpclmulqdq$1, %xmm0, %xmm8, %xmm0\n\t" " .byte 0xc4,0xe1,0xf9,0x7e,0xc2 # vmovd %xmm0, %rdx\n\t" " .byte 0xc4,0xe3,0xf9,0x16,0xd0,0x01 # vpextrq$1, %xmm2, %rax\n\t" " xorq %rdx, %rax\n\t" " xorq %rcx, %rax\n\t" " popq %rbp\n\t" " ret\n" ASM_SUFFIX ); # endif # elif defined(__i386) /* set up the platform-specific glop surrounding the function body. */ # ifdef __APPLE__ # define ASM_PREFIX ".text\n\t.align 16\n\t.globl _kernel\n_kernel:\n\t" # define ASM_SUFFIX "" # elif defined(__GNUC__) # define ASM_PREFIX ".text\n\t.align 16\n\t.globl kernel\n\t.type kernel,@function\nkernel:\n\t" # define ASM_SUFFIX "" # elif defined(__SUNPRO_C) # define ASM_PREFIX ".section .text,\"ax\"\n\t.align 16, 0x90\n\t.globl kernel\n\t.type kernel,@function\nkernel:\n\t" # define ASM_SUFFIX ".size kernel,.-kernel" # else /* Perhaps the mystery compiler can handle the intrinsics. */ # define NO_ASM 1 # endif # ifndef NO_ASM __asm__( ASM_PREFIX " pushl %ebp\n\t" " movl %esp, %ebp\n\t" " pushl %edi\n\t" " pushl %esi\n\t" " movl 12(%ebp), %eax\n\t" " .byte 0xc5,0xf9,0x28,0x00 # vmovapd(%eax), %xmm0\n\t" " .byte 0xc5,0xf9,0x7e,0xc1 # vmovd %xmm0, %ecx\n\t" " xorl 8(%ebp), %ecx\n\t" " .byte 0xc4,0xe3,0x79,0x22,0xc9,0x00 # vpinsrd$0, %ecx, %xmm0, %xmm1\n\t" " .byte 0xc4,0xe3,0x79,0x16,0xc1,0x01 # vpextrd$1, %xmm0, %ecx\n\t" " .byte 0xc4,0xe3,0x71,0x22,0xc9,0x01 # vpinsrd$1, %ecx, %xmm1, %xmm1\n\t" " movl 20(%ebp), %edi\n\t" " .byte 0xc5,0xf9,0x6f,0x07 # vmovdqa(%edi), %xmm0\n\t" " .byte 0xc5,0xf9,0x6f,0x57,0x10 # vmovdqa16(%edi), %xmm2\n\t" " movl $1, %edx\n\t" " movl 16(%ebp), %ecx\n\t" " cmpl $4, %ecx\n\t" " jl 1f\n\t" " .byte 0xc5,0xf9,0x6f,0x58,0x30 # vmovdqa48(%eax), %xmm3\n\t" " .byte 0xc5,0xf9,0x6f,0x68,0x10 # vmovdqa16(%eax), %xmm5\n\t" " .byte 0xc5,0xf9,0x6f,0x60,0x20 # vmovdqa32(%eax), %xmm4\n\t" " leal -3(%ecx), %esi\n\t" " movl $4, %edx\n\t" " cmpl $5, %esi\n\t" " jl 2f\n\t" " .byte 0xc5,0xf9,0x6f,0x77,0x20 # vmovdqa32(%edi), %xmm6\n\t" " leal 112(%eax), %edi\n\t" " movl $4, %edx\n\t" " .align 4, 0x90\n" "3: .byte 0xc4,0xe3,0x49,0x44,0xfb,0x11 # vpclmulqdq$17, %xmm3, %xmm6, %xmm7\n\t" " .byte 0xc4,0xe3,0x49,0x44,0xdb,0x00 # vpclmulqdq$0, %xmm3, %xmm6, %xmm3\n\t" " .byte 0xc5,0xe1,0xef,0x1f # vpxor (%edi), %xmm3, %xmm3\n\t" " .byte 0xc5,0xc1,0xef,0xdb # vpxor %xmm3, %xmm7, %xmm3\n\t" " .byte 0xc4,0xe3,0x49,0x44,0xfc,0x11 # vpclmulqdq$17, %xmm4, %xmm6, %xmm7\n\t" " .byte 0xc4,0xe3,0x49,0x44,0xe4,0x00 # vpclmulqdq$0, %xmm4, %xmm6, %xmm4\n\t" " .byte 0xc5,0xd9,0xef,0x67,0xf0 # vpxor -16(%edi), %xmm4, %xmm4\n\t" " .byte 0xc5,0xc1,0xef,0xe4 # vpxor %xmm4, %xmm7, %xmm4\n\t" " .byte 0xc4,0xe3,0x49,0x44,0xfd,0x11 # vpclmulqdq$17, %xmm5, %xmm6, %xmm7\n\t" " .byte 0xc4,0xe3,0x49,0x44,0xed,0x00 # vpclmulqdq$0, %xmm5, %xmm6, %xmm5\n\t" " 
.byte 0xc5,0xd1,0xef,0x6f,0xe0 # vpxor -32(%edi), %xmm5, %xmm5\n\t" " .byte 0xc5,0xc1,0xef,0xed # vpxor %xmm5, %xmm7, %xmm5\n\t" " .byte 0xc4,0xe3,0x49,0x44,0xf9,0x11 # vpclmulqdq$17, %xmm1, %xmm6, %xmm7\n\t" " .byte 0xc4,0xe3,0x49,0x44,0xc9,0x00 # vpclmulqdq$0, %xmm1, %xmm6, %xmm1\n\t" " .byte 0xc5,0xf1,0xef,0x4f,0xd0 # vpxor -48(%edi), %xmm1, %xmm1\n\t" " .byte 0xc5,0xc1,0xef,0xc9 # vpxor %xmm1, %xmm7, %xmm1\n\t" " addl $64, %edi\n\t" " addl $4, %edx\n\t" " cmpl %esi, %edx\n\t" " jl 3b\n" "2: .byte 0xc4,0xe3,0x69,0x44,0xf1,0x11 # vpclmulqdq$17, %xmm1, %xmm2, %xmm6\n\t" " .byte 0xc4,0xe3,0x69,0x44,0xc9,0x00 # vpclmulqdq$0, %xmm1, %xmm2, %xmm1\n\t" " .byte 0xc5,0xd1,0xef,0xc9 # vpxor %xmm1, %xmm5, %xmm1\n\t" " .byte 0xc5,0xf1,0xef,0xee # vpxor %xmm6, %xmm1, %xmm5\n\t" " .byte 0xc4,0xe3,0x69,0x44,0xcd,0x11 # vpclmulqdq$17, %xmm5, %xmm2, %xmm1\n\t" " .byte 0xc4,0xe3,0x69,0x44,0xed,0x00 # vpclmulqdq$0, %xmm5, %xmm2, %xmm5\n\t" " .byte 0xc5,0xd9,0xef,0xe5 # vpxor %xmm5, %xmm4, %xmm4\n\t" " .byte 0xc5,0xd9,0xef,0xe1 # vpxor %xmm1, %xmm4, %xmm4\n\t" " .byte 0xc4,0xe3,0x69,0x44,0xcc,0x11 # vpclmulqdq$17, %xmm4, %xmm2, %xmm1\n\t" " .byte 0xc4,0xe3,0x69,0x44,0xe4,0x00 # vpclmulqdq$0, %xmm4, %xmm2, %xmm4\n\t" " .byte 0xc5,0xe1,0xef,0xdc # vpxor %xmm4, %xmm3, %xmm3\n\t" " .byte 0xc5,0xe1,0xef,0xc9 # vpxor %xmm1, %xmm3, %xmm1\n" "1: cmpl %ecx, %edx\n\t" " jge 4f\n\t" " subl %edx, %ecx\n\t" " shll $4, %edx\n\t" " addl %edx, %eax\n\t" " .align 4, 0x90\n" "5: .byte 0xc4,0xe3,0x69,0x44,0xd9,0x11 # vpclmulqdq$17, %xmm1, %xmm2, %xmm3\n\t" " .byte 0xc4,0xe3,0x69,0x44,0xc9,0x00 # vpclmulqdq$0, %xmm1, %xmm2, %xmm1\n\t" " .byte 0xc5,0xf1,0xef,0x08 # vpxor (%eax), %xmm1, %xmm1\n\t" " .byte 0xc5,0xf1,0xef,0xcb # vpxor %xmm3, %xmm1, %xmm1\n\t" " addl $16, %eax\n\t" " decl %ecx\n\t" " jne 5b\n" "4: .byte 0xc4,0xe3,0x79,0x44,0xd1,0x01 # vpclmulqdq$1, %xmm1, %xmm0, %xmm2\n\t" " .byte 0xc5,0xf9,0xdb,0xda # vpand %xmm2, %xmm0, %xmm3\n\t" " .byte 0xc4,0xe3,0x79,0x44,0xc3,0x01 # vpclmulqdq$1, %xmm3, %xmm0, %xmm0\n\t" " .byte 0xc5,0xf9,0x7e,0xc0 # vmovd %xmm0, %eax\n\t" " .byte 0xc4,0xe3,0x79,0x16,0xc9,0x02 # vpextrd$2, %xmm1, %ecx\n\t" " xorl %eax, %ecx\n\t" " .byte 0xc4,0xe3,0x79,0x16,0xd0,0x01 # vpextrd$1, %xmm2, %eax\n\t" " xorl %ecx, %eax\n\t" " .byte 0xc4,0xe3,0x79,0x16,0xc2,0x01 # vpextrd$1, %xmm0, %edx\n\t" " .byte 0xc4,0xe3,0x79,0x16,0xc9,0x03 # vpextrd$3, %xmm1, %ecx\n\t" " xorl %edx, %ecx\n\t" " .byte 0xc4,0xe3,0x79,0x16,0xd2,0x02 # vpextrd$2, %xmm2, %edx\n\t" " xorl %ecx, %edx\n\t" " popl %esi\n\t" " popl %edi\n\t" " popl %ebp\n\t" " ret\n" ASM_SUFFIX ); # endif # else /* architecture type */ /* Not intel, not that the C intrinsics will compile anywhere else, * but it will be a slightly better error message. */ # define NO_ASM 1 # endif #endif /* NO_ASM */ #ifndef NO_ASM /* Declaration for use below. */ uint64_t kernel(uint32_t c, unsigned char * buf, int len_128bit, struct crc_by128_K * K); #else #pragma message("Compiling 'kernel' from C source with intrinsics") #include #include union u { __m128i v; struct { uint64_t lo; uint64_t hi; }; }; /** * Assume c is existing crc, * buf is 16-byte-aligned, * len is a multiple of 16 greater than zero. */ uint64_t kernel(uint32_t c, unsigned char * buf, int len_128bit, struct crc_by128_K * K) { __m128i * b = (__m128i *) buf; int i = 0; /* 128 bit constants and variables. */ __m128i K_544_480, K_160_96, K_M_64, x0, x1, x2, x3, x0a, x1a, x2a, x3a, x0b, x1b, x2b, x3b; /* Use these to move data between xmm registers and "normal" registers. 
     */
    union u ut0, ut1, ut2, ut3;

    K_544_480 = * (__m128i *) & (K -> xtt544);
    K_160_96  = * (__m128i *) & (K -> xtt160);
    K_M_64    = * (__m128i *) & (K -> mask);

    /* Incorporate the existing CRC into the first item. */
    ut0.v = b[0];
    ut0.lo ^= c;
    x0 = ut0.v;

    if (len_128bit >= 4) {
        /* Written as a slightly pipelined loop. */

        x1 = b[1];
        x2 = b[2];
        x3 = b[3];

        /* Iterate once if len_128bit is between 8 and 11
         *  4 < 8-3 < 11 - 3
         *  8 !< 11 - 3 < 12 - 3.
         *
         *  0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12
         */
        for (i = 4; i < len_128bit - 3; i += 4) {
            /* Each iteration of this loop folds the 512 bits of polynomial
             * in x0-x3 with the data in b[i]..b[i+3].
             */
            x0a = b[i];
            x1a = b[i+1];
            x2a = b[i+2];
            x3a = b[i+3];

            x0b = _mm_clmulepi64_si128(K_544_480, x0, 0x00);
            x0  = _mm_clmulepi64_si128(K_544_480, x0, 0x11);
            x1b = _mm_clmulepi64_si128(K_544_480, x1, 0x00);
            x1  = _mm_clmulepi64_si128(K_544_480, x1, 0x11);
            x2b = _mm_clmulepi64_si128(K_544_480, x2, 0x00);
            x2  = _mm_clmulepi64_si128(K_544_480, x2, 0x11);
            x3b = _mm_clmulepi64_si128(K_544_480, x3, 0x00);
            x3  = _mm_clmulepi64_si128(K_544_480, x3, 0x11);

            // x0 ^= x0a ^ x0b;
            x0 = _mm_xor_si128(x0, x0a);
            x0 = _mm_xor_si128(x0, x0b);
            // x1 ^= x1a ^ x1b;
            x1 = _mm_xor_si128(x1, x1a);
            x1 = _mm_xor_si128(x1, x1b);
            // x2 ^= x2a ^ x2b;
            x2 = _mm_xor_si128(x2, x2a);
            x2 = _mm_xor_si128(x2, x2b);
            // x3 ^= x3a ^ x3b;
            x3 = _mm_xor_si128(x3, x3a);
            x3 = _mm_xor_si128(x3, x3b);
        }
        /* x0 - x3 contain 4 x 128 bits of accumulated result.
         * 0-3 128-bit chunks potentially remain in the [i, len_128bit) entries.
         * Assume trailing bytes beyond that are handled by our caller.
         */
        x0a = _mm_clmulepi64_si128(K_160_96, x0, 0x00);
        x0b = _mm_clmulepi64_si128(K_160_96, x0, 0x11);
        x1 = _mm_xor_si128(x1, x0a);
        x1 = _mm_xor_si128(x1, x0b);
        x0a = _mm_clmulepi64_si128(K_160_96, x1, 0x00);
        x0b = _mm_clmulepi64_si128(K_160_96, x1, 0x11);
        x2 = _mm_xor_si128(x2, x0a);
        x2 = _mm_xor_si128(x2, x0b);
        x0a = _mm_clmulepi64_si128(K_160_96, x2, 0x00);
        x0b = _mm_clmulepi64_si128(K_160_96, x2, 0x11);
        x3 = _mm_xor_si128(x3, x0a);
        x3 = _mm_xor_si128(x3, x0b);
    } else {
        /* Loaded 128 bits already into x0. */
        x3 = x0;
        i = 1;
    }
    /* x3 now holds the 128-bit result.
     * Fold the 0-3 remaining 128-bit chunks into x3.
     */
    for (; i < len_128bit; i++) {
        x0 = b[i]; // data to fold
        // fold x3 down by 128 to align with the data.
        x0a = _mm_clmulepi64_si128(K_160_96, x3, 0x00);
        x0b = _mm_clmulepi64_si128(K_160_96, x3, 0x11);
        x3 = _mm_xor_si128(x0, x0a);
        x3 = _mm_xor_si128(x3, x0b);
        // x3 is now aligned with the data we just loaded.
    }
    /*
     * No more 128-bit chunks remain.
     * Fold x3 down into 32 bits.
     */
    {
        uint64_t w;
        uint64_t y;
        ut0.v = x3;
        y = ut0.hi; // 64 low-order terms of the polynomial into y.

        /* polynomial term order:
         *   high -> low
         * bit number order
         *   0 -> 127
         *
         * input, from which y was just extracted.
         * w0 w1 y0 y1
         * w0:w1 * x64 yields 96 bits.
         * p0:p1:p2:__ (aligned wrong, store to extract p1 and p2)
         * p0:p1:__:__ & ff:00:__:__ (mask to get rid of p1)
         * p0:00:__:__
         * p0:00 * x64 (times x64 yields 64 bits)
         * r0:r1 store and xor.
         */
        x0 = _mm_clmulepi64_si128(K_M_64, x3, 0x01);
        ut1.v = x0;
        w = (ut1.lo >> 32) + (ut1.hi << 32); // extract the low-poly 64 bits.
        x0 = _mm_and_si128(K_M_64, x0); // mask away what we just extracted.
        x0 = _mm_clmulepi64_si128(K_M_64, x0, 0x01);
        w ^= y;
        ut2.v = x0;
        w ^= ut2.lo;
        return w;
    }
}
#endif /* NO_ASM */
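
/*
 * Illustrative sketch only (never compiled, not part of the original
 * sources): a bit-at-a-time model of the 64 x 64 -> 128-bit carry-less
 * multiply that one lane of PCLMULQDQ / _mm_clmulepi64_si128 performs,
 * which is the primitive every fold above is built from.  The names
 * clmul128_ref and clmul64_ref are hypothetical and used nowhere else.
 */
#if 0
typedef struct { uint64_t lo; uint64_t hi; } clmul128_ref;

static clmul128_ref clmul64_ref(uint64_t a, uint64_t b) {
    clmul128_ref r = { 0, 0 };
    int i;
    for (i = 0; i < 64; i++) {
        if ((b >> i) & 1) {
            /* Carry-less multiply is the XOR of shifted copies of a. */
            r.lo ^= a << i;
            if (i > 0)
                r.hi ^= a >> (64 - i);
        }
    }
    return r;
}
#endif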

uint32_t fastcrc32(jint crc, Bytef * buf, jint len) {
    const unsigned long FAR * timesXtoThe32 = crc_table;
    intptr_t ibuf = (intptr_t) buf;
    int log_align = 4;
    int align = 1 << log_align;
    int mask = align - 1;
    int islop = (align - ibuf) & mask;
    uint32_t c = ~crc;
    int i = 0;

    if (len - islop >= align) {
        jint len_128bit;

        /* Handle bytes preceding 16-byte alignment. */
        for (i = 0; i < islop; i++) {
            uint32_t x0 = buf[i];
            x0 = timesXtoThe32[(x0 ^ c) & 0xFF];
            c = x0 ^ (c >> 8);
        }
        buf += i;
        len -= i;

        len_128bit = len >> log_align;
        if (len_128bit > 0) {
            uint64_t w = kernel(c, buf, len_128bit, K_struct);
            /*
             * 8 8-bit folds to compute 32-bit CRC.
             */
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            c = (uint32_t) w;
            i = len_128bit << log_align;
        } else {
            i = 0;
        }
    }
    /* Handle short CRC and tail of long CRC */
    for (; i < len; i++) {
        uint32_t x0 = buf[i];
        x0 = timesXtoThe32[(x0 ^ c) & 0xFF];
        c = x0 ^ (c >> 8);
    }
    return ~c;
}
#endif
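
/*
 * Sanity-check sketch for the folding constants defined near the top of
 * this file (illustrative only, never compiled, not part of the original
 * sources).  Assuming the standard CRC-32 generator polynomial
 * P(x) = 0x04C11DB7 (x**32 term implicit), x64/x96/x160/x480/x544 should
 * each be the bit-reflection of x**N mod P(x); the pre-shift by 1 is
 * applied where those constants are defined.  The helper names below
 * (xpow_mod_p, reflect32, check_fold_constants) are hypothetical.
 * fastcrc32 itself can be cross-checked against zlib's crc32 on
 * arbitrary buffers.
 */
#if 0
#include <assert.h>
#include <stdint.h>

/* x**n mod P(x) in conventional (non-reflected) bit order,
   with bit i holding the coefficient of x**i. */
static uint32_t xpow_mod_p(int n) {
    const uint32_t poly = 0x04C11DB7U;   /* x**32 term is implicit */
    uint32_t r = 1;                      /* the polynomial "1" */
    while (n-- > 0) {
        uint32_t carry = r & 0x80000000U;
        r <<= 1;
        if (carry)
            r ^= poly;                   /* reduce when degree reaches 32 */
    }
    return r;
}

static uint32_t reflect32(uint32_t v) {
    uint32_t r = 0;
    int i;
    for (i = 0; i < 32; i++)
        r |= ((v >> i) & 1U) << (31 - i);
    return r;
}

static void check_fold_constants(void) {
    assert(reflect32(xpow_mod_p(64))  == 0xb1e6b092U);
    assert(reflect32(xpow_mod_p(96))  == 0x6655004fU);
    assert(reflect32(xpow_mod_p(160)) == 0xba8ccbe8U);
    assert(reflect32(xpow_mod_p(480)) == 0xe3720acbU);
    assert(reflect32(xpow_mod_p(544)) == 0xaa2215eaU);
}
#endif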