/*
 * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

/*
 * Native method support for java.util.zip.CRC32
 */

#include "jni.h"
#include "jni_util.h"
#include <zlib.h>

#include "java_util_zip_CRC32.h"

/* Define CAN_COMPILE_CLMUL as 0 to disable fastcrc32 completely. */

#ifndef CAN_COMPILE_CLMUL
/* Windows is not supported -- the assembly language syntax differs,
   and though the newer compilers support the intrinsics, the generated
   code is not very good. */
# ifndef _WIN32
#  ifdef __x86_64
#   define CAN_COMPILE_CLMUL 1
#  elif defined(__i386)
#   define CAN_COMPILE_CLMUL 1
#  endif
# endif /* _WIN32 */
#endif

#if CAN_COMPILE_CLMUL
#include <stdint.h>
#include <stdlib.h>

struct crc_by128_K {
    /* The fields in this structure are arranged so that if it is
     * allocated at a 16-byte alignment they can be picked up two at
     * a time with 128-bit loads.
     *
     * Because of the flipped bit order used for this CRC polynomial,
     * the constant for X**N is left-shifted by 1.  This is because
     * a 64 x 64 polynomial multiply produces a 127-bit result
     * but the highest term is always aligned to bit 0 in the container.
     * Pre-shifting by one fixes this, at the cost of potentially making
     * the 32-bit constant no longer fit in a 32-bit container (thus the
     * use of uint64_t, though this is also the operand size used by the
     * carry-less multiply instruction).
     *
     * In addition, the flipped bit order and highest-term-at-lowest-bit
     * multiply change the constants used.  The 96-bit result will be
     * aligned to the high-term end of the target 128-bit container,
     * not the low-term end; that is, instead of a 512-bit or 576-bit fold,
     * it is a 480 (=512-32) or 544 (=512+64-32) bit fold.
     *
     * This causes additional problems in the 128-to-64-bit reduction; see
     * the code for details.  By storing a mask in the otherwise unused half
     * of a 128-bit constant, bits can be cleared before multiplication
     * without storing and reloading.  Note that staying on a 128-bit
     * datapath means that some data is uselessly stored and some unused
     * data is intersected with an irrelevant constant.
     */

    uint64_t mask;   /* low of K_M_64     */
    uint64_t xtt64;  /* high of K_M_64    */
    uint64_t xtt160; /* low of K_160_96   */
    uint64_t xtt96;  /* high of K_160_96  */
    uint64_t xtt544; /* low of K_544_480  */
    uint64_t xtt480; /* high of K_544_480 */
};

struct crc_by128_K * K_struct = 0;

static const uint64_t x64  = (uint64_t) 0xb1e6b092U << 1;
static const uint64_t x96  = (uint64_t) 0x6655004fU << 1;
static const uint64_t x160 = (uint64_t) 0xba8ccbe8U << 1;
static const uint64_t x480 = (uint64_t) 0xe3720acbU << 1;
static const uint64_t x544 = (uint64_t) 0xaa2215eaU << 1;
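
/* Illustrative only, excluded from compilation: a minimal sketch of how
 * constants of the form "X**N mod P, bit-reflected" can be derived for the
 * CRC-32 polynomial with a plain bit loop, as a cross-check of the 32-bit
 * values used for x64/x96/x160/x480/x544 above (before their left-shift
 * by 1).  The helper name xn_mod_poly_reflected is hypothetical and not
 * part of this file's interface; the loop starts from the reflected
 * representation of the polynomial 1 (0x80000000) and multiplies by X once
 * per iteration, folding any X**32 term back in with the reflected CRC-32
 * polynomial 0xEDB88320.
 */
#if 0
static uint32_t xn_mod_poly_reflected(int n) {
    uint32_t r = 0x80000000U;  /* reflected representation of the polynomial 1 */
    int i;
    for (i = 0; i < n; i++) {
        /* Multiply by X: shift right in reflected order; if an X**32 term
         * was produced (old bit 0 set), reduce it with the polynomial. */
        r = (r >> 1) ^ ((r & 1) ? 0xEDB88320U : 0);
    }
    return r;
}
/* xn_mod_poly_reflected(64), (96), (160), (480) and (544) should reproduce
 * 0xb1e6b092, 0x6655004f, 0xba8ccbe8, 0xe3720acb and 0xaa2215ea. */
#endif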

static struct crc_by128_K * init_crc_by128_K() {
    void * y;
    y = malloc(16 + sizeof(struct crc_by128_K));
    // posix_memalign not available on all platforms
    if (y == NULL) {
        return (struct crc_by128_K *) NULL;
    }
    /* Align by hand: round up to the next 16-byte boundary inside the
       over-allocated block. */
    uint8_t * z = (uint8_t *) y;
    uintptr_t p = (uintptr_t) z;
    z += (16 - p) & 15;
    struct crc_by128_K * x = (struct crc_by128_K *) z;
    x -> mask = 0xffffffffUL;
    x -> xtt64 = x64;
    x -> xtt160 = x160;
    x -> xtt96 = x96;
    x -> xtt544 = x544;
    x -> xtt480 = x480;
    return x;
}

uint32_t fastcrc32(jint crc, Bytef * buf, jint len);

/* Flag governing use of the "CLMUL" instruction.
   For now, it implies little-endian.
   Computed dynamically; it incorporates information about
   the current hardware and the compiler used to compile
   this file. */
static int useClmul = 0;
#else /* Cannot compile CLMUL */
/* Stub out fastcrc32 */
# define fastcrc32 crc32
# define useClmul 0
#endif

/* Local copy of the CRC32 table, used to fill and drain the CLMUL CRC.
   Extra members beyond the first 256-entry row are ignored. */
static const unsigned long FAR * crc_table;

/* Initialize the Java-side table (for small CRCs) to avoid extra startup
   work, and capture the platform-dependent useClmul flag.
 */
JNIEXPORT jboolean JNICALL
Java_java_util_zip_CRC32_init(JNIEnv *env, jclass cls, jarray b, jboolean use_clmul)
{
    /* Get the CRC table from zlib to initialize JNI.  Our private copy
       is missing if not compiled for fastcrc32. */
    jint *buf = (*env)->GetPrimitiveArrayCritical(env, b, 0);
    crc_table = get_crc_table();
    if (buf) {
        /* Don't know for sure how big an unsigned long is, therefore
           copy one entry at a time. */
        int i;
        for (i = 0; i < 256; i++) buf[i] = (jint) (crc_table[i]);
        (*env)->ReleasePrimitiveArrayCritical(env, b, buf, 0);
    }
#if CAN_COMPILE_CLMUL
    if (use_clmul) {
        K_struct = init_crc_by128_K();
        useClmul = K_struct != 0;
        /* Rather than throw OOME, just do without fast CRC. */
    }
#endif
    return useClmul;
}

JNIEXPORT jint JNICALL
Java_java_util_zip_CRC32_update(JNIEnv *env, jclass cls, jint crc, jint b)
{
    Bytef buf[1];

    buf[0] = (Bytef)b;
    return crc32(crc, buf, 1); // a single byte is not done quickly by fastcrc32
}

JNIEXPORT jint JNICALL
Java_java_util_zip_CRC32_updateBytes(JNIEnv *env, jclass cls, jint crc,
                                     jarray b, jint off, jint len)
{
    Bytef *buf = (*env)->GetPrimitiveArrayCritical(env, b, 0);
    if (buf) {
        crc = (jint) (useClmul ? fastcrc32(crc, buf + off, len) :
                                 crc32(crc, buf + off, len));
        (*env)->ReleasePrimitiveArrayCritical(env, b, buf, 0);
    }
    return crc;
}
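
/* Illustrative only, excluded from compilation: the single-byte and short
 * update paths above defer to zlib's table-driven crc32().  The sketch
 * below (with a hypothetical table argument standing in for zlib's internal
 * table) shows the one reflected table step such an update performs per
 * byte; the same step appears in fastcrc32() below, where it is used to
 * reach 16-byte alignment and to finish the tail.
 */
#if 0
static uint32_t crc32_one_byte(uint32_t crc, uint8_t byte,
                               const uint32_t table[256]) {
    uint32_t c = ~crc;                        /* pre-conditioning           */
    c = table[(c ^ byte) & 0xFF] ^ (c >> 8);  /* one reflected table lookup */
    return ~c;                                /* final XOR                  */
}
#endif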

JNIEXPORT jint ZIP_CRC32(jint crc, const jbyte *buf, jint len)
{
    return (jint) (useClmul ? fastcrc32(crc, (Bytef*)buf, len) :
                              crc32(crc, (Bytef*)buf, len));
}

JNIEXPORT jint JNICALL
Java_java_util_zip_CRC32_updateByteBuffer(JNIEnv *env, jclass cls, jint crc,
                                          jlong address, jint off, jint len)
{
    Bytef *buf = (Bytef *)jlong_to_ptr(address);
    if (buf) {
        crc = (jint) (useClmul ? fastcrc32(crc, buf + off, len) :
                                 crc32(crc, buf + off, len));
    }
    return crc;
}

#if CAN_COMPILE_CLMUL
#ifndef NO_ASM

/* set up the platform-specific glop surrounding the function body. */
# ifdef __x86_64
#  ifdef __APPLE__
#   define ASM_PREFIX ".text\n\t.align 8\n\t.globl _kernel\n_kernel:\n\t"
#   define ASM_SUFFIX ""
#  elif defined(__GNUC__)
#   define ASM_PREFIX ".text\n\t.align 16\n\t.globl kernel\n\t.type kernel,@function\nkernel:\n\t"
#   define ASM_SUFFIX ""
#  elif defined(__SUNPRO_C)
#   define ASM_PREFIX ".section .text,\"ax\"\n\t.align 16, 0x90\n\t.globl kernel\n\t.type kernel,@function\nkernel:\n\t"
#   define ASM_SUFFIX ".size kernel,.-kernel"
#  else
    /* Perhaps the mystery compiler can handle the intrinsics. */
#   define NO_ASM 1
#  endif

# ifndef NO_ASM
__asm__(
    ASM_PREFIX
    " pushq %rbp\n\t"
    " movq %rsp, %rbp\n\t"
    " movl %edi, %eax\n\t"
    " .byte 0xc5,0xf9,0x6f,0x06  # vmovdqa (%rsi), %xmm0\n\t"
    " .byte 0xc4,0xe1,0xf9,0x7e,0xc7  # vmovd %xmm0, %rdi\n\t"
    " xorq %rax, %rdi\n\t"
    " .byte 0xc4,0xe3,0xf9,0x22,0xd7,0x00  # vpinsrq $0, %rdi, %xmm0, %xmm2\n\t"
    " .byte 0xc5,0x79,0x6f,0x01  # vmovdqa (%rcx), %xmm8\n\t"
    " .byte 0xc5,0x79,0x6f,0x49,0x10  # vmovdqa 16(%rcx), %xmm9\n\t"
    " movl $1, %eax\n\t"
    " cmpl $4, %edx\n\t"
    " jl 1f\n\t"
    " .byte 0xc5,0xf9,0x6f,0x6e,0x10  # vmovdqa 16(%rsi), %xmm5\n\t"
    " .byte 0xc5,0xf9,0x6f,0x66,0x20  # vmovdqa 32(%rsi), %xmm4\n\t"
    " .byte 0xc5,0xf9,0x6f,0x5e,0x30  # vmovdqa 48(%rsi), %xmm3\n\t"
    " leal -3(%rdx), %edi\n\t"
    " movl $4, %eax\n\t"
    " cmpl $5, %edi\n\t"
    " jl 2f\n\t"
    " .byte 0xc5,0xf9,0x6f,0x71,0x20  # vmovdqa 32(%rcx), %xmm6\n\t"
    " leaq 112(%rsi), %rcx\n\t"
    " movl $4, %eax\n\t"
    " .align 4, 0x90\n"
    "3: .byte 0xc4,0xe3,0x49,0x44,0xc2,0x00  # vpclmulqdq $0, %xmm2, %xmm6, %xmm0\n\t"
    " .byte 0xc4,0xe3,0x49,0x44,0xcb,0x11  # vpclmulqdq $17, %xmm3, %xmm6, %xmm1\n\t"
    " .byte 0xc4,0xe3,0x49,0x44,0xdb,0x00  # vpclmulqdq $0, %xmm3, %xmm6, %xmm3\n\t"
    " .byte 0xc5,0xe1,0xef,0x19  # vpxor (%rcx), %xmm3, %xmm3\n\t"
    " .byte 0xc4,0xe3,0x49,0x44,0xfd,0x00  # vpclmulqdq $0, %xmm5, %xmm6, %xmm7\n\t"
    " .byte 0xc5,0xc1,0xef,0x79,0xe0  # vpxor -32(%rcx), %xmm7, %xmm7\n\t"
    " .byte 0xc5,0xf1,0xef,0xdb  # vpxor %xmm3, %xmm1, %xmm3\n\t"
    " .byte 0xc4,0xe3,0x49,0x44,0xd2,0x11  # vpclmulqdq $17, %xmm2, %xmm6, %xmm2\n\t"
    " .byte 0xc5,0xf9,0xef,0x41,0xd0  # vpxor -48(%rcx), %xmm0, %xmm0\n\t"
    " .byte 0xc4,0xe3,0x49,0x44,0xcd,0x11  # vpclmulqdq $17, %xmm5, %xmm6, %xmm1\n\t"
    " .byte 0xc4,0xe3,0x49,0x44,0xec,0x11  # vpclmulqdq $17, %xmm4, %xmm6, %xmm5\n\t"
    " .byte 0xc4,0xe3,0x49,0x44,0xe4,0x00  # vpclmulqdq $0, %xmm4, %xmm6, %xmm4\n\t"
    " .byte 0xc5,0xd9,0xef,0x61,0xf0  # vpxor -16(%rcx), %xmm4, %xmm4\n\t"
    " .byte 0xc5,0xd1,0xef,0xe4  # vpxor %xmm4, %xmm5, %xmm4\n\t"
    " .byte 0xc5,0xf1,0xef,0xef  # vpxor %xmm7, %xmm1, %xmm5\n\t"
    " .byte 0xc5,0xe9,0xef,0xd0  # vpxor %xmm0, %xmm2, %xmm2\n\t"
    " addq $64, %rcx\n\t"
    " addl $4, %eax\n\t"
    " cmpl %edi, %eax\n\t"
    " jl 3b\n"
    "2: .byte 0xc4,0xe3,0x31,0x44,0xc2,0x11  # vpclmulqdq $17, %xmm2, %xmm9, %xmm0\n\t"
    " .byte 0xc4,0xe3,0x31,0x44,0xca,0x00  # vpclmulqdq $0, %xmm2, %xmm9, %xmm1\n\t"
    " .byte 0xc5,0xd1,0xef,0xc9  # vpxor %xmm1, %xmm5, %xmm1\n\t"
    " .byte 0xc5,0xf1,0xef,0xc8  # vpxor %xmm0, %xmm1, %xmm1\n\t"
    " .byte 0xc4,0xe3,0x31,0x44,0xc1,0x11  # vpclmulqdq $17, %xmm1, %xmm9, %xmm0\n\t"
    " .byte 0xc4,0xe3,0x31,0x44,0xc9,0x00  # vpclmulqdq $0, %xmm1, %xmm9, %xmm1\n\t"
    " .byte 0xc5,0xd9,0xef,0xc9  # vpxor %xmm1, %xmm4, %xmm1\n\t"
    " .byte 0xc5,0xf1,0xef,0xc8  # vpxor %xmm0, %xmm1, %xmm1\n\t"
    " .byte 0xc4,0xe3,0x31,0x44,0xc1,0x11  # vpclmulqdq $17, %xmm1, %xmm9, %xmm0\n\t"
    " .byte 0xc4,0xe3,0x31,0x44,0xc9,0x00  # vpclmulqdq $0, %xmm1, %xmm9, %xmm1\n\t"
    " .byte 0xc5,0xe1,0xef,0xc9  # vpxor %xmm1, %xmm3, %xmm1\n\t"
    " .byte 0xc5,0xf1,0xef,0xd0  # vpxor %xmm0, %xmm1, %xmm2\n"
    "1: cmpl %edx, %eax\n\t"
    " jge 4f\n\t"
    " subl %eax, %edx\n\t"
    " movslq %eax, %rax\n\t"
    " shlq $4, %rax\n\t"
    " addq %rax, %rsi\n\t"
    " .align 4, 0x90\n"
    "5: .byte 0xc4,0xe3,0x31,0x44,0xc2,0x11  # vpclmulqdq $17, %xmm2, %xmm9, %xmm0\n\t"
    " .byte 0xc4,0xe3,0x31,0x44,0xca,0x00  # vpclmulqdq $0, %xmm2, %xmm9, %xmm1\n\t"
    " .byte 0xc5,0xf1,0xef,0x0e  # vpxor (%rsi), %xmm1, %xmm1\n\t"
    " .byte 0xc5,0xf1,0xef,0xd0  # vpxor %xmm0, %xmm1, %xmm2\n\t"
    " addq $16, %rsi\n\t"
    " decl %edx\n\t"
    " jne 5b\n"
    "4: .byte 0xc4,0xe3,0x39,0x44,0xc2,0x01  # vpclmulqdq $1, %xmm2, %xmm8, %xmm0\n\t"
    " .byte 0xc4,0xe1,0xf9,0x7e,0xc0  # vmovd %xmm0, %rax\n\t"
    " .byte 0xc4,0xe3,0xf9,0x16,0xc1,0x01  # vpextrq $1, %xmm0, %rcx\n\t"
    " shldq $32, %rax, %rcx\n\t"
    " .byte 0xc5,0xb9,0xdb,0xc0  # vpand %xmm0, %xmm8, %xmm0\n\t"
    " .byte 0xc4,0xe3,0x39,0x44,0xc0,0x01  # vpclmulqdq $1, %xmm0, %xmm8, %xmm0\n\t"
    " .byte 0xc4,0xe1,0xf9,0x7e,0xc2  # vmovd %xmm0, %rdx\n\t"
    " .byte 0xc4,0xe3,0xf9,0x16,0xd0,0x01  # vpextrq $1, %xmm2, %rax\n\t"
    " xorq %rdx, %rax\n\t"
    " xorq %rcx, %rax\n\t"
    " popq %rbp\n\t"
    " ret\n"
    ASM_SUFFIX
);
# endif
# elif defined(__i386)

/* set up the platform-specific glop surrounding the function body. */
#  ifdef __APPLE__
#   define ASM_PREFIX ".text\n\t.align 16\n\t.globl _kernel\n_kernel:\n\t"
#   define ASM_SUFFIX ""
#  elif defined(__GNUC__)
#   define ASM_PREFIX ".text\n\t.align 16\n\t.globl kernel\n\t.type kernel,@function\nkernel:\n\t"
#   define ASM_SUFFIX ""
#  elif defined(__SUNPRO_C)
#   define ASM_PREFIX ".section .text,\"ax\"\n\t.align 16, 0x90\n\t.globl kernel\n\t.type kernel,@function\nkernel:\n\t"
#   define ASM_SUFFIX ".size kernel,.-kernel"
#  else
    /* Perhaps the mystery compiler can handle the intrinsics. */
#   define NO_ASM 1
#  endif

# ifndef NO_ASM
__asm__(
    ASM_PREFIX
    " pushl %ebp\n\t"
    " movl %esp, %ebp\n\t"
    " pushl %edi\n\t"
    " pushl %esi\n\t"
    " movl 12(%ebp), %eax\n\t"
    " .byte 0xc5,0xf9,0x28,0x00  # vmovapd (%eax), %xmm0\n\t"
    " .byte 0xc5,0xf9,0x7e,0xc1  # vmovd %xmm0, %ecx\n\t"
    " xorl 8(%ebp), %ecx\n\t"
    " .byte 0xc4,0xe3,0x79,0x22,0xc9,0x00  # vpinsrd $0, %ecx, %xmm0, %xmm1\n\t"
    " .byte 0xc4,0xe3,0x79,0x16,0xc1,0x01  # vpextrd $1, %xmm0, %ecx\n\t"
    " .byte 0xc4,0xe3,0x71,0x22,0xc9,0x01  # vpinsrd $1, %ecx, %xmm1, %xmm1\n\t"
    " movl 20(%ebp), %edi\n\t"
    " .byte 0xc5,0xf9,0x6f,0x07  # vmovdqa (%edi), %xmm0\n\t"
    " .byte 0xc5,0xf9,0x6f,0x57,0x10  # vmovdqa 16(%edi), %xmm2\n\t"
    " movl $1, %edx\n\t"
    " movl 16(%ebp), %ecx\n\t"
    " cmpl $4, %ecx\n\t"
    " jl 1f\n\t"
    " .byte 0xc5,0xf9,0x6f,0x58,0x30  # vmovdqa 48(%eax), %xmm3\n\t"
    " .byte 0xc5,0xf9,0x6f,0x68,0x10  # vmovdqa 16(%eax), %xmm5\n\t"
    " .byte 0xc5,0xf9,0x6f,0x60,0x20  # vmovdqa 32(%eax), %xmm4\n\t"
    " leal -3(%ecx), %esi\n\t"
    " movl $4, %edx\n\t"
    " cmpl $5, %esi\n\t"
    " jl 2f\n\t"
    " .byte 0xc5,0xf9,0x6f,0x77,0x20  # vmovdqa 32(%edi), %xmm6\n\t"
    " leal 112(%eax), %edi\n\t"
    " movl $4, %edx\n\t"
    " .align 4, 0x90\n"
    "3: .byte 0xc4,0xe3,0x49,0x44,0xfb,0x11  # vpclmulqdq $17, %xmm3, %xmm6, %xmm7\n\t"
    " .byte 0xc4,0xe3,0x49,0x44,0xdb,0x00  # vpclmulqdq $0, %xmm3, %xmm6, %xmm3\n\t"
    " .byte 0xc5,0xe1,0xef,0x1f  # vpxor (%edi), %xmm3, %xmm3\n\t"
    " .byte 0xc5,0xc1,0xef,0xdb  # vpxor %xmm3, %xmm7, %xmm3\n\t"
    " .byte 0xc4,0xe3,0x49,0x44,0xfc,0x11  # vpclmulqdq $17, %xmm4, %xmm6, %xmm7\n\t"
    " .byte 0xc4,0xe3,0x49,0x44,0xe4,0x00  # vpclmulqdq $0, %xmm4, %xmm6, %xmm4\n\t"
    " .byte 0xc5,0xd9,0xef,0x67,0xf0  # vpxor -16(%edi), %xmm4, %xmm4\n\t"
    " .byte 0xc5,0xc1,0xef,0xe4  # vpxor %xmm4, %xmm7, %xmm4\n\t"
    " .byte 0xc4,0xe3,0x49,0x44,0xfd,0x11  # vpclmulqdq $17, %xmm5, %xmm6, %xmm7\n\t"
    " .byte 0xc4,0xe3,0x49,0x44,0xed,0x00  # vpclmulqdq $0, %xmm5, %xmm6, %xmm5\n\t"
    " .byte 0xc5,0xd1,0xef,0x6f,0xe0  # vpxor -32(%edi), %xmm5, %xmm5\n\t"
    " .byte 0xc5,0xc1,0xef,0xed  # vpxor %xmm5, %xmm7, %xmm5\n\t"
    " .byte 0xc4,0xe3,0x49,0x44,0xf9,0x11  # vpclmulqdq $17, %xmm1, %xmm6, %xmm7\n\t"
    " .byte 0xc4,0xe3,0x49,0x44,0xc9,0x00  # vpclmulqdq $0, %xmm1, %xmm6, %xmm1\n\t"
    " .byte 0xc5,0xf1,0xef,0x4f,0xd0  # vpxor -48(%edi), %xmm1, %xmm1\n\t"
    " .byte 0xc5,0xc1,0xef,0xc9  # vpxor %xmm1, %xmm7, %xmm1\n\t"
    " addl $64, %edi\n\t"
    " addl $4, %edx\n\t"
    " cmpl %esi, %edx\n\t"
    " jl 3b\n"
    "2: .byte 0xc4,0xe3,0x69,0x44,0xf1,0x11  # vpclmulqdq $17, %xmm1, %xmm2, %xmm6\n\t"
    " .byte 0xc4,0xe3,0x69,0x44,0xc9,0x00  # vpclmulqdq $0, %xmm1, %xmm2, %xmm1\n\t"
    " .byte 0xc5,0xd1,0xef,0xc9  # vpxor %xmm1, %xmm5, %xmm1\n\t"
    " .byte 0xc5,0xf1,0xef,0xee  # vpxor %xmm6, %xmm1, %xmm5\n\t"
    " .byte 0xc4,0xe3,0x69,0x44,0xcd,0x11  # vpclmulqdq $17, %xmm5, %xmm2, %xmm1\n\t"
    " .byte 0xc4,0xe3,0x69,0x44,0xed,0x00  # vpclmulqdq $0, %xmm5, %xmm2, %xmm5\n\t"
    " .byte 0xc5,0xd9,0xef,0xe5  # vpxor %xmm5, %xmm4, %xmm4\n\t"
    " .byte 0xc5,0xd9,0xef,0xe1  # vpxor %xmm1, %xmm4, %xmm4\n\t"
    " .byte 0xc4,0xe3,0x69,0x44,0xcc,0x11  # vpclmulqdq $17, %xmm4, %xmm2, %xmm1\n\t"
    " .byte 0xc4,0xe3,0x69,0x44,0xe4,0x00  # vpclmulqdq $0, %xmm4, %xmm2, %xmm4\n\t"
    " .byte 0xc5,0xe1,0xef,0xdc  # vpxor %xmm4, %xmm3, %xmm3\n\t"
    " .byte 0xc5,0xe1,0xef,0xc9  # vpxor %xmm1, %xmm3, %xmm1\n"
    "1: cmpl %ecx, %edx\n\t"
    " jge 4f\n\t"
    " subl %edx, %ecx\n\t"
    " shll $4, %edx\n\t"
    " addl %edx, %eax\n\t"
    " .align 4, 0x90\n"
    "5: .byte 0xc4,0xe3,0x69,0x44,0xd9,0x11  # vpclmulqdq $17, %xmm1, %xmm2, %xmm3\n\t"
    " .byte 0xc4,0xe3,0x69,0x44,0xc9,0x00  # vpclmulqdq $0, %xmm1, %xmm2, %xmm1\n\t"
    " .byte 0xc5,0xf1,0xef,0x08  # vpxor (%eax), %xmm1, %xmm1\n\t"
    " .byte 0xc5,0xf1,0xef,0xcb  # vpxor %xmm3, %xmm1, %xmm1\n\t"
    " addl $16, %eax\n\t"
    " decl %ecx\n\t"
    " jne 5b\n"
    "4: .byte 0xc4,0xe3,0x79,0x44,0xd1,0x01  # vpclmulqdq $1, %xmm1, %xmm0, %xmm2\n\t"
    " .byte 0xc5,0xf9,0xdb,0xda  # vpand %xmm2, %xmm0, %xmm3\n\t"
    " .byte 0xc4,0xe3,0x79,0x44,0xc3,0x01  # vpclmulqdq $1, %xmm3, %xmm0, %xmm0\n\t"
    " .byte 0xc5,0xf9,0x7e,0xc0  # vmovd %xmm0, %eax\n\t"
    " .byte 0xc4,0xe3,0x79,0x16,0xc9,0x02  # vpextrd $2, %xmm1, %ecx\n\t"
    " xorl %eax, %ecx\n\t"
    " .byte 0xc4,0xe3,0x79,0x16,0xd0,0x01  # vpextrd $1, %xmm2, %eax\n\t"
    " xorl %ecx, %eax\n\t"
    " .byte 0xc4,0xe3,0x79,0x16,0xc2,0x01  # vpextrd $1, %xmm0, %edx\n\t"
    " .byte 0xc4,0xe3,0x79,0x16,0xc9,0x03  # vpextrd $3, %xmm1, %ecx\n\t"
    " xorl %edx, %ecx\n\t"
    " .byte 0xc4,0xe3,0x79,0x16,0xd2,0x02  # vpextrd $2, %xmm2, %edx\n\t"
    " xorl %ecx, %edx\n\t"
    " popl %esi\n\t"
    " popl %edi\n\t"
    " popl %ebp\n\t"
    " ret\n"
    ASM_SUFFIX
);
# endif
# else /* architecture type */
    /* Not Intel -- not that the C intrinsics will compile anywhere else,
     * but this gives a slightly better error message.
     */
#  define NO_ASM 1
# endif
#endif /* NO_ASM */

#ifndef NO_ASM
/* Declaration for use below. */
uint64_t kernel(uint32_t c, unsigned char * buf, int len_128bit, struct crc_by128_K * K);
#else
#pragma message("Compiling 'kernel' from C source with intrinsics")
#include <wmmintrin.h>
#include <emmintrin.h>

union u {
    __m128i v;
    struct {
        uint64_t lo;
        uint64_t hi;
    };
};

/**
 * Assume c is the existing CRC,
 * buf is 16-byte aligned, and
 * len_128bit is the number of 16-byte blocks to process, greater than zero.
 */
uint64_t kernel(uint32_t c, unsigned char * buf, int len_128bit,
                struct crc_by128_K * K) {

    __m128i * b = (__m128i *) buf;
    int i = 0;

    /* 128-bit constants and variables. */
    __m128i K_544_480, K_160_96, K_M_64,
        x0, x1, x2, x3,
        x0a, x1a, x2a, x3a,
        x0b, x1b, x2b, x3b;

    /* Use these to move data between xmm registers and "normal" registers. */
    union u ut0, ut1, ut2, ut3;

    K_544_480 = * (__m128i *) & (K -> xtt544);
    K_160_96  = * (__m128i *) & (K -> xtt160);
    K_M_64    = * (__m128i *) & (K -> mask);

    /* Incorporate the existing CRC into the first item. */
    ut0.v = b[0];
    ut0.lo ^= c;
    x0 = ut0.v;

    if (len_128bit >= 4) {
        /* Written as a slightly pipelined loop. */

        x1 = b[1];
        x2 = b[2];
        x3 = b[3];

        /* Iterate once if len_128bit is between 8 and 11
         * 4 < 8-3 < 11 - 3
         * 8 !< 11 - 3 < 12 - 3.
         *
         * 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12
         *
         */
        for (i = 4; i < len_128bit - 3 ; i += 4) {
            /* Each iteration of this loop folds the 512 bits of polynomial
             * in x0-x3 with the data in b[i]..b[i+3].
             */
            x0a = b[i];
            x1a = b[i+1];
            x2a = b[i+2];
            x3a = b[i+3];

            x0b = _mm_clmulepi64_si128(K_544_480, x0, 0x00);
            x0 = _mm_clmulepi64_si128(K_544_480, x0, 0x11);
            x1b = _mm_clmulepi64_si128(K_544_480, x1, 0x00);
            x1 = _mm_clmulepi64_si128(K_544_480, x1, 0x11);

            x2b = _mm_clmulepi64_si128(K_544_480, x2, 0x00);
            x2 = _mm_clmulepi64_si128(K_544_480, x2, 0x11);
            x3b = _mm_clmulepi64_si128(K_544_480, x3, 0x00);
            x3 = _mm_clmulepi64_si128(K_544_480, x3, 0x11);

            // x0 ^= x0a ^ x0b;
            x0 = _mm_xor_si128(x0, x0a);
            x0 = _mm_xor_si128(x0, x0b);
            // x1 ^= x1a ^ x1b;
            x1 = _mm_xor_si128(x1, x1a);
            x1 = _mm_xor_si128(x1, x1b);
            // x2 ^= x2a ^ x2b;
            x2 = _mm_xor_si128(x2, x2a);
            x2 = _mm_xor_si128(x2, x2b);
            // x3 ^= x3a ^ x3b;
            x3 = _mm_xor_si128(x3, x3a);
            x3 = _mm_xor_si128(x3, x3b);
        }
        /* x0 - x3 contain 4 x 128 bits of accumulated result.
         * 0-3 128-bit chunks potentially remain in the [i,len_128bit) entries.
         * Assume trailing bytes beyond that are handled by our caller.
         */
        x0a = _mm_clmulepi64_si128(K_160_96, x0, 0x00);
        x0b = _mm_clmulepi64_si128(K_160_96, x0, 0x11);
        x1 = _mm_xor_si128(x1, x0a);
        x1 = _mm_xor_si128(x1, x0b);
        x0a = _mm_clmulepi64_si128(K_160_96, x1, 0x00);
        x0b = _mm_clmulepi64_si128(K_160_96, x1, 0x11);
        x2 = _mm_xor_si128(x2, x0a);
        x2 = _mm_xor_si128(x2, x0b);
        x0a = _mm_clmulepi64_si128(K_160_96, x2, 0x00);
        x0b = _mm_clmulepi64_si128(K_160_96, x2, 0x11);
        x3 = _mm_xor_si128(x3, x0a);
        x3 = _mm_xor_si128(x3, x0b);
    } else {
        /* Loaded 128 bits already into x0.
         */
        x3 = x0;
        i = 1;
    }

    /* x3 is now the 128-bit result.
     * Fold 0-3 remaining 128-bit chunks into x3.
     */
    for (; i < len_128bit; i++) {
        x0 = b[i]; // data to fold
        // fold x3 down by 128 to align with the data.
        x0a = _mm_clmulepi64_si128(K_160_96, x3, 0x00);
        x0b = _mm_clmulepi64_si128(K_160_96, x3, 0x11);
        x3 = _mm_xor_si128(x0, x0a);
        x3 = _mm_xor_si128(x3, x0b);
        // x3 is now aligned with the data we just loaded.
    }

    /*
     * No more 128-bit chunks remain.
     * Fold x3 down into 32 bits.
     */
    {
        uint64_t w;
        uint64_t y;
        ut0.v = x3;
        y = ut0.hi; // 64 low-order terms of the polynomial into y.

        /* polynomial term order:
         * high -> low
         * bit number order
         * 0 -> 127
         *
         * input, from which y was just extracted.
         * w0 w1 y0 y1
         * w0:w1 * x64 yields 96 bits.
         * p0:p1:p2:__ (aligned wrong, store to extract p1 and p2)
         * p0:p1:__:__ & ff:00:__:__ (mask to get rid of p1)
         * p0:00:__:__
         * p0:00 * x64 (times x64 yields 64 bits)
         * r0:r1 store and xor.
         */

        x0 = _mm_clmulepi64_si128(K_M_64, x3, 0x01);
        ut1.v = x0;
        w = (ut1.lo >> 32) + (ut1.hi << 32); // extract low-poly 64 bits.
        x0 = _mm_and_si128(K_M_64, x0); // mask away what we just extracted.
        x0 = _mm_clmulepi64_si128(K_M_64, x0, 0x01);
        w ^= y;
        ut2.v = x0;
        w ^= ut2.lo;

        return w;
    }
}
#endif /* NO_ASM */

uint32_t fastcrc32(jint crc, Bytef * buf, jint len) {
    const unsigned long FAR * timesXtoThe32 = crc_table;
    intptr_t ibuf = (intptr_t) buf;
    int log_align = 4;
    int align = 1 << log_align;
    int mask = align - 1;
    int islop = (align - ibuf) & mask; /* bytes needed to reach 16-byte alignment */
    uint32_t c = ~crc;
    int i = 0;

    if (len - islop >= align) {
        jint len_128bit;
        /* Handle the bytes preceding 16-byte alignment. */
        for (i = 0; i < islop; i++) {
            uint32_t x0 = buf[i];
            x0 = timesXtoThe32[(x0 ^ c) & 0xFF];
            c = x0 ^ (c >> 8);
        }
        buf += i;
        len -= i;

        len_128bit = len >> log_align;

        if (len_128bit > 0) {
            uint64_t w = kernel(c, buf, len_128bit, K_struct);
            /*
             * 8 8-bit folds to compute the 32-bit CRC.
             */
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            c = (uint32_t) w;
            i = len_128bit << log_align;
        } else {
            i = 0;
        }
    }
    /* Handle a short CRC and the tail of a long CRC. */
    for (; i < len; i++) {
        uint32_t x0 = buf[i];
        x0 = timesXtoThe32[(x0 ^ c) & 0xFF];
        c = x0 ^ (c >> 8);
    }
    return ~c;
}
#endif /* CAN_COMPILE_CLMUL */
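
/* Illustrative only, excluded from compilation: a minimal consistency-check
 * sketch comparing fastcrc32() with zlib's crc32() on one buffer.  It is not
 * part of the JDK build; it assumes it can see this file's static helpers
 * (i.e. it would be appended here), that CAN_COMPILE_CLMUL is in effect, and
 * that the host CPU supports the carry-less multiply instruction.
 */
#if 0
#include <stdio.h>

int main(void) {
    unsigned char data[1000];
    size_t i;
    uLong zcrc;
    uint32_t fcrc;

    for (i = 0; i < sizeof data; i++) data[i] = (unsigned char) (i * 31 + 7);

    crc_table = get_crc_table();    /* table used by the head/tail loops */
    K_struct  = init_crc_by128_K(); /* 16-byte-aligned folding constants */

    zcrc = crc32(0L, data, (uInt) sizeof data);
    fcrc = fastcrc32(0, data, (jint) sizeof data);

    printf("zlib=%08lx fast=%08x %s\n", (unsigned long) zcrc, (unsigned) fcrc,
           (uint32_t) zcrc == fcrc ? "OK" : "MISMATCH");
    return 0;
}
#endif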