/*
 * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

/*
 * Native method support for java.util.zip.CRC32
 */

#include "jni.h"
#include "jni_util.h"
#include <zlib.h>

#include "java_util_zip_CRC32.h"

/* define CAN_COMPILE_CLMUL 0 to disable fastcrc32 completely. */

#ifndef CAN_COMPILE_CLMUL
# ifdef __x86_64
#  define CAN_COMPILE_CLMUL 1
# elif defined(__i386)
#  define CAN_COMPILE_CLMUL 1
# endif
#endif

#if CAN_COMPILE_CLMUL
#include <stdint.h>
#include <stdlib.h>

struct crc_by128_K {
    /* The fields in this structure are arranged so that if it is
     * allocated at a 16-byte alignment they can be picked up two at
     * a time with 128-bit loads.
     *
     * Because of the flipped bit order used for this CRC polynomial,
     * the constant for X**N is left-shifted by 1.  This is because
     * a 64 x 64 polynomial multiply produces a 127-bit result
     * but the highest term is always aligned to bit 0 in the container.
     * Pre-shifting by one fixes this, at the cost of potentially making
     * the 32-bit constant no longer fit in a 32-bit container (thus the
     * use of uint64_t, though this is also the size used by the carry-
     * less multiply instruction).
     *
     * In addition, the flipped bit order and highest-term-at-lowest-bit
     * multiply change the constants used.  The 96-bit result will be
     * aligned to the high-term end of the target 128-bit container,
     * not the low-term end; that is, instead of a 512-bit or 576-bit fold,
     * it is a 480 (=512-32) or 544 (=512+64-32) bit fold.
     *
     * This causes additional problems in the 128-to-64-bit reduction; see the
     * code for details.  By storing a mask in the otherwise unused half of
     * a 128-bit constant, bits can be cleared before multiplication without
     * storing and reloading.  Note that staying on a 128-bit datapath means
     * that some data is uselessly stored and some unused data is intersected
     * with an irrelevant constant.
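     *
     * (For example, the X**64 constant below is stored pre-shifted as
     * (uint64_t) 0xb1e6b092U << 1 rather than as the bare 32-bit value.)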
     */

    uint64_t mask;   /* low of K_M_64 */
    uint64_t xtt64;  /* high of K_M_64 */
    uint64_t xtt160; /* low of K_160_96 */
    uint64_t xtt96;  /* high of K_160_96 */
    uint64_t xtt544; /* low of K_544_480 */
    uint64_t xtt480; /* high of K_544_480 */
};

struct crc_by128_K * K_struct = 0;

static const uint64_t x64 = (uint64_t) 0xb1e6b092U << 1;
static const uint64_t x96 = (uint64_t) 0x6655004fU << 1;
static const uint64_t x160 = (uint64_t) 0xba8ccbe8U << 1;
static const uint64_t x480 = (uint64_t) 0xe3720acbU << 1;
static const uint64_t x544 = (uint64_t) 0xaa2215eaU << 1;

static struct crc_by128_K * init_crc_by128_K() {
    void * y;
    int rc = posix_memalign( & y, 16, sizeof(struct crc_by128_K));
    if (rc) {
        return (struct crc_by128_K *) NULL;
    } else {
        struct crc_by128_K * x = y;
        x -> mask = 0xffffffffUL;
        x -> xtt64 = x64;
        x -> xtt160 = x160;
        x -> xtt96 = x96;
        x -> xtt544 = x544;
        x -> xtt480 = x480;
        return x;
    }
}

uint32_t fastcrc32(jint crc, Bytef * buf, jint len);

/* Flag governing use of "CLMUL" instruction.
   For now, implies little-endian.
   Computed dynamically, incorporates information about
   the current hardware and the compiler used to compile
   this file. */
static int useClmul = 0;
#else
/* Stub out fastcrc32 */
# define fastcrc32 crc32
# define useClmul 0
#endif


/* Local copy of CRC32 table is used to fill and drain CLMUL CRC.
   Extra members beyond the first 256-entry row are ignored. */
static const unsigned long FAR * crc_table;

/* Initialize java-side table (for small CRCs) to avoid extra startup work,
   and capture the platform-dependent useClmul flag.
*/
JNIEXPORT jboolean JNICALL
Java_java_util_zip_CRC32_init(JNIEnv *env, jclass cls, jarray b, jboolean use_clmul)
{
    /* Get the CRC table from zip to initialize JNI.  Our private copy
       is missing if not compiled for fastcrc32. */
    crc_table = get_crc_table();
    jint *buf = (*env)->GetPrimitiveArrayCritical(env, b, 0);
    if (buf) {
        /* Don't know for sure how big an unsigned long is, therefore
           copy one at a time. */
        int i;
        for (i = 0; i < 256; i++) buf[i] = (jint) (crc_table[i]);
        (*env)->ReleasePrimitiveArrayCritical(env, b, buf, 0);
    }
#if CAN_COMPILE_CLMUL
    if (use_clmul) {
        K_struct = init_crc_by128_K();
        useClmul = K_struct != 0;
        /* Rather than throw OOME, just do without fast CRC. */
    }
#endif
    return useClmul;
}

JNIEXPORT jint JNICALL
Java_java_util_zip_CRC32_update(JNIEnv *env, jclass cls, jint crc, jint b)
{
    Bytef buf[1];

    buf[0] = (Bytef)b;
    return crc32(crc, buf, 1); // single byte not done quickly by fastcrc32
}

JNIEXPORT jint JNICALL
Java_java_util_zip_CRC32_updateBytes(JNIEnv *env, jclass cls, jint crc,
                                     jarray b, jint off, jint len)
{
    Bytef *buf = (*env)->GetPrimitiveArrayCritical(env, b, 0);
    if (buf) {
        crc = (jint) (useClmul ? fastcrc32(crc, buf + off, len) :
                                 crc32(crc, buf + off, len));
        (*env)->ReleasePrimitiveArrayCritical(env, b, buf, 0);
    }
    return crc;
}

JNIEXPORT jint ZIP_CRC32(jint crc, const jbyte *buf, jint len)
{
    return (jint) (useClmul ?
                   fastcrc32(crc, (Bytef*)buf, len) :
                   crc32(crc, (Bytef*)buf, len));
}

JNIEXPORT jint JNICALL
Java_java_util_zip_CRC32_updateByteBuffer(JNIEnv *env, jclass cls, jint crc,
                                          jlong address, jint off, jint len)
{
    Bytef *buf = (Bytef *)jlong_to_ptr(address);
    if (buf) {
        crc = (jint) (useClmul ? fastcrc32(crc, buf + off, len) :
                                 crc32(crc, buf + off, len));
    }
    return crc;
}

#if CAN_COMPILE_CLMUL
#ifndef NO_ASM

/* set up the platform-specific glop surrounding the function body. */
# ifdef __x86_64
#  ifdef __APPLE__
#   define ASM_PREFIX ".text\n\t.align 8\n\t.globl _kernel\n_kernel:\n\t"
#   define ASM_SUFFIX ""
#  elif defined(__GNUC__)
#   define ASM_PREFIX ".text\n\t.align 16\n\t.globl kernel\n\t.type kernel,@function\nkernel:\n\t"
#   define ASM_SUFFIX ""
#  elif defined(__SUNPRO_C)
#   define ASM_PREFIX ".section .text,\"ax\"\n\t.align 16, 0x90\n\t.globl kernel\n\t.type kernel,@function\nkernel:\n\t"
#   define ASM_SUFFIX ".size kernel,.-kernel"
#  else
    /* Perhaps the mystery compiler can handle the intrinsics. */
#   define NO_ASM 1
#  endif

#  ifndef NO_ASM
__asm__(
ASM_PREFIX
" pushq %rbp\n\t"
" movq %rsp, %rbp\n\t"
" movl %edi, %eax\n\t"
" .byte 0xc5,0xf9,0x6f,0x06 # vmovdqa (%rsi), %xmm0\n\t"
" .byte 0xc4,0xe1,0xf9,0x7e,0xc7 # vmovd %xmm0, %rdi\n\t"
" xorq %rax, %rdi\n\t"
" .byte 0xc4,0xe3,0xf9,0x22,0xd7,0x00 # vpinsrq $0, %rdi, %xmm0, %xmm2\n\t"
" .byte 0xc5,0x79,0x6f,0x01 # vmovdqa (%rcx), %xmm8\n\t"
" .byte 0xc5,0x79,0x6f,0x49,0x10 # vmovdqa 16(%rcx), %xmm9\n\t"
" movl $1, %eax\n\t"
" cmpl $4, %edx\n\t"
" jl 1f\n\t"
" .byte 0xc5,0xf9,0x6f,0x6e,0x10 # vmovdqa 16(%rsi), %xmm5\n\t"
" .byte 0xc5,0xf9,0x6f,0x66,0x20 # vmovdqa 32(%rsi), %xmm4\n\t"
" .byte 0xc5,0xf9,0x6f,0x5e,0x30 # vmovdqa 48(%rsi), %xmm3\n\t"
" leal -3(%rdx), %edi\n\t"
" movl $4, %eax\n\t"
" cmpl $5, %edi\n\t"
" jl 2f\n\t"
" .byte 0xc5,0xf9,0x6f,0x71,0x20 # vmovdqa 32(%rcx), %xmm6\n\t"
" leaq 112(%rsi), %rcx\n\t"
" movl $4, %eax\n\t"
" .align 4, 0x90\n"
"3: .byte 0xc4,0xe3,0x49,0x44,0xc2,0x00 # vpclmulqdq $0, %xmm2, %xmm6, %xmm0\n\t"
" .byte 0xc4,0xe3,0x49,0x44,0xcb,0x11 # vpclmulqdq $17, %xmm3, %xmm6, %xmm1\n\t"
" .byte 0xc4,0xe3,0x49,0x44,0xdb,0x00 # vpclmulqdq $0, %xmm3, %xmm6, %xmm3\n\t"
" .byte 0xc5,0xe1,0xef,0x19 # vpxor (%rcx), %xmm3, %xmm3\n\t"
" .byte 0xc4,0xe3,0x49,0x44,0xfd,0x00 # vpclmulqdq $0, %xmm5, %xmm6, %xmm7\n\t"
" .byte 0xc5,0xc1,0xef,0x79,0xe0 # vpxor -32(%rcx), %xmm7, %xmm7\n\t"
" .byte 0xc5,0xf1,0xef,0xdb # vpxor %xmm3, %xmm1, %xmm3\n\t"
" .byte 0xc4,0xe3,0x49,0x44,0xd2,0x11 # vpclmulqdq $17, %xmm2, %xmm6, %xmm2\n\t"
" .byte 0xc5,0xf9,0xef,0x41,0xd0 # vpxor -48(%rcx), %xmm0, %xmm0\n\t"
" .byte 0xc4,0xe3,0x49,0x44,0xcd,0x11 # vpclmulqdq $17, %xmm5, %xmm6, %xmm1\n\t"
" .byte 0xc4,0xe3,0x49,0x44,0xec,0x11 # vpclmulqdq $17, %xmm4, %xmm6, %xmm5\n\t"
" .byte 0xc4,0xe3,0x49,0x44,0xe4,0x00 # vpclmulqdq $0, %xmm4, %xmm6, %xmm4\n\t"
" .byte 0xc5,0xd9,0xef,0x61,0xf0 # vpxor -16(%rcx), %xmm4, %xmm4\n\t"
" .byte 0xc5,0xd1,0xef,0xe4 # vpxor %xmm4, %xmm5, %xmm4\n\t"
" .byte 0xc5,0xf1,0xef,0xef # vpxor %xmm7, %xmm1, %xmm5\n\t"
" .byte 0xc5,0xe9,0xef,0xd0 # vpxor %xmm0, %xmm2, %xmm2\n\t"
" addq $64, %rcx\n\t"
" addl $4, %eax\n\t"
" cmpl %edi, %eax\n\t"
" jl 3b\n"
"2: .byte 0xc4,0xe3,0x31,0x44,0xc2,0x11 # vpclmulqdq $17, %xmm2, %xmm9, %xmm0\n\t"
" .byte 0xc4,0xe3,0x31,0x44,0xca,0x00 # vpclmulqdq $0, %xmm2, %xmm9, %xmm1\n\t"
" .byte 0xc5,0xd1,0xef,0xc9 # vpxor %xmm1, %xmm5, %xmm1\n\t"
" .byte 0xc5,0xf1,0xef,0xc8 # vpxor %xmm0, %xmm1, %xmm1\n\t"
" .byte 0xc4,0xe3,0x31,0x44,0xc1,0x11 # vpclmulqdq $17, %xmm1, %xmm9, %xmm0\n\t"
" .byte 0xc4,0xe3,0x31,0x44,0xc9,0x00 # vpclmulqdq $0, %xmm1, %xmm9, %xmm1\n\t"
" .byte 0xc5,0xd9,0xef,0xc9 # vpxor %xmm1, %xmm4, %xmm1\n\t"
" .byte 0xc5,0xf1,0xef,0xc8 # vpxor %xmm0, %xmm1, %xmm1\n\t"
" .byte 0xc4,0xe3,0x31,0x44,0xc1,0x11 # vpclmulqdq $17, %xmm1, %xmm9, %xmm0\n\t"
" .byte 0xc4,0xe3,0x31,0x44,0xc9,0x00 # vpclmulqdq $0, %xmm1, %xmm9, %xmm1\n\t"
" .byte 0xc5,0xe1,0xef,0xc9 # vpxor %xmm1, %xmm3, %xmm1\n\t"
" .byte 0xc5,0xf1,0xef,0xd0 # vpxor %xmm0, %xmm1, %xmm2\n"
"1: cmpl %edx, %eax\n\t"
" jge 4f\n\t"
" subl %eax, %edx\n\t"
" movslq %eax, %rax\n\t"
" shlq $4, %rax\n\t"
" addq %rax, %rsi\n\t"
" .align 4, 0x90\n"
"5: .byte 0xc4,0xe3,0x31,0x44,0xc2,0x11 # vpclmulqdq $17, %xmm2, %xmm9, %xmm0\n\t"
" .byte 0xc4,0xe3,0x31,0x44,0xca,0x00 # vpclmulqdq $0, %xmm2, %xmm9, %xmm1\n\t"
" .byte 0xc5,0xf1,0xef,0x0e # vpxor (%rsi), %xmm1, %xmm1\n\t"
" .byte 0xc5,0xf1,0xef,0xd0 # vpxor %xmm0, %xmm1, %xmm2\n\t"
" addq $16, %rsi\n\t"
" decl %edx\n\t"
" jne 5b\n"
"4: .byte 0xc4,0xe3,0x39,0x44,0xc2,0x01 # vpclmulqdq $1, %xmm2, %xmm8, %xmm0\n\t"
" .byte 0xc4,0xe1,0xf9,0x7e,0xc0 # vmovd %xmm0, %rax\n\t"
" .byte 0xc4,0xe3,0xf9,0x16,0xc1,0x01 # vpextrq $1, %xmm0, %rcx\n\t"
" shldq $32, %rax, %rcx\n\t"
" .byte 0xc5,0xb9,0xdb,0xc0 # vpand %xmm0, %xmm8, %xmm0\n\t"
" .byte 0xc4,0xe3,0x39,0x44,0xc0,0x01 # vpclmulqdq $1, %xmm0, %xmm8, %xmm0\n\t"
" .byte 0xc4,0xe1,0xf9,0x7e,0xc2 # vmovd %xmm0, %rdx\n\t"
" .byte 0xc4,0xe3,0xf9,0x16,0xd0,0x01 # vpextrq $1, %xmm2, %rax\n\t"
" xorq %rdx, %rax\n\t"
" xorq %rcx, %rax\n\t"
" popq %rbp\n\t"
" ret\n"
ASM_SUFFIX
);
#  endif
# elif defined(__i386)

/* set up the platform-specific glop surrounding the function body. */
#  ifdef __APPLE__
#   define ASM_PREFIX ".text\n\t.align 16\n\t.globl _kernel\n_kernel:\n\t"
#   define ASM_SUFFIX ""
#  elif defined(__GNUC__)
#   define ASM_PREFIX ".text\n\t.align 16\n\t.globl kernel\n\t.type kernel,@function\nkernel:\n\t"
#   define ASM_SUFFIX ""
#  elif defined(__SUNPRO_C)
#   define ASM_PREFIX ".section .text,\"ax\"\n\t.align 16, 0x90\n\t.globl kernel\n\t.type kernel,@function\nkernel:\n\t"
#   define ASM_SUFFIX ".size kernel,.-kernel"
#  else
    /* Perhaps the mystery compiler can handle the intrinsics.
     */
#   define NO_ASM 1
#  endif

#  ifndef NO_ASM
__asm__(
ASM_PREFIX
" pushl %ebp\n\t"
" movl %esp, %ebp\n\t"
" pushl %edi\n\t"
" pushl %esi\n\t"
" movl 12(%ebp), %eax\n\t"
" .byte 0xc5,0xf9,0x28,0x00 # vmovapd (%eax), %xmm0\n\t"
" .byte 0xc5,0xf9,0x7e,0xc1 # vmovd %xmm0, %ecx\n\t"
" xorl 8(%ebp), %ecx\n\t"
" .byte 0xc4,0xe3,0x79,0x22,0xc9,0x00 # vpinsrd $0, %ecx, %xmm0, %xmm1\n\t"
" .byte 0xc4,0xe3,0x79,0x16,0xc1,0x01 # vpextrd $1, %xmm0, %ecx\n\t"
" .byte 0xc4,0xe3,0x71,0x22,0xc9,0x01 # vpinsrd $1, %ecx, %xmm1, %xmm1\n\t"
" movl 20(%ebp), %edi\n\t"
" .byte 0xc5,0xf9,0x6f,0x07 # vmovdqa (%edi), %xmm0\n\t"
" .byte 0xc5,0xf9,0x6f,0x57,0x10 # vmovdqa 16(%edi), %xmm2\n\t"
" movl $1, %edx\n\t"
" movl 16(%ebp), %ecx\n\t"
" cmpl $4, %ecx\n\t"
" jl 1f\n\t"
" .byte 0xc5,0xf9,0x6f,0x58,0x30 # vmovdqa 48(%eax), %xmm3\n\t"
" .byte 0xc5,0xf9,0x6f,0x68,0x10 # vmovdqa 16(%eax), %xmm5\n\t"
" .byte 0xc5,0xf9,0x6f,0x60,0x20 # vmovdqa 32(%eax), %xmm4\n\t"
" leal -3(%ecx), %esi\n\t"
" movl $4, %edx\n\t"
" cmpl $5, %esi\n\t"
" jl 2f\n\t"
" .byte 0xc5,0xf9,0x6f,0x77,0x20 # vmovdqa 32(%edi), %xmm6\n\t"
" leal 112(%eax), %edi\n\t"
" movl $4, %edx\n\t"
" .align 4, 0x90\n"
"3: .byte 0xc4,0xe3,0x49,0x44,0xfb,0x11 # vpclmulqdq $17, %xmm3, %xmm6, %xmm7\n\t"
" .byte 0xc4,0xe3,0x49,0x44,0xdb,0x00 # vpclmulqdq $0, %xmm3, %xmm6, %xmm3\n\t"
" .byte 0xc5,0xe1,0xef,0x1f # vpxor (%edi), %xmm3, %xmm3\n\t"
" .byte 0xc5,0xc1,0xef,0xdb # vpxor %xmm3, %xmm7, %xmm3\n\t"
" .byte 0xc4,0xe3,0x49,0x44,0xfc,0x11 # vpclmulqdq $17, %xmm4, %xmm6, %xmm7\n\t"
" .byte 0xc4,0xe3,0x49,0x44,0xe4,0x00 # vpclmulqdq $0, %xmm4, %xmm6, %xmm4\n\t"
" .byte 0xc5,0xd9,0xef,0x67,0xf0 # vpxor -16(%edi), %xmm4, %xmm4\n\t"
" .byte 0xc5,0xc1,0xef,0xe4 # vpxor %xmm4, %xmm7, %xmm4\n\t"
" .byte 0xc4,0xe3,0x49,0x44,0xfd,0x11 # vpclmulqdq $17, %xmm5, %xmm6, %xmm7\n\t"
" .byte 0xc4,0xe3,0x49,0x44,0xed,0x00 # vpclmulqdq $0, %xmm5, %xmm6, %xmm5\n\t"
" .byte 0xc5,0xd1,0xef,0x6f,0xe0 # vpxor -32(%edi), %xmm5, %xmm5\n\t"
" .byte 0xc5,0xc1,0xef,0xed # vpxor %xmm5, %xmm7, %xmm5\n\t"
" .byte 0xc4,0xe3,0x49,0x44,0xf9,0x11 # vpclmulqdq $17, %xmm1, %xmm6, %xmm7\n\t"
" .byte 0xc4,0xe3,0x49,0x44,0xc9,0x00 # vpclmulqdq $0, %xmm1, %xmm6, %xmm1\n\t"
" .byte 0xc5,0xf1,0xef,0x4f,0xd0 # vpxor -48(%edi), %xmm1, %xmm1\n\t"
" .byte 0xc5,0xc1,0xef,0xc9 # vpxor %xmm1, %xmm7, %xmm1\n\t"
" addl $64, %edi\n\t"
" addl $4, %edx\n\t"
" cmpl %esi, %edx\n\t"
" jl 3b\n"
"2: .byte 0xc4,0xe3,0x69,0x44,0xf1,0x11 # vpclmulqdq $17, %xmm1, %xmm2, %xmm6\n\t"
" .byte 0xc4,0xe3,0x69,0x44,0xc9,0x00 # vpclmulqdq $0, %xmm1, %xmm2, %xmm1\n\t"
" .byte 0xc5,0xd1,0xef,0xc9 # vpxor %xmm1, %xmm5, %xmm1\n\t"
" .byte 0xc5,0xf1,0xef,0xee # vpxor %xmm6, %xmm1, %xmm5\n\t"
" .byte 0xc4,0xe3,0x69,0x44,0xcd,0x11 # vpclmulqdq $17, %xmm5, %xmm2, %xmm1\n\t"
" .byte 0xc4,0xe3,0x69,0x44,0xed,0x00 # vpclmulqdq $0, %xmm5, %xmm2, %xmm5\n\t"
" .byte 0xc5,0xd9,0xef,0xe5 # vpxor %xmm5, %xmm4, %xmm4\n\t"
" .byte 0xc5,0xd9,0xef,0xe1 # vpxor %xmm1, %xmm4, %xmm4\n\t"
" .byte 0xc4,0xe3,0x69,0x44,0xcc,0x11 # vpclmulqdq $17, %xmm4, %xmm2, %xmm1\n\t"
" .byte 0xc4,0xe3,0x69,0x44,0xe4,0x00 # vpclmulqdq $0, %xmm4, %xmm2, %xmm4\n\t"
" .byte 0xc5,0xe1,0xef,0xdc # vpxor %xmm4, %xmm3, %xmm3\n\t"
" .byte 0xc5,0xe1,0xef,0xc9 # vpxor %xmm1, %xmm3, %xmm1\n"
"1: cmpl %ecx, %edx\n\t"
" jge 4f\n\t"
" subl %edx, %ecx\n\t"
" shll $4, %edx\n\t"
" addl %edx, %eax\n\t"
" .align 4, 0x90\n"
"5: .byte 0xc4,0xe3,0x69,0x44,0xd9,0x11 # vpclmulqdq $17, %xmm1, %xmm2, %xmm3\n\t"
" .byte 0xc4,0xe3,0x69,0x44,0xc9,0x00 # vpclmulqdq $0, %xmm1, %xmm2, %xmm1\n\t"
" .byte 0xc5,0xf1,0xef,0x08 # vpxor (%eax), %xmm1, %xmm1\n\t"
" .byte 0xc5,0xf1,0xef,0xcb # vpxor %xmm3, %xmm1, %xmm1\n\t"
" addl $16, %eax\n\t"
" decl %ecx\n\t"
" jne 5b\n"
"4: .byte 0xc4,0xe3,0x79,0x44,0xd1,0x01 # vpclmulqdq $1, %xmm1, %xmm0, %xmm2\n\t"
" .byte 0xc5,0xf9,0xdb,0xda # vpand %xmm2, %xmm0, %xmm3\n\t"
" .byte 0xc4,0xe3,0x79,0x44,0xc3,0x01 # vpclmulqdq $1, %xmm3, %xmm0, %xmm0\n\t"
" .byte 0xc5,0xf9,0x7e,0xc0 # vmovd %xmm0, %eax\n\t"
" .byte 0xc4,0xe3,0x79,0x16,0xc9,0x02 # vpextrd $2, %xmm1, %ecx\n\t"
" xorl %eax, %ecx\n\t"
" .byte 0xc4,0xe3,0x79,0x16,0xd0,0x01 # vpextrd $1, %xmm2, %eax\n\t"
" xorl %ecx, %eax\n\t"
" .byte 0xc4,0xe3,0x79,0x16,0xc2,0x01 # vpextrd $1, %xmm0, %edx\n\t"
" .byte 0xc4,0xe3,0x79,0x16,0xc9,0x03 # vpextrd $3, %xmm1, %ecx\n\t"
" xorl %edx, %ecx\n\t"
" .byte 0xc4,0xe3,0x79,0x16,0xd2,0x02 # vpextrd $2, %xmm2, %edx\n\t"
" xorl %ecx, %edx\n\t"
" popl %esi\n\t"
" popl %edi\n\t"
" popl %ebp\n\t"
" ret\n"
ASM_SUFFIX
);
#  endif
# else /* architecture type */
  /* Not Intel; not that the C intrinsics will compile anywhere else,
   * but this gives a slightly better error message.
   */
#  define NO_ASM 1
# endif
#endif /* NO_ASM */

#ifndef NO_ASM
/* Declaration for use below. */
uint64_t kernel(uint32_t c, unsigned char * buf, int len_128bit, struct crc_by128_K * K);
#else
#pragma message("Compiling 'kernel' from C source with intrinsics")
#include <wmmintrin.h>
#include <emmintrin.h>

union u {
    __m128i v;
    struct {
        uint64_t lo;
        uint64_t hi;
    };
};

/**
 * Assume c is the existing CRC,
 * buf is 16-byte-aligned, and
 * len_128bit is the number of 16-byte blocks, greater than zero.
 */
uint64_t kernel(uint32_t c, unsigned char * buf, int len_128bit,
                struct crc_by128_K * K) {

    __m128i * b = (__m128i *) buf;
    int i = 0;

    /* 128 bit constants and variables. */
    __m128i K_544_480, K_160_96, K_M_64,
        x0, x1, x2, x3,
        x0a, x1a, x2a, x3a,
        x0b, x1b, x2b, x3b;

    /* Use these to move data between xmm registers and "normal" registers. */
    union u ut0, ut1, ut2, ut3;

    K_544_480 = * (__m128i *) & (K -> xtt544);
    K_160_96 = * (__m128i *) & (K -> xtt160);
    K_M_64 = * (__m128i *) & (K -> mask);

    /* Incorporate existing CRC into first item */
    ut0.v = b[0];
    ut0.lo ^= c;
    x0 = ut0.v;

    if (len_128bit >= 4) {
        /* Written as a slightly pipelined loop. */

        x1 = b[1];
        x2 = b[2];
        x3 = b[3];

        /* Iterate once if len_128bit is between 8 and 11
         * 4 < 8-3 < 11 - 3
         * 8 !< 11 - 3 < 12 - 3.
         *
         * 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12
         *
         */
        for (i = 4; i < len_128bit - 3 ; i+= 4) {
            /* Each iteration of this loop folds the 512 bits of polynomial
             * in x0-x3 with the data in b[i]..b[i+3].
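             * (Roughly: x0 becomes clmul(x0.lo, xtt544) ^ clmul(x0.hi, xtt480) ^ b[i],
             * and likewise x1..x3 with b[i+1]..b[i+3]; the carry-less multiplies
             * select the low or high 64-bit halves via the 0x00 / 0x11 arguments.)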
             */
            x0a = b[i];
            x1a = b[i+1];
            x2a = b[i+2];
            x3a = b[i+3];

            x0b = _mm_clmulepi64_si128(K_544_480, x0, 0x00);
            x0 = _mm_clmulepi64_si128(K_544_480, x0, 0x11);
            x1b = _mm_clmulepi64_si128(K_544_480, x1, 0x00);
            x1 = _mm_clmulepi64_si128(K_544_480, x1, 0x11);

            x2b = _mm_clmulepi64_si128(K_544_480, x2, 0x00);
            x2 = _mm_clmulepi64_si128(K_544_480, x2, 0x11);
            x3b = _mm_clmulepi64_si128(K_544_480, x3, 0x00);
            x3 = _mm_clmulepi64_si128(K_544_480, x3, 0x11);

            // x0 ^= x0a ^ x0b;
            x0 = _mm_xor_si128(x0, x0a);
            x0 = _mm_xor_si128(x0, x0b);
            // x1 ^= x1a ^ x1b;
            x1 = _mm_xor_si128(x1, x1a);
            x1 = _mm_xor_si128(x1, x1b);
            // x2 ^= x2a ^ x2b;
            x2 = _mm_xor_si128(x2, x2a);
            x2 = _mm_xor_si128(x2, x2b);
            // x3 ^= x3a ^ x3b;
            x3 = _mm_xor_si128(x3, x3a);
            x3 = _mm_xor_si128(x3, x3b);
        }
        /* x0 - x3 contain 4 x 128 bits of accumulated result.
         * 0-3 16-byte blocks potentially remain in the [i,len_128bit) entries.
         * Assume trailing bytes beyond that are handled by our caller.
         */
        x0a = _mm_clmulepi64_si128(K_160_96, x0, 0x00);
        x0b = _mm_clmulepi64_si128(K_160_96, x0, 0x11);
        x1 = _mm_xor_si128(x1, x0a);
        x1 = _mm_xor_si128(x1, x0b);
        x0a = _mm_clmulepi64_si128(K_160_96, x1, 0x00);
        x0b = _mm_clmulepi64_si128(K_160_96, x1, 0x11);
        x2 = _mm_xor_si128(x2, x0a);
        x2 = _mm_xor_si128(x2, x0b);
        x0a = _mm_clmulepi64_si128(K_160_96, x2, 0x00);
        x0b = _mm_clmulepi64_si128(K_160_96, x2, 0x11);
        x3 = _mm_xor_si128(x3, x0a);
        x3 = _mm_xor_si128(x3, x0b);
    } else {
        /* Loaded 128 bits already into x0.
         */
        x3 = x0;
        i = 1;
    }

    /* x3 is now the 128-bit result.
     * Fold 0-3 remaining 128-bit chunks into x3.
     */
    for (; i < len_128bit; i++) {
        x0 = b[i]; // data to fold
        // fold x3 down by 128 to align with data.
        x0a = _mm_clmulepi64_si128(K_160_96, x3, 0x00);
        x0b = _mm_clmulepi64_si128(K_160_96, x3, 0x11);
        x3 = _mm_xor_si128(x0, x0a);
        x3 = _mm_xor_si128(x3, x0b);
        // x3 is now aligned with the data we just loaded.
    }

    /*
     * No more 128-bit chunks remain.
     * Fold x3 down into 32 bits.
     */
    {
        ut0.v = x3;
        uint64_t w;
        uint64_t y = ut0.hi; // 64 low-order terms of polynomial into y.

        /* polynomial term order:
         *   high -> low
         * bit number order
         *   0 -> 127
         *
         * input, from which y was just extracted.
         * w0 w1 y0 y1
         * w0:w1 * x64 yields 96 bits.
         * p0:p1:p2:__ (aligned wrong, store to extract p1 and p2)
         * p0:p1:__:__ & ff:00:__:__ (mask to get rid of p1)
         * p0:00:__:__
         * p0:00 * x64 (times x64 yields 64 bits)
         * r0:r1 store and xor.
         */

        x0 = _mm_clmulepi64_si128(K_M_64, x3, 0x01);
        ut1.v = x0;
        w = (ut1.lo >> 32) + (ut1.hi << 32); // extract low-poly 64 bits.
        x0 = _mm_and_si128(K_M_64, x0); // mask away what we just extracted.
        x0 = _mm_clmulepi64_si128(K_M_64, x0, 0x01);
        w ^= y;
        ut2.v = x0;
        w ^= ut2.lo;

        return w;
    }
}
#endif /* NO_ASM */

uint32_t fastcrc32(jint crc, Bytef * buf, jint len) {
    const unsigned long FAR * timesXtoThe32 = crc_table;
    intptr_t ibuf = (intptr_t) buf;
    int log_align = 4;
    int align = 1 << log_align;
    int mask = align - 1;
    int islop = (align - ibuf) & mask;
    uint32_t c = ~crc;
    int i = 0;

    if (len - islop >= align) {
        /* Handle bytes preceding 16-byte alignment.
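         * The islop bytes before the first 16-byte boundary are folded one at
         * a time by the byte-wise table loop below, so that kernel() sees only
         * 16-byte-aligned blocks; any tail shorter than 16 bytes is left to
         * the final loop.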
         */
        for (i = 0; i < islop; i++ ) {
            uint32_t x0 = buf[i];
            x0 = timesXtoThe32[(x0 ^ c) & 0xFF];
            c = x0 ^ (c >> 8);
        }
        buf += i;
        len -= i;

        jint len_128bit = len >> log_align;

        if (len_128bit > 0) {
            uint64_t w = kernel(c, buf, len_128bit, K_struct);
            /*
             * 8 8-bit folds to compute 32-bit CRC.
             */
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            w = timesXtoThe32[w & 0xFF] ^ (w >> 8);
            c = (uint32_t) w;
            i = len_128bit << log_align;
        } else {
            i = 0;
        }
    }
    /* Handle short CRC and tail of long CRC */
    for (; i < len; i++) {
        uint32_t x0 = buf[i];
        x0 = timesXtoThe32[(x0 ^ c) & 0xFF];
        c = x0 ^ (c >> 8);
    }
    return ~c;
}
#endif /* CAN_COMPILE_CLMUL */
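
/* A minimal, illustrative self-check sketch: it compares fastcrc32 against
 * zlib's crc32 over a small buffer at a variety of lengths.  The
 * CRC32_CLMUL_SELFTEST guard macro is hypothetical (never defined by the
 * build), and the sketch assumes it is compiled as part of this translation
 * unit (so that the file-local crc_table, K_struct and init_crc_by128_K are
 * visible) and run on hardware supporting the CLMUL/AVX instructions used by
 * kernel().
 */
#if CAN_COMPILE_CLMUL && defined(CRC32_CLMUL_SELFTEST)
#include <stdio.h>

int main(void) {
    static unsigned char data[1024];
    unsigned int i;

    /* Fill the buffer with an arbitrary byte pattern. */
    for (i = 0; i < sizeof(data); i++) data[i] = (unsigned char) (i * 131 + 7);

    /* Normally done from Java_java_util_zip_CRC32_init. */
    crc_table = get_crc_table();
    K_struct = init_crc_by128_K();
    if (K_struct == 0) {
        printf("posix_memalign failed\n");
        return 1;
    }

    /* Lengths that exercise the unaligned head, the kernel, and the tail. */
    for (i = 1; i <= sizeof(data); i += 37) {
        uint32_t slow = (uint32_t) crc32(0L, data, i);
        uint32_t fast = fastcrc32(0, data, (jint) i);
        if (slow != fast) {
            printf("mismatch at length %u\n", i);
            return 1;
        }
    }
    printf("fastcrc32 agrees with zlib crc32\n");
    return 0;
}
#endif /* CRC32_CLMUL_SELFTEST */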