--- old/makefiles/mapfiles/libzip/mapfile-vers 2013-05-16 08:24:39.000000000 -0400 +++ new/makefiles/mapfiles/libzip/mapfile-vers 2013-05-16 08:24:39.000000000 -0400 @@ -31,6 +31,7 @@ Java_java_util_zip_Adler32_update; Java_java_util_zip_Adler32_updateBytes; Java_java_util_zip_Adler32_updateByteBuffer; + Java_java_util_zip_CRC32_init; Java_java_util_zip_CRC32_update; Java_java_util_zip_CRC32_updateBytes; Java_java_util_zip_CRC32_updateByteBuffer; --- old/src/share/classes/java/util/zip/Adler32.java 2013-05-16 08:24:40.000000000 -0400 +++ new/src/share/classes/java/util/zip/Adler32.java 2013-05-16 08:24:40.000000000 -0400 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1996, 2011, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2013, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -27,6 +27,7 @@ import java.nio.ByteBuffer; import sun.nio.ch.DirectBuffer; +import java.util.concurrent.RecursiveTask; /** * A class that can be used to compute the Adler-32 checksum of a data @@ -41,8 +42,46 @@ */ public class Adler32 implements Checksum { + /* + * This is a reformulation of the Adler32 calculation that permits recursive + * subdivision of the problem, thus allowing both parallelism and faster calculation + * byte-at-a-time checksums. + * + * The Adler calculation is regarded as + * taking an input text T of length N = |T|, + * and computing two quantities, A(T) and B(T), + * where A(T) = 1 + sum_{0 <= i < |T|}(T_i) + * and B(T) = |T| + sum_{0 <= i < |T|}((|T| - i) * T_i), + * both modulo 65521. + * + * However, with sufficient algebraic manipulation, one can derive + * that A(U||V) = A(U) + A(V) - 1 + * and B(U||V) = B(U) + B(V) + |V| (A(U) - 1). + */ + + /** + * The modulo operation can be deferred for MAX_SLOP bytes of input, permitting + * faster byte-by-byte Adler computations. 1024 is plenty conservative. + */ + private final static int MAX_SLOP = 1024; + + /** + * For inputs smaller than SERIAL_BELOW fork-join parallelism might + * not be profitable. + */ + private final static int SERIAL_BELOW = 1024 * 1024; + + /** + * For inputs smaller than JAVA_ADLER_BELOW JNI overheads make it faster + * to compute on the Java side. (This may change as overheads and compiler + * quality change). + */ + private final static int JAVA_ADLER_BELOW = 32; private int adler = 1; + private int aa = 1; + private int bb = 0; + private int slop = 0; /** * Creates a new Adler32 object. @@ -57,7 +96,13 @@ * @param b the byte to update the checksum with */ public void update(int b) { - adler = update(adler, b); + int la = aa + (b & 0xFF); + bb = bb + la; + aa = la; + slop++; + if (slop == MAX_SLOP) { + getValueI(); + } } /** @@ -70,7 +115,12 @@ if (off < 0 || len < 0 || off > b.length - len) { throw new ArrayIndexOutOfBoundsException(); } - adler = updateBytes(adler, b, off, len); + if (len < JAVA_ADLER_BELOW) { + for (int i = 0; i < len; i++) + update(b[i+off]); + } else { + setValue(updateBytesFJ(getValueI(), b, off, len)); + } } /** @@ -79,10 +129,14 @@ * @param b the byte array to update the checksum with */ public void update(byte[] b) { - adler = updateBytes(adler, b, 0, b.length); + if (b.length < JAVA_ADLER_BELOW) { + for (int i = 0; i < b.length; i++) + update(b[i]); + } else { + setValue(updateBytesFJ(getValueI(), b, 0, b.length)); + } } - /** * Updates the checksum with the bytes from the specified buffer. 
* @@ -104,13 +158,13 @@ if (rem <= 0) return; if (buffer instanceof DirectBuffer) { - adler = updateByteBuffer(adler, ((DirectBuffer)buffer).address(), pos, rem); + setValue(updateByteBufferFJ(getValueI(), ((DirectBuffer)buffer).address(), pos, rem)); } else if (buffer.hasArray()) { - adler = updateBytes(adler, buffer.array(), pos + buffer.arrayOffset(), rem); + setValue(updateBytesFJ(getValueI(), buffer.array(), pos + buffer.arrayOffset(), rem)); } else { byte[] b = new byte[rem]; buffer.get(b); - adler = updateBytes(adler, b, 0, b.length); + setValue(updateBytesFJ(getValueI(), b, 0, b.length)); } buffer.position(limit); } @@ -119,19 +173,133 @@ * Resets the checksum to initial value. */ public void reset() { + aa = 1; + bb = 0; adler = 1; + slop = 0; } /** * Returns the checksum value. */ public long getValue() { - return (long)adler & 0xffffffffL; + return getValueI() & 0xffffffffL; + } + + private int getValueI() { + if (slop > 0) { + aa = aa % 65521; + bb = bb % 65521; + adler = (bb << 16) + aa; + slop = 0; + } + return adler; + } + + private void setValue(int newValue) { + aa = newValue & 0xffff; + bb = newValue >>> 16; + adler = newValue; + slop = 0; } private native static int update(int adler, int b); private native static int updateBytes(int adler, byte[] b, int off, int len); + private native static int updateByteBuffer(int adler, long addr, int off, int len); + + private static int updateBytesFJ(int adler, byte[] ba, int start, int length) { + if (length < SERIAL_BELOW) { + return updateBytes(adler, ba, start, length); + } + AdlerTask w = new AdlerTask(adler, ba, start, length); + w.invoke(); + return(w.join()); + } + + private static int updateByteBufferFJ(int adler, long addr, int start, int length) { + if (length < SERIAL_BELOW) { + return updateByteBuffer(adler, addr, start, length); + } + AdlerBufferTask w = new AdlerBufferTask(adler, addr, start, length); + w.invoke(); + return(w.join()); + } + + static int combineAdlers(int prev_adler, int next_adler, int length) { + /* that A(U||V) = A(U) + A(V) - 1 + * and B(U||V) = B(U) + B(V) + |V| (A(U) - 1). 
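+         * (Derivation sketch: appending V leaves the A sums additive apart from
+         *  the duplicated leading 1, and gives every byte of U an extra weight
+         *  of |V| in B, which is the source of the |V| * (A(U) - 1) term.)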
+ */ + if (prev_adler == 1) { + // Appending to initial checksum + return next_adler; + } else { + int after_a = next_adler & 0xffff; + int after_b = next_adler >>> 16; + + int prev_a = prev_adler & 0xffff; + int prev_b = prev_adler >>> 16; + + long partial = (long) length * (prev_a + 65520) % 65521; + prev_b = (prev_b + after_b + (int) partial) % 65521; + prev_a = (prev_a + after_a + 65520) % 65521; + return ((prev_b << 16) + prev_a); + } + } + + static class AdlerTask extends RecursiveTask { + final int adler; + final byte[] ba; + final int start; + final int length; + AdlerTask(int adler, byte[] ba, int start, int length) { + this.ba = ba; + this.start = start; + this.length = length; + this.adler = adler; + } + + @Override + protected Integer compute() { + if (length < SERIAL_BELOW) { + return updateBytes(adler, ba, start, length); + } else { + int half = length/2; + AdlerTask task2 = new AdlerTask(1, ba, start + half, length - half); + task2.fork(); + AdlerTask task1 = new AdlerTask(adler, ba, start, half); + int result1 = task1.compute(); + return combineAdlers(result1, task2.join(), length - half); + } + } + } + + static class AdlerBufferTask extends RecursiveTask { + final int adler; + final long addr; + final int start; + final int length; + AdlerBufferTask(int adler, long addr, int start, int length) { + this.addr = addr; + this.start = start; + this.length = length; + this.adler = adler; + } + + @Override + protected Integer compute() { + if (length < SERIAL_BELOW) { + return updateByteBuffer(adler, addr, start, length); + } else { + int half = length/2; + AdlerBufferTask task2 = new AdlerBufferTask(1, addr, start + half, length - half); + task2.fork(); + AdlerBufferTask task1 = new AdlerBufferTask(adler, addr, start, half); + int result1 = task1.compute(); + return combineAdlers(result1, task2.join(), length - half); + } + } + } } --- old/src/share/classes/java/util/zip/CRC32.java 2013-05-16 08:24:41.000000000 -0400 +++ new/src/share/classes/java/util/zip/CRC32.java 2013-05-16 08:24:41.000000000 -0400 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1996, 2011, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2013, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -25,8 +25,12 @@ package java.util.zip; -import java.nio.ByteBuffer; import sun.nio.ch.DirectBuffer; +import java.lang.reflect.Field; +import java.nio.ByteBuffer; +import java.util.concurrent.RecursiveTask; +import java.util.concurrent.ForkJoinPool; +import sun.misc.Unsafe; /** * A class that can be used to compute the CRC-32 of a data stream. 
@@ -55,7 +59,10 @@ * @param b the byte to update the checksum with */ public void update(int b) { - crc = update(crc, b); + int c = ~ crc; + b = timesXtoThe32[(b ^ c) & 0xFF]; + b = b ^ (c >>> 8); + crc = ~b; } /** @@ -68,7 +75,12 @@ if (off < 0 || len < 0 || off > b.length - len) { throw new ArrayIndexOutOfBoundsException(); } - crc = updateBytes(crc, b, off, len); + + if (len < javaCRCIfSmallerThan) { + crc = updateBytesSimple(crc, b, off, len); + } else { + crc = updateBytesFJ(crc, b, off, len); + } } /** @@ -77,7 +89,7 @@ * @param b the array of bytes to update the checksum with */ public void update(byte[] b) { - crc = updateBytes(crc, b, 0, b.length); + crc = updateBytesFJ(crc, b, 0, b.length); } /** @@ -101,13 +113,13 @@ if (rem <= 0) return; if (buffer instanceof DirectBuffer) { - crc = updateByteBuffer(crc, ((DirectBuffer)buffer).address(), pos, rem); + crc = updateByteBufferFJ(crc, ((DirectBuffer)buffer).address(), pos, rem); } else if (buffer.hasArray()) { - crc = updateBytes(crc, buffer.array(), pos + buffer.arrayOffset(), rem); + crc = updateBytesFJ(crc, buffer.array(), pos + buffer.arrayOffset(), rem); } else { byte[] b = new byte[rem]; buffer.get(b); - crc = updateBytes(crc, b, 0, b.length); + crc = updateBytesFJ(crc, b, 0, b.length); } buffer.position(limit); } @@ -131,4 +143,267 @@ private native static int updateByteBuffer(int adler, long addr, int off, int len); + + + private static int updateBytesSimple(int crc, byte[] b, int off, int len) { + int[] a = timesXtoThe32; + if (a.length < 256) + throw new ArrayIndexOutOfBoundsException(); + int c = ~crc; + for (int i = 0; i < len; i++ ) { + int x0 = b[i + off]; + x0 = a[(x0 ^ c) & 0xFF]; + c = x0 ^ (c >>> 8); + } + return ~c; + } + + private native static boolean init(int[] timesXtoThe32, boolean try_use_clmul); + + /** + * timesXtoThe32[a] = rep(poly(a)*x**32) + */ + static int[] timesXtoThe32; + + /** + * powersByLog[i] = rep(x**(2**i)) + */ + static final int[] powersByLog = new int[32]; + + static final int LOG_PB8_LEN = 8; + + /** + * powersBy8[i] = rep(x**(8*i)) + */ + static final int[] powersBy8 = new int[1 << LOG_PB8_LEN]; + + static final int X_to_the_1 = 0x40000000; + static final int X_to_the_0 = 0x80000000; + + /** + * Indicates if the clmul instruction is enabled in the native + * code; this changes the estimated cost of computing a CRC. + */ + private static boolean clmulEnabled; + + /** + * Helpful for deciding whether to use workstealing or not. + */ + private static int fjParallelism = ForkJoinPool.getCommonPoolParallelism(); + + /** + * Estimated task size below which the fork-join overhead could be too large. + * May be modified depending on platform properties. + */ + static int serialIfSmallerThan = 512 * 1024; + + /** + * Estimated CRC size below which the JNI overhead is too large. + * May be modified depending on platform properties. 
+ */ + static int javaCRCIfSmallerThan = 80; + + static int ARRAY_BYTE_BASE_OFFSET = 0; + + static boolean debug = false; + + static { + timesXtoThe32 = new int[256]; + boolean try_use_clmul = + "true".equals(sun.misc.VM.getSavedProperty("sun.zip.clmulSupported")); + clmulEnabled = init(timesXtoThe32, try_use_clmul); + + if (clmulEnabled) + serialIfSmallerThan *= 2; + + if ("true".equals(sun.misc.VM.getSavedProperty("sun.zip.serialOnly"))) + fjParallelism = 1; + + powersByLog[0] = X_to_the_1; + for (int i = 1; i < powersByLog.length; i++) { + int x = powersByLog[i-1]; + powersByLog[i] = mul(x,x); + } + + powersBy8[0] = X_to_the_0; + for (int i = 1; i < powersBy8.length; i++) { + int x = powersBy8[i-1]; + powersBy8[i] = mul(x,powersByLog[3]); + } + + /* Attempt to do all fork-join splits so that they land on a 16-byte boundary. + Even if arrays are only 8-byte aligned, this improves our chances. */ + try { + Field field = sun.misc.Unsafe.class.getDeclaredField("theUnsafe"); + field.setAccessible(true); + Unsafe u = (Unsafe) field.get(null); + ARRAY_BYTE_BASE_OFFSET = u.ARRAY_BYTE_BASE_OFFSET; + } catch (NoSuchFieldException | SecurityException | + IllegalArgumentException | IllegalAccessException e) { + // It was just an optimization, no need to fail hard. + } + } + + /* Java implementation of enough GF arithmetic to combine two CRCs for fork/join. */ + + /** + * Performs a carryless 32x32 into 63 (NOT 64) bit multiply. + * Note that the low-order term of the polynomial lands in bit + * 62, not 63. + * + * The Intel pclmulqdq instruction works in much the same way, + * except that it is 64 x 64 into 128. + */ + private static long clmul32x32(int a, int b) { + long accum = 0; + long la = (long) a & 0xffffffffL; + long lb = (long) b & 0xffffffffL; + while (la != 0) { + if (0 != (la & 1)) + accum ^= lb; + la = la >>> 1; + lb = lb << 1; + } + return accum; + } + + /** + * Converts a 64-bit polynomial into a 32-bit polynomial, modulo P. + */ + static int reduceLongTable(long x) { + x = (x >>> 8) ^ (timesXtoThe32[(int)(x & 0xFF)] & 0xffffffffL); + x = (x >>> 8) ^ (timesXtoThe32[(int)(x & 0xFF)] & 0xffffffffL); + x = (x >>> 8) ^ (timesXtoThe32[(int)(x & 0xFF)] & 0xffffffffL); + x = (x >>> 8) ^ (timesXtoThe32[(int)(x & 0xFF)] & 0xffffffffL); + return (int) x; + } + + /** + * Returns polynomial a times b modulo P. + * The least (x**0) term of the polynomial is + * aligned with the sign bit of the returned int. + * + * @param a + * @param b + * @return + */ + static int mul(int a, int b) { + long product = clmul32x32(a, b); + return reduceLongTable(product << 1); + } + + /** + * Returns the polynomial for a * x ** 8n, where a + * is some other polynomial. n is typically a byte count, + * and 8n is the number of bits in n bytes. 
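+     * For n smaller than the powersBy8 table this is a single table multiply;
+     * otherwise the low LOG_PB8_LEN bits of n are looked up in powersBy8 and
+     * each remaining set bit contributes one multiply by the matching
+     * powersByLog entry (square-and-multiply on the exponent).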
+ */ + static int timesXtoThe8NTable(int a, int n) { + if (n == 0) + return a; + if (n < powersBy8.length) { + return mul(a,powersBy8[n]); + } + int lo = powersBy8.length - 1; + int accum = mul(a,powersBy8[n & lo]); + n = n >>> LOG_PB8_LEN; + int i = LOG_PB8_LEN + 3; + while (n != 0) { + if (0 != (n & 1)) + accum = mul(accum,powersByLog[i]); + n = n >>> 1; + i++; + } + return accum; + } + + static int combine(int prev, int next, int next_length) { + // x**(8 * length) * prev + next + return next ^ timesXtoThe8NTable(prev, next_length); + } + + private static int updateBytesFJ(int crc, byte[] b, int off, int len) { + if (fjParallelism < 2 || len < serialIfSmallerThan) + return updateBytes(crc, b, off, len); + CRCArrayTask cat = new CRCArrayTask(crc, b, off, len); + cat.invoke(); + return cat.join(); + } + + private static int updateByteBufferFJ(int crc, long addr, + int off, int len) { + if (fjParallelism < 2 || len < serialIfSmallerThan) + return updateByteBuffer(crc, addr, off, len); + CRCBufferTask cat = new CRCBufferTask(crc, addr, off, len); + cat.invoke(); + return cat.join(); + } + + static final class CRCArrayTask extends RecursiveTask { + final byte[] ba; + final int start; + final int length; + final int crc; + + CRCArrayTask(int crc, byte[] ba, int start, int length) { + this.crc = crc; + this.ba = ba; + this.start = start; + this.length = length; + } + + @Override + protected Integer compute() { + if (length < serialIfSmallerThan) { + if (length < javaCRCIfSmallerThan) { + return updateBytesSimple(crc, ba, start, length); + } else { + return updateBytes(crc, ba, start, length); + } + } else { + int half = length/2; + /* Avoid gratuitous misalignment. */ + long addr = ARRAY_BYTE_BASE_OFFSET; // Best we can do given limited info. + int unaligned = (int) (addr + start + half) & 15; + if (half - unaligned >= 32) + half -= unaligned; + CRCArrayTask task2 = new CRCArrayTask(0, ba, start + half, length - half); + task2.fork(); + CRCArrayTask task1 = new CRCArrayTask(crc, ba, start, half); + int result1 = task1.compute(); + return combine(result1, task2.join(), length - half); + } + } + } + + static final class CRCBufferTask extends RecursiveTask { + final long addr; + final int start; + final int length; + final int crc; + + CRCBufferTask(int crc, long addr, int start, int length) { + this.crc = crc; + this.addr = addr; + this.start = start; + this.length = length; + } + + @Override + protected Integer compute() { + if (length < serialIfSmallerThan) { + return updateByteBuffer(crc, addr, start, length); + } else { + int half = length/2; + /* Avoid gratuitous misalignment. */ + int unaligned = (int) (addr + start + half) & 15; + if (half - unaligned >= 32) + half -= unaligned; + CRCBufferTask task2 = new CRCBufferTask(0, addr, start + half, length - half); + task2.fork(); + CRCBufferTask task1 = new CRCBufferTask(crc, addr, start, half); + int result1 = task1.compute(); + return combine(result1, task2.join(), length - half); + } + } + } } --- old/src/share/classes/sun/misc/VM.java 2013-05-16 08:24:42.000000000 -0400 +++ new/src/share/classes/sun/misc/VM.java 2013-05-16 08:24:41.000000000 -0400 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1996, 2012, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2013, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -298,6 +298,9 @@ // used by sun.launcher.LauncherHelper props.remove("sun.java.launcher.diag"); + + // used by java.util.zip.CRC32 + props.remove("sun.zip.clmulSupported"); } // Initialize any miscellenous operating system settings that need to be --- old/src/share/native/java/util/zip/CRC32.c 2013-05-16 08:24:42.000000000 -0400 +++ new/src/share/native/java/util/zip/CRC32.c 2013-05-16 08:24:42.000000000 -0400 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -33,13 +33,134 @@ #include "java_util_zip_CRC32.h" +/* define CAN_COMPILE_CLMUL 0 to disable fastcrc32 completely. */ + +#ifndef CAN_COMPILE_CLMUL +# ifdef __x86_64 +# define CAN_COMPILE_CLMUL 1 +# elif defined(__i386) +# define CAN_COMPILE_CLMUL 1 +# endif +#endif + +#if CAN_COMPILE_CLMUL +#include +#include + +struct crc_by128_K { + /* The fields in this structure are arranged so that if it is + * allocated at a 16-byte alignment they can be picked up two at + * a time with 128-bit loads. + * + * Because of flipped bit order for this CRC polynomials + * the constant for X**N is left-shifted by 1. This is because + * a 64 x 64 polynomial multiply produces a 127-bit result + * but the highest term is always aligned to bit 0 in the container. + * Pre-shifting by one fixes this, at the cost of potentially making + * the 32-bit constant no longer fit in a 32-bit container (thus the + * use of uint64_t, though this is also the size used by the carry- + * less multiply instruction. + * + * In addition, the flipped bit order and highest-term-at-least-bit + * multiply changes the constants used. The 96-bit result will be + * aligned to the high-term end of the target 128-bit container, + * not the low-term end; that is, instead of a 512-bit or 576-bit fold, + * instead it is a 480 (=512-32) or 544 (=512+64-32) bit fold. + * + * This cause additional problems in the 128-to-64-bit reduction; see the + * code for details. By storing a mask in the otherwise unused half of + * a 128-bit constant, bits can be cleared before multiplication without + * storing and reloading. Note that staying on a 128-bit datapath means + * that some data is uselessly stored and some unused data is intersected + * with an irrelevant constant. 
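+ * The kernel loads these fields pairwise as K_M_64 (mask/xtt64),
+ * K_160_96 (xtt160/xtt96) and K_544_480 (xtt544/xtt480).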
+ */ + + uint64_t mask; /* low of K_M_64 */ + uint64_t xtt64; /* high of K_M_64 */ + uint64_t xtt160; /* low of K_160_96 */ + uint64_t xtt96; /* high of K_160_96 */ + uint64_t xtt544; /* low of K_544_480 */ + uint64_t xtt480; /* high of K_544_480 */ +}; + +struct crc_by128_K * K_struct = 0; + +static const uint64_t x64 = (uint64_t) 0xb1e6b092U << 1; +static const uint64_t x96 = (uint64_t) 0x6655004fU << 1; +static const uint64_t x160 = (uint64_t) 0xba8ccbe8U << 1; +static const uint64_t x480 = (uint64_t) 0xe3720acbU << 1; +static const uint64_t x544 = (uint64_t) 0xaa2215eaU << 1; + +static struct crc_by128_K * init_crc_by128_K() { + void * y; + int rc = posix_memalign( & y, 16, sizeof(struct crc_by128_K)); + if (rc) { + return (struct crc_by128_K *) NULL; + } else { + struct crc_by128_K * x = y; + x -> mask = 0xffffffffUL; + x -> xtt64 = x64; + x -> xtt160 = x160; + x -> xtt96 = x96; + x -> xtt544 = x544; + x -> xtt480 = x480; + return x; + } +} + +uint32_t fastcrc32(jint crc, Bytef * buf, jint len); + +/* Flag governing use of "CLMUL" instruction. + For now, implies little-endian. + Computed dynamically, incorporates information about + the current hardware and the compiler used to compile + this file. */ +static int useClmul = 0; +#else +/* Stub out fastcrc32 */ +# define fastcrc32 crc32 +# define useClmul 0 +#endif + + +/* Local copy of CRC32 table is used to fill and drain CLMUL CRC. + Extra members beyond the first 256-entry row are ignored. */ +static const unsigned long FAR * crc_table; + +/* Initialize java-side table (for small CRCs) to avoid extra startup work, + and capture the platform-dependent useClmul flag. +*/ +JNIEXPORT jboolean JNICALL +Java_java_util_zip_CRC32_init(JNIEnv *env, jclass cls, jarray b, jboolean use_clmul) +{ + /* Get the CRC table from zip to initialize JNI. Our private copy + is missing if not compiled for fastcrc32. */ + crc_table = get_crc_table(); + jint *buf = (*env)->GetPrimitiveArrayCritical(env, b, 0); + if (buf) { + /* Don't know for sure how big an unsigned long is, therefore + copy one at a time. */ + int i; + for (i = 0; i < 256; i++) buf[i] = (jint) (crc_table[i]); + (*env)->ReleasePrimitiveArrayCritical(env, b, buf, 0); + } +#if CAN_COMPILE_CLMUL + if (use_clmul) { + K_struct = init_crc_by128_K(); + useClmul = K_struct != 0; + /* Rather than throw OOME, just do without fast CRC. */ + } +#endif + return useClmul; +} + JNIEXPORT jint JNICALL Java_java_util_zip_CRC32_update(JNIEnv *env, jclass cls, jint crc, jint b) { Bytef buf[1]; buf[0] = (Bytef)b; - return crc32(crc, buf, 1); + return crc32(crc, buf, 1); // single byte not done quickly by fastcrc32 } JNIEXPORT jint JNICALL @@ -48,7 +169,8 @@ { Bytef *buf = (*env)->GetPrimitiveArrayCritical(env, b, 0); if (buf) { - crc = crc32(crc, buf + off, len); + crc = (jint) (useClmul ? fastcrc32(crc, buf + off, len) : + crc32(crc, buf + off, len)); (*env)->ReleasePrimitiveArrayCritical(env, b, buf, 0); } return crc; @@ -56,7 +178,8 @@ JNIEXPORT jint ZIP_CRC32(jint crc, const jbyte *buf, jint len) { - return crc32(crc, (Bytef*)buf, len); + return (jint) (useClmul ? fastcrc32(crc, (Bytef*)buf, len) : + crc32(crc, (Bytef*)buf, len)); } JNIEXPORT jint JNICALL @@ -65,7 +188,452 @@ { Bytef *buf = (Bytef *)jlong_to_ptr(address); if (buf) { - crc = crc32(crc, buf + off, len); + crc = (jint) (useClmul ? fastcrc32(crc, buf + off, len) : + crc32(crc, buf + off, len)); } return crc; } + +#if CAN_COMPILE_CLMUL +#ifndef NO_ASM + +/* set up the platform-specific glop surrounding the function body. 
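+   Each branch defines ASM_PREFIX/ASM_SUFFIX with the section, alignment and
+   symbol directives its assembler expects for the hand-coded "kernel" entry;
+   an unrecognized compiler falls back to the intrinsics path via NO_ASM.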
*/ +# ifdef __x86_64 +# ifdef __APPLE__ +# define ASM_PREFIX ".text\n\t.align 8\n\t.globl _kernel\n_kernel:\n\t" +# define ASM_SUFFIX "" +# elif defined(__GNUC__) +# define ASM_PREFIX ".text\n\t.align 16\n\t.globl kernel\n\t.type kernel,@function\nkernel:\n\t" +# define ASM_SUFFIX "" +# elif defined(__SUNPRO_C) +# define ASM_PREFIX ".section .text,\"ax\"\n\t.align 16, 0x90\n\t.globl kernel\n\t.type kernel,@function\nkernel:\n\t" +# define ASM_SUFFIX ".size kernel,.-kernel" +# else + /* Perhaps the mystery compiler can handle the intrinsics. */ +# define NO_ASM 1 +# endif + +# ifndef NO_ASM +__asm__( + ASM_PREFIX + " pushq %rbp\n\t" + " movq %rsp, %rbp\n\t" + " movl %edi, %eax\n\t" + " .byte 0xc5,0xf9,0x6f,0x06 # vmovdqa(%rsi), %xmm0\n\t" + " .byte 0xc4,0xe1,0xf9,0x7e,0xc7 # vmovd %xmm0, %rdi\n\t" + " xorq %rax, %rdi\n\t" + " .byte 0xc4,0xe3,0xf9,0x22,0xd7,0x00 # vpinsrq$0, %rdi, %xmm0, %xmm2\n\t" + " .byte 0xc5,0x79,0x6f,0x01 # vmovdqa(%rcx), %xmm8\n\t" + " .byte 0xc5,0x79,0x6f,0x49,0x10 # vmovdqa16(%rcx), %xmm9\n\t" + " movl $1, %eax\n\t" + " cmpl $4, %edx\n\t" + " jl 1f\n\t" + " .byte 0xc5,0xf9,0x6f,0x6e,0x10 # vmovdqa16(%rsi), %xmm5\n\t" + " .byte 0xc5,0xf9,0x6f,0x66,0x20 # vmovdqa32(%rsi), %xmm4\n\t" + " .byte 0xc5,0xf9,0x6f,0x5e,0x30 # vmovdqa48(%rsi), %xmm3\n\t" + " leal -3(%rdx), %edi\n\t" + " movl $4, %eax\n\t" + " cmpl $5, %edi\n\t" + " jl 2f\n\t" + " .byte 0xc5,0xf9,0x6f,0x71,0x20 # vmovdqa32(%rcx), %xmm6\n\t" + " leaq 112(%rsi), %rcx\n\t" + " movl $4, %eax\n\t" + " .align 4, 0x90\n" + "3: .byte 0xc4,0xe3,0x49,0x44,0xc2,0x00 # vpclmulqdq$0, %xmm2, %xmm6, %xmm0\n\t" + " .byte 0xc4,0xe3,0x49,0x44,0xcb,0x11 # vpclmulqdq$17, %xmm3, %xmm6, %xmm1\n\t" + " .byte 0xc4,0xe3,0x49,0x44,0xdb,0x00 # vpclmulqdq$0, %xmm3, %xmm6, %xmm3\n\t" + " .byte 0xc5,0xe1,0xef,0x19 # vpxor (%rcx), %xmm3, %xmm3\n\t" + " .byte 0xc4,0xe3,0x49,0x44,0xfd,0x00 # vpclmulqdq$0, %xmm5, %xmm6, %xmm7\n\t" + " .byte 0xc5,0xc1,0xef,0x79,0xe0 # vpxor -32(%rcx), %xmm7, %xmm7\n\t" + " .byte 0xc5,0xf1,0xef,0xdb # vpxor %xmm3, %xmm1, %xmm3\n\t" + " .byte 0xc4,0xe3,0x49,0x44,0xd2,0x11 # vpclmulqdq$17, %xmm2, %xmm6, %xmm2\n\t" + " .byte 0xc5,0xf9,0xef,0x41,0xd0 # vpxor -48(%rcx), %xmm0, %xmm0\n\t" + " .byte 0xc4,0xe3,0x49,0x44,0xcd,0x11 # vpclmulqdq$17, %xmm5, %xmm6, %xmm1\n\t" + " .byte 0xc4,0xe3,0x49,0x44,0xec,0x11 # vpclmulqdq$17, %xmm4, %xmm6, %xmm5\n\t" + " .byte 0xc4,0xe3,0x49,0x44,0xe4,0x00 # vpclmulqdq$0, %xmm4, %xmm6, %xmm4\n\t" + " .byte 0xc5,0xd9,0xef,0x61,0xf0 # vpxor -16(%rcx), %xmm4, %xmm4\n\t" + " .byte 0xc5,0xd1,0xef,0xe4 # vpxor %xmm4, %xmm5, %xmm4\n\t" + " .byte 0xc5,0xf1,0xef,0xef # vpxor %xmm7, %xmm1, %xmm5\n\t" + " .byte 0xc5,0xe9,0xef,0xd0 # vpxor %xmm0, %xmm2, %xmm2\n\t" + " addq $64, %rcx\n\t" + " addl $4, %eax\n\t" + " cmpl %edi, %eax\n\t" + " jl 3b\n" + "2: .byte 0xc4,0xe3,0x31,0x44,0xc2,0x11 # vpclmulqdq$17, %xmm2, %xmm9, %xmm0\n\t" + " .byte 0xc4,0xe3,0x31,0x44,0xca,0x00 # vpclmulqdq$0, %xmm2, %xmm9, %xmm1\n\t" + " .byte 0xc5,0xd1,0xef,0xc9 # vpxor %xmm1, %xmm5, %xmm1\n\t" + " .byte 0xc5,0xf1,0xef,0xc8 # vpxor %xmm0, %xmm1, %xmm1\n\t" + " .byte 0xc4,0xe3,0x31,0x44,0xc1,0x11 # vpclmulqdq$17, %xmm1, %xmm9, %xmm0\n\t" + " .byte 0xc4,0xe3,0x31,0x44,0xc9,0x00 # vpclmulqdq$0, %xmm1, %xmm9, %xmm1\n\t" + " .byte 0xc5,0xd9,0xef,0xc9 # vpxor %xmm1, %xmm4, %xmm1\n\t" + " .byte 0xc5,0xf1,0xef,0xc8 # vpxor %xmm0, %xmm1, %xmm1\n\t" + " .byte 0xc4,0xe3,0x31,0x44,0xc1,0x11 # vpclmulqdq$17, %xmm1, %xmm9, %xmm0\n\t" + " .byte 0xc4,0xe3,0x31,0x44,0xc9,0x00 # vpclmulqdq$0, %xmm1, %xmm9, %xmm1\n\t" + " .byte 
0xc5,0xe1,0xef,0xc9 # vpxor %xmm1, %xmm3, %xmm1\n\t" + " .byte 0xc5,0xf1,0xef,0xd0 # vpxor %xmm0, %xmm1, %xmm2\n" + "1: cmpl %edx, %eax\n\t" + " jge 4f\n\t" + " subl %eax, %edx\n\t" + " movslq %eax, %rax\n\t" + " shlq $4, %rax\n\t" + " addq %rax, %rsi\n\t" + " .align 4, 0x90\n" + "5: .byte 0xc4,0xe3,0x31,0x44,0xc2,0x11 # vpclmulqdq$17, %xmm2, %xmm9, %xmm0\n\t" + " .byte 0xc4,0xe3,0x31,0x44,0xca,0x00 # vpclmulqdq$0, %xmm2, %xmm9, %xmm1\n\t" + " .byte 0xc5,0xf1,0xef,0x0e # vpxor (%rsi), %xmm1, %xmm1\n\t" + " .byte 0xc5,0xf1,0xef,0xd0 # vpxor %xmm0, %xmm1, %xmm2\n\t" + " addq $16, %rsi\n\t" + " decl %edx\n\t" + " jne 5b\n" + "4: .byte 0xc4,0xe3,0x39,0x44,0xc2,0x01 # vpclmulqdq$1, %xmm2, %xmm8, %xmm0\n\t" + " .byte 0xc4,0xe1,0xf9,0x7e,0xc0 # vmovd %xmm0, %rax\n\t" + " .byte 0xc4,0xe3,0xf9,0x16,0xc1,0x01 # vpextrq$1, %xmm0, %rcx\n\t" + " shldq $32, %rax, %rcx\n\t" + " .byte 0xc5,0xb9,0xdb,0xc0 # vpand %xmm0, %xmm8, %xmm0\n\t" + " .byte 0xc4,0xe3,0x39,0x44,0xc0,0x01 # vpclmulqdq$1, %xmm0, %xmm8, %xmm0\n\t" + " .byte 0xc4,0xe1,0xf9,0x7e,0xc2 # vmovd %xmm0, %rdx\n\t" + " .byte 0xc4,0xe3,0xf9,0x16,0xd0,0x01 # vpextrq$1, %xmm2, %rax\n\t" + " xorq %rdx, %rax\n\t" + " xorq %rcx, %rax\n\t" + " popq %rbp\n\t" + " ret\n" + ASM_SUFFIX + ); +# endif +# elif defined(__i386) + +/* set up the platform-specific glop surrounding the function body. */ +# ifdef __APPLE__ +# define ASM_PREFIX ".text\n\t.align 16\n\t.globl _kernel\n_kernel:\n\t" +# define ASM_SUFFIX "" +# elif defined(__GNUC__) +# define ASM_PREFIX ".text\n\t.align 16\n\t.globl kernel\n\t.type kernel,@function\nkernel:\n\t" +# define ASM_SUFFIX "" +# elif defined(__SUNPRO_C) +# define ASM_PREFIX ".section .text,\"ax\"\n\t.align 16, 0x90\n\t.globl kernel\n\t.type kernel,@function\nkernel:\n\t" +# define ASM_SUFFIX ".size kernel,.-kernel" +# else + /* Perhaps the mystery compiler can handle the intrinsics. 
*/ +# define NO_ASM 1 +# endif + +# ifndef NO_ASM +__asm__( + ASM_PREFIX + " pushl %ebp\n\t" + " movl %esp, %ebp\n\t" + " pushl %edi\n\t" + " pushl %esi\n\t" + " movl 12(%ebp), %eax\n\t" + " .byte 0xc5,0xf9,0x28,0x00 # vmovapd(%eax), %xmm0\n\t" + " .byte 0xc5,0xf9,0x7e,0xc1 # vmovd %xmm0, %ecx\n\t" + " xorl 8(%ebp), %ecx\n\t" + " .byte 0xc4,0xe3,0x79,0x22,0xc9,0x00 # vpinsrd$0, %ecx, %xmm0, %xmm1\n\t" + " .byte 0xc4,0xe3,0x79,0x16,0xc1,0x01 # vpextrd$1, %xmm0, %ecx\n\t" + " .byte 0xc4,0xe3,0x71,0x22,0xc9,0x01 # vpinsrd$1, %ecx, %xmm1, %xmm1\n\t" + " movl 20(%ebp), %edi\n\t" + " .byte 0xc5,0xf9,0x6f,0x07 # vmovdqa(%edi), %xmm0\n\t" + " .byte 0xc5,0xf9,0x6f,0x57,0x10 # vmovdqa16(%edi), %xmm2\n\t" + " movl $1, %edx\n\t" + " movl 16(%ebp), %ecx\n\t" + " cmpl $4, %ecx\n\t" + " jl 1f\n\t" + " .byte 0xc5,0xf9,0x6f,0x58,0x30 # vmovdqa48(%eax), %xmm3\n\t" + " .byte 0xc5,0xf9,0x6f,0x68,0x10 # vmovdqa16(%eax), %xmm5\n\t" + " .byte 0xc5,0xf9,0x6f,0x60,0x20 # vmovdqa32(%eax), %xmm4\n\t" + " leal -3(%ecx), %esi\n\t" + " movl $4, %edx\n\t" + " cmpl $5, %esi\n\t" + " jl 2f\n\t" + " .byte 0xc5,0xf9,0x6f,0x77,0x20 # vmovdqa32(%edi), %xmm6\n\t" + " leal 112(%eax), %edi\n\t" + " movl $4, %edx\n\t" + " .align 4, 0x90\n" + "3: .byte 0xc4,0xe3,0x49,0x44,0xfb,0x11 # vpclmulqdq$17, %xmm3, %xmm6, %xmm7\n\t" + " .byte 0xc4,0xe3,0x49,0x44,0xdb,0x00 # vpclmulqdq$0, %xmm3, %xmm6, %xmm3\n\t" + " .byte 0xc5,0xe1,0xef,0x1f # vpxor (%edi), %xmm3, %xmm3\n\t" + " .byte 0xc5,0xc1,0xef,0xdb # vpxor %xmm3, %xmm7, %xmm3\n\t" + " .byte 0xc4,0xe3,0x49,0x44,0xfc,0x11 # vpclmulqdq$17, %xmm4, %xmm6, %xmm7\n\t" + " .byte 0xc4,0xe3,0x49,0x44,0xe4,0x00 # vpclmulqdq$0, %xmm4, %xmm6, %xmm4\n\t" + " .byte 0xc5,0xd9,0xef,0x67,0xf0 # vpxor -16(%edi), %xmm4, %xmm4\n\t" + " .byte 0xc5,0xc1,0xef,0xe4 # vpxor %xmm4, %xmm7, %xmm4\n\t" + " .byte 0xc4,0xe3,0x49,0x44,0xfd,0x11 # vpclmulqdq$17, %xmm5, %xmm6, %xmm7\n\t" + " .byte 0xc4,0xe3,0x49,0x44,0xed,0x00 # vpclmulqdq$0, %xmm5, %xmm6, %xmm5\n\t" + " .byte 0xc5,0xd1,0xef,0x6f,0xe0 # vpxor -32(%edi), %xmm5, %xmm5\n\t" + " .byte 0xc5,0xc1,0xef,0xed # vpxor %xmm5, %xmm7, %xmm5\n\t" + " .byte 0xc4,0xe3,0x49,0x44,0xf9,0x11 # vpclmulqdq$17, %xmm1, %xmm6, %xmm7\n\t" + " .byte 0xc4,0xe3,0x49,0x44,0xc9,0x00 # vpclmulqdq$0, %xmm1, %xmm6, %xmm1\n\t" + " .byte 0xc5,0xf1,0xef,0x4f,0xd0 # vpxor -48(%edi), %xmm1, %xmm1\n\t" + " .byte 0xc5,0xc1,0xef,0xc9 # vpxor %xmm1, %xmm7, %xmm1\n\t" + " addl $64, %edi\n\t" + " addl $4, %edx\n\t" + " cmpl %esi, %edx\n\t" + " jl 3b\n" + "2: .byte 0xc4,0xe3,0x69,0x44,0xf1,0x11 # vpclmulqdq$17, %xmm1, %xmm2, %xmm6\n\t" + " .byte 0xc4,0xe3,0x69,0x44,0xc9,0x00 # vpclmulqdq$0, %xmm1, %xmm2, %xmm1\n\t" + " .byte 0xc5,0xd1,0xef,0xc9 # vpxor %xmm1, %xmm5, %xmm1\n\t" + " .byte 0xc5,0xf1,0xef,0xee # vpxor %xmm6, %xmm1, %xmm5\n\t" + " .byte 0xc4,0xe3,0x69,0x44,0xcd,0x11 # vpclmulqdq$17, %xmm5, %xmm2, %xmm1\n\t" + " .byte 0xc4,0xe3,0x69,0x44,0xed,0x00 # vpclmulqdq$0, %xmm5, %xmm2, %xmm5\n\t" + " .byte 0xc5,0xd9,0xef,0xe5 # vpxor %xmm5, %xmm4, %xmm4\n\t" + " .byte 0xc5,0xd9,0xef,0xe1 # vpxor %xmm1, %xmm4, %xmm4\n\t" + " .byte 0xc4,0xe3,0x69,0x44,0xcc,0x11 # vpclmulqdq$17, %xmm4, %xmm2, %xmm1\n\t" + " .byte 0xc4,0xe3,0x69,0x44,0xe4,0x00 # vpclmulqdq$0, %xmm4, %xmm2, %xmm4\n\t" + " .byte 0xc5,0xe1,0xef,0xdc # vpxor %xmm4, %xmm3, %xmm3\n\t" + " .byte 0xc5,0xe1,0xef,0xc9 # vpxor %xmm1, %xmm3, %xmm1\n" + "1: cmpl %ecx, %edx\n\t" + " jge 4f\n\t" + " subl %edx, %ecx\n\t" + " shll $4, %edx\n\t" + " addl %edx, %eax\n\t" + " .align 4, 0x90\n" + "5: .byte 0xc4,0xe3,0x69,0x44,0xd9,0x11 # vpclmulqdq$17, 
%xmm1, %xmm2, %xmm3\n\t" + " .byte 0xc4,0xe3,0x69,0x44,0xc9,0x00 # vpclmulqdq$0, %xmm1, %xmm2, %xmm1\n\t" + " .byte 0xc5,0xf1,0xef,0x08 # vpxor (%eax), %xmm1, %xmm1\n\t" + " .byte 0xc5,0xf1,0xef,0xcb # vpxor %xmm3, %xmm1, %xmm1\n\t" + " addl $16, %eax\n\t" + " decl %ecx\n\t" + " jne 5b\n" + "4: .byte 0xc4,0xe3,0x79,0x44,0xd1,0x01 # vpclmulqdq$1, %xmm1, %xmm0, %xmm2\n\t" + " .byte 0xc5,0xf9,0xdb,0xda # vpand %xmm2, %xmm0, %xmm3\n\t" + " .byte 0xc4,0xe3,0x79,0x44,0xc3,0x01 # vpclmulqdq$1, %xmm3, %xmm0, %xmm0\n\t" + " .byte 0xc5,0xf9,0x7e,0xc0 # vmovd %xmm0, %eax\n\t" + " .byte 0xc4,0xe3,0x79,0x16,0xc9,0x02 # vpextrd$2, %xmm1, %ecx\n\t" + " xorl %eax, %ecx\n\t" + " .byte 0xc4,0xe3,0x79,0x16,0xd0,0x01 # vpextrd$1, %xmm2, %eax\n\t" + " xorl %ecx, %eax\n\t" + " .byte 0xc4,0xe3,0x79,0x16,0xc2,0x01 # vpextrd$1, %xmm0, %edx\n\t" + " .byte 0xc4,0xe3,0x79,0x16,0xc9,0x03 # vpextrd$3, %xmm1, %ecx\n\t" + " xorl %edx, %ecx\n\t" + " .byte 0xc4,0xe3,0x79,0x16,0xd2,0x02 # vpextrd$2, %xmm2, %edx\n\t" + " xorl %ecx, %edx\n\t" + " popl %esi\n\t" + " popl %edi\n\t" + " popl %ebp\n\t" + " ret\n" + ASM_SUFFIX + ); +# endif +# else /* architecture type */ +/* Not intel, not that the C intrinsics will compile anywhere else, + * but it will be a slightly better error message. + */ +# define NO_ASM 1 +# endif +#endif /* NO_ASM */ + +#ifndef NO_ASM +/* Declaration for use below. */ +uint64_t kernel(uint32_t c, unsigned char * buf, int len_128bit, struct crc_by128_K * K); +#else +#pragma message("Compiling 'kernel' from C source with intrinsics") +#include +#include + +union u { + __m128i v; + struct { + uint64_t lo; + uint64_t hi; + }; +}; + +/** + * Assume c is existing crc, + * buf is 16-byte-aligned, + * len is a multiple of 16 greater than zero. + */ +uint64_t kernel(uint32_t c, unsigned char * buf, int len_128bit, + struct crc_by128_K * K) { + + __m128i * b = (__m128i *) buf; + int i = 0; + + /* 128 bit constants and variables. */ + __m128i K_544_480, K_160_96, K_M_64, + x0, x1, x2, x3, + x0a, x1a, x2a, x3a, + x0b, x1b, x2b, x3b; + + /* Use these to move data between xmm registers and "normal" registers. */ + union u ut0, ut1, ut2, ut3; + + K_544_480 = * (__m128i *) & (K -> xtt544); + K_160_96 = * (__m128i *) & (K -> xtt160); + K_M_64 = * (__m128i *) & (K -> mask); + + /* Incorporate existing CRC into first item */ + ut0.v = b[0]; + ut0.lo ^= c; + x0 = ut0.v; + + if (len_128bit >= 4) { + /* Written as a slightly pipelined loop. */ + + x1 = b[1]; + x2 = b[2]; + x3 = b[3]; + + /* Iterate once if len_128bit is between 8 and 11 + * 4 < 8-3 < 11 - 3 + * 8 !< 11 - 3 < 12 - 3. + * + * 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 + * + */ + for (i = 4; i < len_128bit - 3 ; i+= 4) { + /* Each iteration of this loop folds the 512 bits of polynomial + * in x0-x3 with the data in b[i]..b[i+3]. 
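+              * The low and high halves of each x_j are folded forward with the
+              * x**544 and x**480 constants (clmul selectors 0x00 and 0x11) and
+              * xored into the 128-bit block just loaded from b[i+j].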
+ */ + x0a = b[i]; + x1a = b[i+1]; + x2a = b[i+2]; + x3a = b[i+3]; + + x0b = _mm_clmulepi64_si128(K_544_480, x0, 0x00); + x0 = _mm_clmulepi64_si128(K_544_480, x0, 0x11); + x1b = _mm_clmulepi64_si128(K_544_480, x1, 0x00); + x1 = _mm_clmulepi64_si128(K_544_480, x1, 0x11); + + x2b = _mm_clmulepi64_si128(K_544_480, x2, 0x00); + x2 = _mm_clmulepi64_si128(K_544_480, x2, 0x11); + x3b = _mm_clmulepi64_si128(K_544_480, x3, 0x00); + x3 = _mm_clmulepi64_si128(K_544_480, x3, 0x11); + + // x0 ^= x0a ^ x0b; + x0 = _mm_xor_si128(x0, x0a); + x0 = _mm_xor_si128(x0, x0b); + // x1 ^= x1a ^ x1b; + x1 = _mm_xor_si128(x1, x1a); + x1 = _mm_xor_si128(x1, x1b); + // x2 ^= x2a ^ x2b; + x2 = _mm_xor_si128(x2, x2a); + x2 = _mm_xor_si128(x2, x2b); + // x3 ^= x3a ^ x3b; + x3 = _mm_xor_si128(x3, x3a); + x3 = _mm_xor_si128(x3, x3b); + } + /* x0 - x3 contains 4 x 128 bits of accumulated result. + * 0-3 hexads potentially remain in [i,len_128bit) entries. + * Assume trailing bytes beyond that are handled by our caller. + */ + x0a = _mm_clmulepi64_si128(K_160_96, x0, 0x00); + x0b = _mm_clmulepi64_si128(K_160_96, x0, 0x11); + x1 = _mm_xor_si128(x1, x0a); + x1 = _mm_xor_si128(x1, x0b); + x0a = _mm_clmulepi64_si128(K_160_96, x1, 0x00); + x0b = _mm_clmulepi64_si128(K_160_96, x1, 0x11); + x2 = _mm_xor_si128(x2, x0a); + x2 = _mm_xor_si128(x2, x0b); + x0a = _mm_clmulepi64_si128(K_160_96, x2, 0x00); + x0b = _mm_clmulepi64_si128(K_160_96, x2, 0x11); + x3 = _mm_xor_si128(x3, x0a); + x3 = _mm_xor_si128(x3, x0b); + } else { + /* Loaded 128 bits already into x0. + */ + x3 = x0; + i = 1; + } + + /* x3 is now 128-bit result. + * Fold 0-3 128-bit chunks into x3. + */ + for (; i < len_128bit; i++) { + x0 = b[i]; // data to fold + // fold x3 down by 128 to align with data. + x0a = _mm_clmulepi64_si128(K_160_96, x3, 0x00); + x0b = _mm_clmulepi64_si128(K_160_96, x3, 0x11); + x3 = _mm_xor_si128(x0, x0a); + x3 = _mm_xor_si128(x3, x0b); + // x3 is now aligned with data we just loaded. + } + + /* + * No more 128bits remain. + * Fold x3 down into 32 bits. + */ + { + ut0.v = x3; + uint64_t w; + uint64_t y = ut0.hi; // 64 low-order terms of polynomial into y. + + /* polynomial term order: + * high -> low + * bit number order + * 0 -> 127 + * + * input, from which y was just extracted. + * w0 w1 y0 y1 + * w0:w1 * x64 yields 96 bits. + * p0:p1:p2:__ (aligned wrong, store to extract p1 and p2) + * p0:p1:__:__ & ff:00:__:__ (mask to get rid of p1) + * p0:00:__:__ + * p0:00 * x64 (times x64 yields 64 bits) + * r0:r1 store and xor. + */ + + x0 = _mm_clmulepi64_si128(K_M_64, x3, 0x01); + ut1.v = x0; + w = (ut1.lo >> 32) + (ut1.hi << 32); // extract low-poly 64 bits. + x0 = _mm_and_si128(K_M_64, x0); // mask away what we just extracted.. + x0 = _mm_clmulepi64_si128(K_M_64, x0, 0x01); + w ^= y; + ut2.v = x0; + w ^= ut2.lo; + + return w; + } +} +#endif /* NO_ASM */ + +uint32_t fastcrc32(jint crc, Bytef * buf, jint len) { + const unsigned long FAR * timesXtoThe32 = crc_table; + intptr_t ibuf = (intptr_t) buf; + int log_align = 4; + int align = 1 << log_align; + int mask = align - 1; + int islop = (align - ibuf) & mask; + uint32_t c = ~crc; + int i = 0; + + if (len - islop >= align) { + /* Handle bytes preceding 16-byte alignment. */ + for (i = 0; i < islop; i++ ) { + uint32_t x0 = buf[i]; + x0 = timesXtoThe32[(x0 ^ c) & 0xFF]; + c = x0 ^ (c >> 8); + } + buf += i; + len -= i; + + jint len_128bit = len >> log_align; + + if (len_128bit > 0) { + uint64_t w = kernel(c, buf, len_128bit, K_struct); + /* + * 8 8-bit folds to compute 32-bit CRC. 
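+          * The kernel leaves 64 bits of unreduced polynomial in w; each step
+          * folds the low byte of w through the table, so eight steps reduce it
+          * to the 32-bit CRC register value (complemented on return below).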
+ */ + w = timesXtoThe32[w & 0xFF] ^ (w >> 8); + w = timesXtoThe32[w & 0xFF] ^ (w >> 8); + w = timesXtoThe32[w & 0xFF] ^ (w >> 8); + w = timesXtoThe32[w & 0xFF] ^ (w >> 8); + w = timesXtoThe32[w & 0xFF] ^ (w >> 8); + w = timesXtoThe32[w & 0xFF] ^ (w >> 8); + w = timesXtoThe32[w & 0xFF] ^ (w >> 8); + w = timesXtoThe32[w & 0xFF] ^ (w >> 8); + c = (uint32_t) w; + i = len_128bit << log_align; + } else { + i = 0; + } + } + /* Handle short CRC and tail of long CRC */ + for (; i < len; i++) { + uint32_t x0 = buf[i]; + x0 = timesXtoThe32[(x0 ^ c) & 0xFF]; + c = x0 ^ (c >> 8); + } + return ~c; +} +#endif --- old/test/java/util/zip/TimeChecksum.java 2013-05-16 08:24:43.000000000 -0400 +++ new/test/java/util/zip/TimeChecksum.java 2013-05-16 08:24:43.000000000 -0400 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2013, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -120,6 +120,7 @@ System.out.println("---------- Adler32 ----------"); System.out.print("Warmup..."); time(adler32, data, iters, len); + time(adler32, data, 2*iters, 16); // warmup short case, too time(adler32, ByteBuffer.wrap(data), iters); buf = ByteBuffer.allocateDirect(len); buf.put(data, 0, len); @@ -162,6 +163,7 @@ System.out.println("\n---------- CRC32 ----------"); System.out.print("Warmup..."); time(crc32, data, iters, len); + time(crc32, data, 2*iters, 16); // warmup short case, too time(crc32, ByteBuffer.wrap(data), iters); buf = ByteBuffer.allocateDirect(len); buf.put(data, 0, len); --- /dev/null 2013-05-16 08:24:44.000000000 -0400 +++ new/test/java/util/zip/CRCandAdlerTest.java 2013-05-16 08:24:44.000000000 -0400 @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + @test + @bug 7088419 + @summary Ensure that the byte-at-a-time, byte array, and DirectByteBuffer + methods of CRC32 and Adler32 are consistent across a range of inputs. 
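+  Each (offset, length) case is checksummed four ways (bulk array update,
+  byte-at-a-time, byte-at-a-time with a getValue() flush, and via a direct
+  ByteBuffer) and the results are cross-checked.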
+ */ + +import java.nio.ByteBuffer; +import java.util.zip.Adler32; +import java.util.zip.CRC32; +import java.util.zip.Checksum; + +public class CRCandAdlerTest { + + public static void main(String[] args) throws Exception { + + byte[] b = initializedBytes(4096 * 4096); + + { + CRC32 crc1 = new CRC32(); + CRC32 crc2 = new CRC32(); + CRC32 crc3 = new CRC32(); + CRC32 crc4 = new CRC32(); + + crc1.update(b, 0, b.length); + updateSerial(crc2, b, 0, b.length); + updateDirect(crc3, b, 0, b.length); + updateSerialSlow(crc4, b, 0, b.length); + + check(crc1, crc2); + check(crc3, crc4); + check(crc1, crc3); + + crc1.update(17); + crc2.update(17); + crc3.update(17); + crc4.update(17); + + crc1.update(b, 0, b.length); + updateSerial(crc2, b, 0, b.length); + updateDirect(crc3, b, 0, b.length); + updateSerialSlow(crc4, b, 0, b.length); + + check(crc1, crc2); + check(crc3, crc4); + check(crc1, crc3); + + report("finished huge crc", crc1, crc2, crc3, crc4); + + for (int i = 0; i < 256; i++) { + for (int j = 0; j < 256; j += 1) { + crc1.update(b, i, j); + updateSerial(crc2, b, i, j); + updateDirect(crc3, b, i, j); + updateSerialSlow(crc4, b, i, j); + + check(crc1, crc2); + check(crc3, crc4); + check(crc1, crc3); + + } + } + + report("finished small survey crc", crc1, crc2, crc3, crc4); + + for (int i = 0; i < 128; i+= 5) { + for (int j = 1024 * 1024; j < 1024 * 1024 + 128; j += 7) { + crc1.update(b, i, j); + updateSerial(crc2, b, i, j); + updateDirect(crc3, b, i, j); + updateSerialSlow(crc4, b, i, j); + + check(crc1, crc2); + check(crc3, crc4); + check(crc1, crc3); + } + } + + report("finished large survey crc", crc1, crc2, crc3, crc4); + } + + { + Adler32 crc1 = new Adler32(); + Adler32 crc2 = new Adler32(); + Adler32 crc3 = new Adler32(); + Adler32 crc4 = new Adler32(); + // Test long CRC. 
+ + crc1.update(b, 0, b.length); + updateSerial(crc2, b, 0, b.length); + updateDirect(crc3, b, 0, b.length); + updateSerialSlow(crc4, b, 0, b.length); + + check(crc1, crc2); + check(crc3, crc4); + check(crc1, crc3); + + crc1.update(17); + crc2.update(17); + crc3.update(17); + crc4.update(17); + + crc1.update(b, 0, b.length); + updateSerial(crc2, b, 0, b.length); + updateDirect(crc3, b, 0, b.length); + updateSerialSlow(crc4, b, 0, b.length); + + check(crc1, crc2); + check(crc3, crc4); + check(crc1, crc3); + + report("finished huge adler32", crc1, crc2, crc3, crc4); + + for (int i = 0; i < 256; i++) { + for (int j = 0; j < 256; j += 1) { + crc1.update(b, i, j); + updateSerial(crc2, b, i, j); + updateDirect(crc3, b, i, j); + updateSerialSlow(crc4, b, i, j); + + check(crc1, crc2); + check(crc3, crc4); + check(crc1, crc3); + + } + } + + report("finished small survey adler32", crc1, crc2, crc3, crc4); + + for (int i = 0; i < 128; i+= 5) { + for (int j = 1024 * 1024; j < 1024 * 1024 + 128; j += 7) { + crc1.update(b, i, j); + updateSerial(crc2, b, i, j); + updateDirect(crc3, b, i, j); + updateSerialSlow(crc4, b, i, j); + + check(crc1, crc2); + check(crc3, crc4); + check(crc1, crc3); + } + } + + report("finished large survey adler32", crc1, crc2, crc3, crc4); + } + } + + + private static void report(String s, Checksum crc1, Checksum crc2, + Checksum crc3, Checksum crc4) { + System.out.println(s + ", crc1 = " + crc1.getValue() + + ", crc2 = " + crc2.getValue()+ + ", crc3 = " + crc3.getValue()+ + ", crc4 = " + crc4.getValue()); + } + + private static void check(Checksum crc1, Checksum crc2) throws Exception { + if (crc1.getValue() != crc2.getValue()) { + String s = "value 1 = " + crc1.getValue() + ", value 2 = " + crc2.getValue(); + System.err.println(s); + throw new Exception(s); + } + } + + private static byte[] initializedBytes(int M) { + byte[] bytes = new byte[M]; + for (int i = 0; i < bytes.length; i++) { + bytes[i] = (byte) i; + } + return bytes; + } + + private static void updateSerial(Checksum crc, byte[] b, int start, int length) { + for (int i = 0; i < length; i++) + crc.update(b[i+start]); + } + + private static void updateSerialSlow(Checksum crc, byte[] b, int start, int length) { + for (int i = 0; i < length; i++) + crc.update(b[i+start]); + crc.getValue(); + } + + private static void updateDirect(CRC32 crc3, byte[] b, int start, int length) { + ByteBuffer buf = ByteBuffer.allocateDirect(length); + buf.put(b, start, length); + buf.flip(); + crc3.update(buf); + } + + private static void updateDirect(Adler32 crc3, byte[] b, int start, int length) { + ByteBuffer buf = ByteBuffer.allocateDirect(length); + buf.put(b, start, length); + buf.flip(); + crc3.update(buf); + } + +}
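For reference, here is a minimal standalone sketch (not part of the patch) that
checks the split/combine identity the Adler32 changes rely on, using only the
public java.util.zip.Adler32 API. The class name, seed, and split point are
illustrative.

import java.util.Random;
import java.util.zip.Adler32;

public class AdlerCombineDemo {
    private static final int MOD = 65521;

    // A(U||V) = A(U) + A(V) - 1 and B(U||V) = B(U) + B(V) + |V| * (A(U) - 1),
    // all mod 65521, as quoted in the Adler32.java comment above.
    static long combine(long adlerU, long adlerV, long lenV) {
        long aU = adlerU & 0xffff, bU = adlerU >>> 16;
        long aV = adlerV & 0xffff, bV = adlerV >>> 16;
        long a = (aU + aV + MOD - 1) % MOD;
        long b = (bU + bV + (lenV % MOD) * ((aU + MOD - 1) % MOD)) % MOD;
        return (b << 16) | a;
    }

    public static void main(String[] args) {
        byte[] data = new byte[100_000];
        new Random(42).nextBytes(data);
        int split = 37_813;                               // arbitrary split point

        Adler32 whole = new Adler32();                    // checksum of U || V in one pass
        whole.update(data, 0, data.length);

        Adler32 left = new Adler32();                     // checksum of the prefix U
        left.update(data, 0, split);

        Adler32 right = new Adler32();                    // checksum of the suffix V, from the initial value 1
        right.update(data, split, data.length - split);

        long combined = combine(left.getValue(), right.getValue(), data.length - split);
        System.out.println(whole.getValue() == combined); // expected: true
    }
}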