--- old/makefiles/mapfiles/libzip/mapfile-vers 2013-05-16 08:24:39.000000000 -0400 +++ new/makefiles/mapfiles/libzip/mapfile-vers 2013-05-16 08:24:39.000000000 -0400 @@ -31,6 +31,7 @@ Java_java_util_zip_Adler32_update; Java_java_util_zip_Adler32_updateBytes; Java_java_util_zip_Adler32_updateByteBuffer; + Java_java_util_zip_CRC32_init; Java_java_util_zip_CRC32_update; Java_java_util_zip_CRC32_updateBytes; Java_java_util_zip_CRC32_updateByteBuffer; --- old/src/share/classes/java/util/zip/Adler32.java 2013-05-16 08:24:40.000000000 -0400 +++ new/src/share/classes/java/util/zip/Adler32.java 2013-05-16 08:24:40.000000000 -0400 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1996, 2011, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2013, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -27,6 +27,7 @@ import java.nio.ByteBuffer; import sun.nio.ch.DirectBuffer; +import java.util.concurrent.RecursiveTask; /** * A class that can be used to compute the Adler-32 checksum of a data @@ -41,8 +42,46 @@ */ public class Adler32 implements Checksum { + /* + * This is a reformulation of the Adler32 calculation that permits recursive + * subdivision of the problem, thus allowing both parallelism and faster calculation + * byte-at-a-time checksums. + * + * The Adler calculation is regarded as + * taking an input text T of length N = |T|, + * and computing two quantities, A(T) and B(T), + * where A(T) = 1 + sum_{0 <= i < |T|}(T_i) + * and B(T) = |T| + sum_{0 <= i < |T|}((|T| - i) * T_i), + * both modulo 65521. + * + * However, with sufficient algebraic manipulation, one can derive + * that A(U||V) = A(U) + A(V) - 1 + * and B(U||V) = B(U) + B(V) + |V| (A(U) - 1). + */ + + /** + * The modulo operation can be deferred for MAX_SLOP bytes of input, permitting + * faster byte-by-byte Adler computations. 1024 is plenty conservative. + */ + private final static int MAX_SLOP = 1024; + + /** + * For inputs smaller than SERIAL_BELOW fork-join parallelism might + * not be profitable. + */ + private final static int SERIAL_BELOW = 1024 * 1024; + + /** + * For inputs smaller than JAVA_ADLER_BELOW JNI overheads make it faster + * to compute on the Java side. (This may change as overheads and compiler + * quality change). + */ + private final static int JAVA_ADLER_BELOW = 32; private int adler = 1; + private int aa = 1; + private int bb = 0; + private int slop = 0; /** * Creates a new Adler32 object. @@ -57,7 +96,13 @@ * @param b the byte to update the checksum with */ public void update(int b) { - adler = update(adler, b); + int la = aa + (b & 0xFF); + bb = bb + la; + aa = la; + slop++; + if (slop == MAX_SLOP) { + getValueI(); + } } /** @@ -70,7 +115,12 @@ if (off < 0 || len < 0 || off > b.length - len) { throw new ArrayIndexOutOfBoundsException(); } - adler = updateBytes(adler, b, off, len); + if (len < JAVA_ADLER_BELOW) { + for (int i = 0; i < len; i++) + update(b[i+off]); + } else { + setValue(updateBytesFJ(getValueI(), b, off, len)); + } } /** @@ -79,10 +129,14 @@ * @param b the byte array to update the checksum with */ public void update(byte[] b) { - adler = updateBytes(adler, b, 0, b.length); + if (b.length < JAVA_ADLER_BELOW) { + for (int i = 0; i < b.length; i++) + update(b[i]); + } else { + setValue(updateBytesFJ(getValueI(), b, 0, b.length)); + } } - /** * Updates the checksum with the bytes from the specified buffer. 
* @@ -104,13 +158,13 @@ if (rem <= 0) return; if (buffer instanceof DirectBuffer) { - adler = updateByteBuffer(adler, ((DirectBuffer)buffer).address(), pos, rem); + setValue(updateByteBufferFJ(getValueI(), ((DirectBuffer)buffer).address(), pos, rem)); } else if (buffer.hasArray()) { - adler = updateBytes(adler, buffer.array(), pos + buffer.arrayOffset(), rem); + setValue(updateBytesFJ(getValueI(), buffer.array(), pos + buffer.arrayOffset(), rem)); } else { byte[] b = new byte[rem]; buffer.get(b); - adler = updateBytes(adler, b, 0, b.length); + setValue(updateBytesFJ(getValueI(), b, 0, b.length)); } buffer.position(limit); } @@ -119,19 +173,133 @@ * Resets the checksum to initial value. */ public void reset() { + aa = 1; + bb = 0; adler = 1; + slop = 0; } /** * Returns the checksum value. */ public long getValue() { - return (long)adler & 0xffffffffL; + return getValueI() & 0xffffffffL; + } + + private int getValueI() { + if (slop > 0) { + aa = aa % 65521; + bb = bb % 65521; + adler = (bb << 16) + aa; + slop = 0; + } + return adler; + } + + private void setValue(int newValue) { + aa = newValue & 0xffff; + bb = newValue >>> 16; + adler = newValue; + slop = 0; } private native static int update(int adler, int b); private native static int updateBytes(int adler, byte[] b, int off, int len); + private native static int updateByteBuffer(int adler, long addr, int off, int len); + + private static int updateBytesFJ(int adler, byte[] ba, int start, int length) { + if (length < SERIAL_BELOW) { + return updateBytes(adler, ba, start, length); + } + AdlerTask w = new AdlerTask(adler, ba, start, length); + w.invoke(); + return(w.join()); + } + + private static int updateByteBufferFJ(int adler, long addr, int start, int length) { + if (length < SERIAL_BELOW) { + return updateByteBuffer(adler, addr, start, length); + } + AdlerBufferTask w = new AdlerBufferTask(adler, addr, start, length); + w.invoke(); + return(w.join()); + } + + static int combineAdlers(int prev_adler, int next_adler, int length) { + /* that A(U||V) = A(U) + A(V) - 1 + * and B(U||V) = B(U) + B(V) + |V| (A(U) - 1). 
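+         * (Derivation sketch: appending V leaves the A sums additive apart from
+         *  the duplicated leading 1, and gives every byte of U an extra weight
+         *  of |V| in B, which is the source of the |V| * (A(U) - 1) term.)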
+ */ + if (prev_adler == 1) { + // Appending to initial checksum + return next_adler; + } else { + int after_a = next_adler & 0xffff; + int after_b = next_adler >>> 16; + + int prev_a = prev_adler & 0xffff; + int prev_b = prev_adler >>> 16; + + long partial = (long) length * (prev_a + 65520) % 65521; + prev_b = (prev_b + after_b + (int) partial) % 65521; + prev_a = (prev_a + after_a + 65520) % 65521; + return ((prev_b << 16) + prev_a); + } + } + + static class AdlerTask extends RecursiveTask { + final int adler; + final byte[] ba; + final int start; + final int length; + AdlerTask(int adler, byte[] ba, int start, int length) { + this.ba = ba; + this.start = start; + this.length = length; + this.adler = adler; + } + + @Override + protected Integer compute() { + if (length < SERIAL_BELOW) { + return updateBytes(adler, ba, start, length); + } else { + int half = length/2; + AdlerTask task2 = new AdlerTask(1, ba, start + half, length - half); + task2.fork(); + AdlerTask task1 = new AdlerTask(adler, ba, start, half); + int result1 = task1.compute(); + return combineAdlers(result1, task2.join(), length - half); + } + } + } + + static class AdlerBufferTask extends RecursiveTask { + final int adler; + final long addr; + final int start; + final int length; + AdlerBufferTask(int adler, long addr, int start, int length) { + this.addr = addr; + this.start = start; + this.length = length; + this.adler = adler; + } + + @Override + protected Integer compute() { + if (length < SERIAL_BELOW) { + return updateByteBuffer(adler, addr, start, length); + } else { + int half = length/2; + AdlerBufferTask task2 = new AdlerBufferTask(1, addr, start + half, length - half); + task2.fork(); + AdlerBufferTask task1 = new AdlerBufferTask(adler, addr, start, half); + int result1 = task1.compute(); + return combineAdlers(result1, task2.join(), length - half); + } + } + } } --- old/src/share/classes/java/util/zip/CRC32.java 2013-05-16 08:24:41.000000000 -0400 +++ new/src/share/classes/java/util/zip/CRC32.java 2013-05-16 08:24:41.000000000 -0400 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1996, 2011, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2013, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -25,8 +25,12 @@ package java.util.zip; -import java.nio.ByteBuffer; import sun.nio.ch.DirectBuffer; +import java.lang.reflect.Field; +import java.nio.ByteBuffer; +import java.util.concurrent.RecursiveTask; +import java.util.concurrent.ForkJoinPool; +import sun.misc.Unsafe; /** * A class that can be used to compute the CRC-32 of a data stream. 
@@ -55,7 +59,10 @@ * @param b the byte to update the checksum with */ public void update(int b) { - crc = update(crc, b); + int c = ~ crc; + b = timesXtoThe32[(b ^ c) & 0xFF]; + b = b ^ (c >>> 8); + crc = ~b; } /** @@ -68,7 +75,12 @@ if (off < 0 || len < 0 || off > b.length - len) { throw new ArrayIndexOutOfBoundsException(); } - crc = updateBytes(crc, b, off, len); + + if (len < javaCRCIfSmallerThan) { + crc = updateBytesSimple(crc, b, off, len); + } else { + crc = updateBytesFJ(crc, b, off, len); + } } /** @@ -77,7 +89,7 @@ * @param b the array of bytes to update the checksum with */ public void update(byte[] b) { - crc = updateBytes(crc, b, 0, b.length); + crc = updateBytesFJ(crc, b, 0, b.length); } /** @@ -101,13 +113,13 @@ if (rem <= 0) return; if (buffer instanceof DirectBuffer) { - crc = updateByteBuffer(crc, ((DirectBuffer)buffer).address(), pos, rem); + crc = updateByteBufferFJ(crc, ((DirectBuffer)buffer).address(), pos, rem); } else if (buffer.hasArray()) { - crc = updateBytes(crc, buffer.array(), pos + buffer.arrayOffset(), rem); + crc = updateBytesFJ(crc, buffer.array(), pos + buffer.arrayOffset(), rem); } else { byte[] b = new byte[rem]; buffer.get(b); - crc = updateBytes(crc, b, 0, b.length); + crc = updateBytesFJ(crc, b, 0, b.length); } buffer.position(limit); } @@ -131,4 +143,267 @@ private native static int updateByteBuffer(int adler, long addr, int off, int len); + + + private static int updateBytesSimple(int crc, byte[] b, int off, int len) { + int[] a = timesXtoThe32; + if (a.length < 256) + throw new ArrayIndexOutOfBoundsException(); + int c = ~crc; + for (int i = 0; i < len; i++ ) { + int x0 = b[i + off]; + x0 = a[(x0 ^ c) & 0xFF]; + c = x0 ^ (c >>> 8); + } + return ~c; + } + + private native static boolean init(int[] timesXtoThe32, boolean try_use_clmul); + + /** + * timesXtoThe32[a] = rep(poly(a)*x**32) + */ + static int[] timesXtoThe32; + + /** + * powersByLog[i] = rep(x**(2**i)) + */ + static final int[] powersByLog = new int[32]; + + static final int LOG_PB8_LEN = 8; + + /** + * powersBy8[i] = rep(x**(8*i)) + */ + static final int[] powersBy8 = new int[1 << LOG_PB8_LEN]; + + static final int X_to_the_1 = 0x40000000; + static final int X_to_the_0 = 0x80000000; + + /** + * Indicates if the clmul instruction is enabled in the native + * code; this changes the estimated cost of computing a CRC. + */ + private static boolean clmulEnabled; + + /** + * Helpful for deciding whether to use workstealing or not. + */ + private static int fjParallelism = ForkJoinPool.getCommonPoolParallelism(); + + /** + * Estimated task size below which the fork-join overhead could be too large. + * May be modified depending on platform properties. + */ + static int serialIfSmallerThan = 512 * 1024; + + /** + * Estimated CRC size below which the JNI overhead is too large. + * May be modified depending on platform properties. 
+ */ + static int javaCRCIfSmallerThan = 80; + + static int ARRAY_BYTE_BASE_OFFSET = 0; + + static boolean debug = false; + + static { + timesXtoThe32 = new int[256]; + boolean try_use_clmul = + "true".equals(sun.misc.VM.getSavedProperty("sun.zip.clmulSupported")); + clmulEnabled = init(timesXtoThe32, try_use_clmul); + + if (clmulEnabled) + serialIfSmallerThan *= 2; + + if ("true".equals(sun.misc.VM.getSavedProperty("sun.zip.serialOnly"))) + fjParallelism = 1; + + powersByLog[0] = X_to_the_1; + for (int i = 1; i < powersByLog.length; i++) { + int x = powersByLog[i-1]; + powersByLog[i] = mul(x,x); + } + + powersBy8[0] = X_to_the_0; + for (int i = 1; i < powersBy8.length; i++) { + int x = powersBy8[i-1]; + powersBy8[i] = mul(x,powersByLog[3]); + } + + /* Attempt to do all fork-join splits so that they land on a 16-byte boundary. + Even if arrays are only 8-byte aligned, this improves our chances. */ + try { + Field field = sun.misc.Unsafe.class.getDeclaredField("theUnsafe"); + field.setAccessible(true); + Unsafe u = (Unsafe) field.get(null); + ARRAY_BYTE_BASE_OFFSET = u.ARRAY_BYTE_BASE_OFFSET; + } catch (NoSuchFieldException | SecurityException | + IllegalArgumentException | IllegalAccessException e) { + // It was just an optimization, no need to fail hard. + } + } + + /* Java implementation of enough GF arithmetic to combine two CRCs for fork/join. */ + + /** + * Performs a carryless 32x32 into 63 (NOT 64) bit multiply. + * Note that the low-order term of the polynomial lands in bit + * 62, not 63. + * + * The Intel pclmulqdq instruction works in much the same way, + * except that it is 64 x 64 into 128. + */ + private static long clmul32x32(int a, int b) { + long accum = 0; + long la = (long) a & 0xffffffffL; + long lb = (long) b & 0xffffffffL; + while (la != 0) { + if (0 != (la & 1)) + accum ^= lb; + la = la >>> 1; + lb = lb << 1; + } + return accum; + } + + /** + * Converts a 64-bit polynomial into a 32-bit polynomial, modulo P. + */ + static int reduceLongTable(long x) { + x = (x >>> 8) ^ (timesXtoThe32[(int)(x & 0xFF)] & 0xffffffffL); + x = (x >>> 8) ^ (timesXtoThe32[(int)(x & 0xFF)] & 0xffffffffL); + x = (x >>> 8) ^ (timesXtoThe32[(int)(x & 0xFF)] & 0xffffffffL); + x = (x >>> 8) ^ (timesXtoThe32[(int)(x & 0xFF)] & 0xffffffffL); + return (int) x; + } + + /** + * Returns polynomial a times b modulo P. + * The least (x**0) term of the polynomial is + * aligned with the sign bit of the returned int. + * + * @param a + * @param b + * @return + */ + static int mul(int a, int b) { + long product = clmul32x32(a, b); + return reduceLongTable(product << 1); + } + + /** + * Returns the polynomial for a * x ** 8n, where a + * is some other polynomial. n is typically a byte count, + * and 8n is the number of bits in n bytes. 
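+     * For n smaller than the powersBy8 table this is a single table multiply;
+     * otherwise the low LOG_PB8_LEN bits of n are looked up in powersBy8 and
+     * each remaining set bit contributes one multiply by the matching
+     * powersByLog entry (square-and-multiply on the exponent).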
+ */ + static int timesXtoThe8NTable(int a, int n) { + if (n == 0) + return a; + if (n < powersBy8.length) { + return mul(a,powersBy8[n]); + } + int lo = powersBy8.length - 1; + int accum = mul(a,powersBy8[n & lo]); + n = n >>> LOG_PB8_LEN; + int i = LOG_PB8_LEN + 3; + while (n != 0) { + if (0 != (n & 1)) + accum = mul(accum,powersByLog[i]); + n = n >>> 1; + i++; + } + return accum; + } + + static int combine(int prev, int next, int next_length) { + // x**(8 * length) * prev + next + return next ^ timesXtoThe8NTable(prev, next_length); + } + + private static int updateBytesFJ(int crc, byte[] b, int off, int len) { + if (fjParallelism < 2 || len < serialIfSmallerThan) + return updateBytes(crc, b, off, len); + CRCArrayTask cat = new CRCArrayTask(crc, b, off, len); + cat.invoke(); + return cat.join(); + } + + private static int updateByteBufferFJ(int crc, long addr, + int off, int len) { + if (fjParallelism < 2 || len < serialIfSmallerThan) + return updateByteBuffer(crc, addr, off, len); + CRCBufferTask cat = new CRCBufferTask(crc, addr, off, len); + cat.invoke(); + return cat.join(); + } + + static final class CRCArrayTask extends RecursiveTask { + final byte[] ba; + final int start; + final int length; + final int crc; + + CRCArrayTask(int crc, byte[] ba, int start, int length) { + this.crc = crc; + this.ba = ba; + this.start = start; + this.length = length; + } + + @Override + protected Integer compute() { + if (length < serialIfSmallerThan) { + if (length < javaCRCIfSmallerThan) { + return updateBytesSimple(crc, ba, start, length); + } else { + return updateBytes(crc, ba, start, length); + } + } else { + int half = length/2; + /* Avoid gratuitous misalignment. */ + long addr = ARRAY_BYTE_BASE_OFFSET; // Best we can do given limited info. + int unaligned = (int) (addr + start + half) & 15; + if (half - unaligned >= 32) + half -= unaligned; + CRCArrayTask task2 = new CRCArrayTask(0, ba, start + half, length - half); + task2.fork(); + CRCArrayTask task1 = new CRCArrayTask(crc, ba, start, half); + int result1 = task1.compute(); + return combine(result1, task2.join(), length - half); + } + } + } + + static final class CRCBufferTask extends RecursiveTask { + final long addr; + final int start; + final int length; + final int crc; + + CRCBufferTask(int crc, long addr, int start, int length) { + this.crc = crc; + this.addr = addr; + this.start = start; + this.length = length; + } + + @Override + protected Integer compute() { + if (length < serialIfSmallerThan) { + return updateByteBuffer(crc, addr, start, length); + } else { + int half = length/2; + /* Avoid gratuitous misalignment. */ + int unaligned = (int) (addr + start + half) & 15; + if (half - unaligned >= 32) + half -= unaligned; + CRCBufferTask task2 = new CRCBufferTask(0, addr, start + half, length - half); + task2.fork(); + CRCBufferTask task1 = new CRCBufferTask(crc, addr, start, half); + int result1 = task1.compute(); + return combine(result1, task2.join(), length - half); + } + } + } } --- old/src/share/classes/sun/misc/VM.java 2013-05-16 08:24:42.000000000 -0400 +++ new/src/share/classes/sun/misc/VM.java 2013-05-16 08:24:41.000000000 -0400 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1996, 2012, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2013, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -298,6 +298,9 @@ // used by sun.launcher.LauncherHelper props.remove("sun.java.launcher.diag"); + + // used by java.util.zip.CRC32 + props.remove("sun.zip.clmulSupported"); } // Initialize any miscellenous operating system settings that need to be --- old/src/share/native/java/util/zip/CRC32.c 2013-05-16 08:24:42.000000000 -0400 +++ new/src/share/native/java/util/zip/CRC32.c 2013-05-16 08:24:42.000000000 -0400 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -33,13 +33,134 @@ #include "java_util_zip_CRC32.h" +/* define CAN_COMPILE_CLMUL 0 to disable fastcrc32 completely. */ + +#ifndef CAN_COMPILE_CLMUL +# ifdef __x86_64 +# define CAN_COMPILE_CLMUL 1 +# elif defined(__i386) +# define CAN_COMPILE_CLMUL 1 +# endif +#endif + +#if CAN_COMPILE_CLMUL +#include +#include + +struct crc_by128_K { + /* The fields in this structure are arranged so that if it is + * allocated at a 16-byte alignment they can be picked up two at + * a time with 128-bit loads. + * + * Because of flipped bit order for this CRC polynomials + * the constant for X**N is left-shifted by 1. This is because + * a 64 x 64 polynomial multiply produces a 127-bit result + * but the highest term is always aligned to bit 0 in the container. + * Pre-shifting by one fixes this, at the cost of potentially making + * the 32-bit constant no longer fit in a 32-bit container (thus the + * use of uint64_t, though this is also the size used by the carry- + * less multiply instruction. + * + * In addition, the flipped bit order and highest-term-at-least-bit + * multiply changes the constants used. The 96-bit result will be + * aligned to the high-term end of the target 128-bit container, + * not the low-term end; that is, instead of a 512-bit or 576-bit fold, + * instead it is a 480 (=512-32) or 544 (=512+64-32) bit fold. + * + * This cause additional problems in the 128-to-64-bit reduction; see the + * code for details. By storing a mask in the otherwise unused half of + * a 128-bit constant, bits can be cleared before multiplication without + * storing and reloading. Note that staying on a 128-bit datapath means + * that some data is uselessly stored and some unused data is intersected + * with an irrelevant constant. 
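+ * The kernel loads these fields pairwise as K_M_64 (mask/xtt64),
+ * K_160_96 (xtt160/xtt96) and K_544_480 (xtt544/xtt480).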
+ */ + + uint64_t mask; /* low of K_M_64 */ + uint64_t xtt64; /* high of K_M_64 */ + uint64_t xtt160; /* low of K_160_96 */ + uint64_t xtt96; /* high of K_160_96 */ + uint64_t xtt544; /* low of K_544_480 */ + uint64_t xtt480; /* high of K_544_480 */ +}; + +struct crc_by128_K * K_struct = 0; + +static const uint64_t x64 = (uint64_t) 0xb1e6b092U << 1; +static const uint64_t x96 = (uint64_t) 0x6655004fU << 1; +static const uint64_t x160 = (uint64_t) 0xba8ccbe8U << 1; +static const uint64_t x480 = (uint64_t) 0xe3720acbU << 1; +static const uint64_t x544 = (uint64_t) 0xaa2215eaU << 1; + +static struct crc_by128_K * init_crc_by128_K() { + void * y; + int rc = posix_memalign( & y, 16, sizeof(struct crc_by128_K)); + if (rc) { + return (struct crc_by128_K *) NULL; + } else { + struct crc_by128_K * x = y; + x -> mask = 0xffffffffUL; + x -> xtt64 = x64; + x -> xtt160 = x160; + x -> xtt96 = x96; + x -> xtt544 = x544; + x -> xtt480 = x480; + return x; + } +} + +uint32_t fastcrc32(jint crc, Bytef * buf, jint len); + +/* Flag governing use of "CLMUL" instruction. + For now, implies little-endian. + Computed dynamically, incorporates information about + the current hardware and the compiler used to compile + this file. */ +static int useClmul = 0; +#else +/* Stub out fastcrc32 */ +# define fastcrc32 crc32 +# define useClmul 0 +#endif + + +/* Local copy of CRC32 table is used to fill and drain CLMUL CRC. + Extra members beyond the first 256-entry row are ignored. */ +static const unsigned long FAR * crc_table; + +/* Initialize java-side table (for small CRCs) to avoid extra startup work, + and capture the platform-dependent useClmul flag. +*/ +JNIEXPORT jboolean JNICALL +Java_java_util_zip_CRC32_init(JNIEnv *env, jclass cls, jarray b, jboolean use_clmul) +{ + /* Get the CRC table from zip to initialize JNI. Our private copy + is missing if not compiled for fastcrc32. */ + crc_table = get_crc_table(); + jint *buf = (*env)->GetPrimitiveArrayCritical(env, b, 0); + if (buf) { + /* Don't know for sure how big an unsigned long is, therefore + copy one at a time. */ + int i; + for (i = 0; i < 256; i++) buf[i] = (jint) (crc_table[i]); + (*env)->ReleasePrimitiveArrayCritical(env, b, buf, 0); + } +#if CAN_COMPILE_CLMUL + if (use_clmul) { + K_struct = init_crc_by128_K(); + useClmul = K_struct != 0; + /* Rather than throw OOME, just do without fast CRC. */ + } +#endif + return useClmul; +} + JNIEXPORT jint JNICALL Java_java_util_zip_CRC32_update(JNIEnv *env, jclass cls, jint crc, jint b) { Bytef buf[1]; buf[0] = (Bytef)b; - return crc32(crc, buf, 1); + return crc32(crc, buf, 1); // single byte not done quickly by fastcrc32 } JNIEXPORT jint JNICALL @@ -48,7 +169,8 @@ { Bytef *buf = (*env)->GetPrimitiveArrayCritical(env, b, 0); if (buf) { - crc = crc32(crc, buf + off, len); + crc = (jint) (useClmul ? fastcrc32(crc, buf + off, len) : + crc32(crc, buf + off, len)); (*env)->ReleasePrimitiveArrayCritical(env, b, buf, 0); } return crc; @@ -56,7 +178,8 @@ JNIEXPORT jint ZIP_CRC32(jint crc, const jbyte *buf, jint len) { - return crc32(crc, (Bytef*)buf, len); + return (jint) (useClmul ? fastcrc32(crc, (Bytef*)buf, len) : + crc32(crc, (Bytef*)buf, len)); } JNIEXPORT jint JNICALL @@ -65,7 +188,452 @@ { Bytef *buf = (Bytef *)jlong_to_ptr(address); if (buf) { - crc = crc32(crc, buf + off, len); + crc = (jint) (useClmul ? fastcrc32(crc, buf + off, len) : + crc32(crc, buf + off, len)); } return crc; } + +#if CAN_COMPILE_CLMUL +#ifndef NO_ASM + +/* set up the platform-specific glop surrounding the function body. 
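+   Each branch defines ASM_PREFIX/ASM_SUFFIX with the section, alignment and
+   symbol directives its assembler expects for the hand-coded "kernel" entry;
+   an unrecognized compiler falls back to the intrinsics path via NO_ASM.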
*/ +# ifdef __x86_64 +# ifdef __APPLE__ +# define ASM_PREFIX ".text\n\t.align 8\n\t.globl _kernel\n_kernel:\n\t" +# define ASM_SUFFIX "" +# elif defined(__GNUC__) +# define ASM_PREFIX ".text\n\t.align 16\n\t.globl kernel\n\t.type kernel,@function\nkernel:\n\t" +# define ASM_SUFFIX "" +# elif defined(__SUNPRO_C) +# define ASM_PREFIX ".section .text,\"ax\"\n\t.align 16, 0x90\n\t.globl kernel\n\t.type kernel,@function\nkernel:\n\t" +# define ASM_SUFFIX ".size kernel,.-kernel" +# else + /* Perhaps the mystery compiler can handle the intrinsics. */ +# define NO_ASM 1 +# endif + +# ifndef NO_ASM +__asm__( + ASM_PREFIX + " pushq %rbp\n\t" + " movq %rsp, %rbp\n\t" + " movl %edi, %eax\n\t" + " .byte 0xc5,0xf9,0x6f,0x06 # vmovdqa(%rsi), %xmm0\n\t" + " .byte 0xc4,0xe1,0xf9,0x7e,0xc7 # vmovd %xmm0, %rdi\n\t" + " xorq %rax, %rdi\n\t" + " .byte 0xc4,0xe3,0xf9,0x22,0xd7,0x00 # vpinsrq$0, %rdi, %xmm0, %xmm2\n\t" + " .byte 0xc5,0x79,0x6f,0x01 # vmovdqa(%rcx), %xmm8\n\t" + " .byte 0xc5,0x79,0x6f,0x49,0x10 # vmovdqa16(%rcx), %xmm9\n\t" + " movl $1, %eax\n\t" + " cmpl $4, %edx\n\t" + " jl 1f\n\t" + " .byte 0xc5,0xf9,0x6f,0x6e,0x10 # vmovdqa16(%rsi), %xmm5\n\t" + " .byte 0xc5,0xf9,0x6f,0x66,0x20 # vmovdqa32(%rsi), %xmm4\n\t" + " .byte 0xc5,0xf9,0x6f,0x5e,0x30 # vmovdqa48(%rsi), %xmm3\n\t" + " leal -3(%rdx), %edi\n\t" + " movl $4, %eax\n\t" + " cmpl $5, %edi\n\t" + " jl 2f\n\t" + " .byte 0xc5,0xf9,0x6f,0x71,0x20 # vmovdqa32(%rcx), %xmm6\n\t" + " leaq 112(%rsi), %rcx\n\t" + " movl $4, %eax\n\t" + " .align 4, 0x90\n" + "3: .byte 0xc4,0xe3,0x49,0x44,0xc2,0x00 # vpclmulqdq$0, %xmm2, %xmm6, %xmm0\n\t" + " .byte 0xc4,0xe3,0x49,0x44,0xcb,0x11 # vpclmulqdq$17, %xmm3, %xmm6, %xmm1\n\t" + " .byte 0xc4,0xe3,0x49,0x44,0xdb,0x00 # vpclmulqdq$0, %xmm3, %xmm6, %xmm3\n\t" + " .byte 0xc5,0xe1,0xef,0x19 # vpxor (%rcx), %xmm3, %xmm3\n\t" + " .byte 0xc4,0xe3,0x49,0x44,0xfd,0x00 # vpclmulqdq$0, %xmm5, %xmm6, %xmm7\n\t" + " .byte 0xc5,0xc1,0xef,0x79,0xe0 # vpxor -32(%rcx), %xmm7, %xmm7\n\t" + " .byte 0xc5,0xf1,0xef,0xdb # vpxor %xmm3, %xmm1, %xmm3\n\t" + " .byte 0xc4,0xe3,0x49,0x44,0xd2,0x11 # vpclmulqdq$17, %xmm2, %xmm6, %xmm2\n\t" + " .byte 0xc5,0xf9,0xef,0x41,0xd0 # vpxor -48(%rcx), %xmm0, %xmm0\n\t" + " .byte 0xc4,0xe3,0x49,0x44,0xcd,0x11 # vpclmulqdq$17, %xmm5, %xmm6, %xmm1\n\t" + " .byte 0xc4,0xe3,0x49,0x44,0xec,0x11 # vpclmulqdq$17, %xmm4, %xmm6, %xmm5\n\t" + " .byte 0xc4,0xe3,0x49,0x44,0xe4,0x00 # vpclmulqdq$0, %xmm4, %xmm6, %xmm4\n\t" + " .byte 0xc5,0xd9,0xef,0x61,0xf0 # vpxor -16(%rcx), %xmm4, %xmm4\n\t" + " .byte 0xc5,0xd1,0xef,0xe4 # vpxor %xmm4, %xmm5, %xmm4\n\t" + " .byte 0xc5,0xf1,0xef,0xef # vpxor %xmm7, %xmm1, %xmm5\n\t" + " .byte 0xc5,0xe9,0xef,0xd0 # vpxor %xmm0, %xmm2, %xmm2\n\t" + " addq $64, %rcx\n\t" + " addl $4, %eax\n\t" + " cmpl %edi, %eax\n\t" + " jl 3b\n" + "2: .byte 0xc4,0xe3,0x31,0x44,0xc2,0x11 # vpclmulqdq$17, %xmm2, %xmm9, %xmm0\n\t" + " .byte 0xc4,0xe3,0x31,0x44,0xca,0x00 # vpclmulqdq$0, %xmm2, %xmm9, %xmm1\n\t" + " .byte 0xc5,0xd1,0xef,0xc9 # vpxor %xmm1, %xmm5, %xmm1\n\t" + " .byte 0xc5,0xf1,0xef,0xc8 # vpxor %xmm0, %xmm1, %xmm1\n\t" + " .byte 0xc4,0xe3,0x31,0x44,0xc1,0x11 # vpclmulqdq$17, %xmm1, %xmm9, %xmm0\n\t" + " .byte 0xc4,0xe3,0x31,0x44,0xc9,0x00 # vpclmulqdq$0, %xmm1, %xmm9, %xmm1\n\t" + " .byte 0xc5,0xd9,0xef,0xc9 # vpxor %xmm1, %xmm4, %xmm1\n\t" + " .byte 0xc5,0xf1,0xef,0xc8 # vpxor %xmm0, %xmm1, %xmm1\n\t" + " .byte 0xc4,0xe3,0x31,0x44,0xc1,0x11 # vpclmulqdq$17, %xmm1, %xmm9, %xmm0\n\t" + " .byte 0xc4,0xe3,0x31,0x44,0xc9,0x00 # vpclmulqdq$0, %xmm1, %xmm9, %xmm1\n\t" + " .byte 
0xc5,0xe1,0xef,0xc9 # vpxor %xmm1, %xmm3, %xmm1\n\t" + " .byte 0xc5,0xf1,0xef,0xd0 # vpxor %xmm0, %xmm1, %xmm2\n" + "1: cmpl %edx, %eax\n\t" + " jge 4f\n\t" + " subl %eax, %edx\n\t" + " movslq %eax, %rax\n\t" + " shlq $4, %rax\n\t" + " addq %rax, %rsi\n\t" + " .align 4, 0x90\n" + "5: .byte 0xc4,0xe3,0x31,0x44,0xc2,0x11 # vpclmulqdq$17, %xmm2, %xmm9, %xmm0\n\t" + " .byte 0xc4,0xe3,0x31,0x44,0xca,0x00 # vpclmulqdq$0, %xmm2, %xmm9, %xmm1\n\t" + " .byte 0xc5,0xf1,0xef,0x0e # vpxor (%rsi), %xmm1, %xmm1\n\t" + " .byte 0xc5,0xf1,0xef,0xd0 # vpxor %xmm0, %xmm1, %xmm2\n\t" + " addq $16, %rsi\n\t" + " decl %edx\n\t" + " jne 5b\n" + "4: .byte 0xc4,0xe3,0x39,0x44,0xc2,0x01 # vpclmulqdq$1, %xmm2, %xmm8, %xmm0\n\t" + " .byte 0xc4,0xe1,0xf9,0x7e,0xc0 # vmovd %xmm0, %rax\n\t" + " .byte 0xc4,0xe3,0xf9,0x16,0xc1,0x01 # vpextrq$1, %xmm0, %rcx\n\t" + " shldq $32, %rax, %rcx\n\t" + " .byte 0xc5,0xb9,0xdb,0xc0 # vpand %xmm0, %xmm8, %xmm0\n\t" + " .byte 0xc4,0xe3,0x39,0x44,0xc0,0x01 # vpclmulqdq$1, %xmm0, %xmm8, %xmm0\n\t" + " .byte 0xc4,0xe1,0xf9,0x7e,0xc2 # vmovd %xmm0, %rdx\n\t" + " .byte 0xc4,0xe3,0xf9,0x16,0xd0,0x01 # vpextrq$1, %xmm2, %rax\n\t" + " xorq %rdx, %rax\n\t" + " xorq %rcx, %rax\n\t" + " popq %rbp\n\t" + " ret\n" + ASM_SUFFIX + ); +# endif +# elif defined(__i386) + +/* set up the platform-specific glop surrounding the function body. */ +# ifdef __APPLE__ +# define ASM_PREFIX ".text\n\t.align 16\n\t.globl _kernel\n_kernel:\n\t" +# define ASM_SUFFIX "" +# elif defined(__GNUC__) +# define ASM_PREFIX ".text\n\t.align 16\n\t.globl kernel\n\t.type kernel,@function\nkernel:\n\t" +# define ASM_SUFFIX "" +# elif defined(__SUNPRO_C) +# define ASM_PREFIX ".section .text,\"ax\"\n\t.align 16, 0x90\n\t.globl kernel\n\t.type kernel,@function\nkernel:\n\t" +# define ASM_SUFFIX ".size kernel,.-kernel" +# else + /* Perhaps the mystery compiler can handle the intrinsics. 
*/ +# define NO_ASM 1 +# endif + +# ifndef NO_ASM +__asm__( + ASM_PREFIX + " pushl %ebp\n\t" + " movl %esp, %ebp\n\t" + " pushl %edi\n\t" + " pushl %esi\n\t" + " movl 12(%ebp), %eax\n\t" + " .byte 0xc5,0xf9,0x28,0x00 # vmovapd(%eax), %xmm0\n\t" + " .byte 0xc5,0xf9,0x7e,0xc1 # vmovd %xmm0, %ecx\n\t" + " xorl 8(%ebp), %ecx\n\t" + " .byte 0xc4,0xe3,0x79,0x22,0xc9,0x00 # vpinsrd$0, %ecx, %xmm0, %xmm1\n\t" + " .byte 0xc4,0xe3,0x79,0x16,0xc1,0x01 # vpextrd$1, %xmm0, %ecx\n\t" + " .byte 0xc4,0xe3,0x71,0x22,0xc9,0x01 # vpinsrd$1, %ecx, %xmm1, %xmm1\n\t" + " movl 20(%ebp), %edi\n\t" + " .byte 0xc5,0xf9,0x6f,0x07 # vmovdqa(%edi), %xmm0\n\t" + " .byte 0xc5,0xf9,0x6f,0x57,0x10 # vmovdqa16(%edi), %xmm2\n\t" + " movl $1, %edx\n\t" + " movl 16(%ebp), %ecx\n\t" + " cmpl $4, %ecx\n\t" + " jl 1f\n\t" + " .byte 0xc5,0xf9,0x6f,0x58,0x30 # vmovdqa48(%eax), %xmm3\n\t" + " .byte 0xc5,0xf9,0x6f,0x68,0x10 # vmovdqa16(%eax), %xmm5\n\t" + " .byte 0xc5,0xf9,0x6f,0x60,0x20 # vmovdqa32(%eax), %xmm4\n\t" + " leal -3(%ecx), %esi\n\t" + " movl $4, %edx\n\t" + " cmpl $5, %esi\n\t" + " jl 2f\n\t" + " .byte 0xc5,0xf9,0x6f,0x77,0x20 # vmovdqa32(%edi), %xmm6\n\t" + " leal 112(%eax), %edi\n\t" + " movl $4, %edx\n\t" + " .align 4, 0x90\n" + "3: .byte 0xc4,0xe3,0x49,0x44,0xfb,0x11 # vpclmulqdq$17, %xmm3, %xmm6, %xmm7\n\t" + " .byte 0xc4,0xe3,0x49,0x44,0xdb,0x00 # vpclmulqdq$0, %xmm3, %xmm6, %xmm3\n\t" + " .byte 0xc5,0xe1,0xef,0x1f # vpxor (%edi), %xmm3, %xmm3\n\t" + " .byte 0xc5,0xc1,0xef,0xdb # vpxor %xmm3, %xmm7, %xmm3\n\t" + " .byte 0xc4,0xe3,0x49,0x44,0xfc,0x11 # vpclmulqdq$17, %xmm4, %xmm6, %xmm7\n\t" + " .byte 0xc4,0xe3,0x49,0x44,0xe4,0x00 # vpclmulqdq$0, %xmm4, %xmm6, %xmm4\n\t" + " .byte 0xc5,0xd9,0xef,0x67,0xf0 # vpxor -16(%edi), %xmm4, %xmm4\n\t" + " .byte 0xc5,0xc1,0xef,0xe4 # vpxor %xmm4, %xmm7, %xmm4\n\t" + " .byte 0xc4,0xe3,0x49,0x44,0xfd,0x11 # vpclmulqdq$17, %xmm5, %xmm6, %xmm7\n\t" + " .byte 0xc4,0xe3,0x49,0x44,0xed,0x00 # vpclmulqdq$0, %xmm5, %xmm6, %xmm5\n\t" + " .byte 0xc5,0xd1,0xef,0x6f,0xe0 # vpxor -32(%edi), %xmm5, %xmm5\n\t" + " .byte 0xc5,0xc1,0xef,0xed # vpxor %xmm5, %xmm7, %xmm5\n\t" + " .byte 0xc4,0xe3,0x49,0x44,0xf9,0x11 # vpclmulqdq$17, %xmm1, %xmm6, %xmm7\n\t" + " .byte 0xc4,0xe3,0x49,0x44,0xc9,0x00 # vpclmulqdq$0, %xmm1, %xmm6, %xmm1\n\t" + " .byte 0xc5,0xf1,0xef,0x4f,0xd0 # vpxor -48(%edi), %xmm1, %xmm1\n\t" + " .byte 0xc5,0xc1,0xef,0xc9 # vpxor %xmm1, %xmm7, %xmm1\n\t" + " addl $64, %edi\n\t" + " addl $4, %edx\n\t" + " cmpl %esi, %edx\n\t" + " jl 3b\n" + "2: .byte 0xc4,0xe3,0x69,0x44,0xf1,0x11 # vpclmulqdq$17, %xmm1, %xmm2, %xmm6\n\t" + " .byte 0xc4,0xe3,0x69,0x44,0xc9,0x00 # vpclmulqdq$0, %xmm1, %xmm2, %xmm1\n\t" + " .byte 0xc5,0xd1,0xef,0xc9 # vpxor %xmm1, %xmm5, %xmm1\n\t" + " .byte 0xc5,0xf1,0xef,0xee # vpxor %xmm6, %xmm1, %xmm5\n\t" + " .byte 0xc4,0xe3,0x69,0x44,0xcd,0x11 # vpclmulqdq$17, %xmm5, %xmm2, %xmm1\n\t" + " .byte 0xc4,0xe3,0x69,0x44,0xed,0x00 # vpclmulqdq$0, %xmm5, %xmm2, %xmm5\n\t" + " .byte 0xc5,0xd9,0xef,0xe5 # vpxor %xmm5, %xmm4, %xmm4\n\t" + " .byte 0xc5,0xd9,0xef,0xe1 # vpxor %xmm1, %xmm4, %xmm4\n\t" + " .byte 0xc4,0xe3,0x69,0x44,0xcc,0x11 # vpclmulqdq$17, %xmm4, %xmm2, %xmm1\n\t" + " .byte 0xc4,0xe3,0x69,0x44,0xe4,0x00 # vpclmulqdq$0, %xmm4, %xmm2, %xmm4\n\t" + " .byte 0xc5,0xe1,0xef,0xdc # vpxor %xmm4, %xmm3, %xmm3\n\t" + " .byte 0xc5,0xe1,0xef,0xc9 # vpxor %xmm1, %xmm3, %xmm1\n" + "1: cmpl %ecx, %edx\n\t" + " jge 4f\n\t" + " subl %edx, %ecx\n\t" + " shll $4, %edx\n\t" + " addl %edx, %eax\n\t" + " .align 4, 0x90\n" + "5: .byte 0xc4,0xe3,0x69,0x44,0xd9,0x11 # vpclmulqdq$17, 
%xmm1, %xmm2, %xmm3\n\t" + " .byte 0xc4,0xe3,0x69,0x44,0xc9,0x00 # vpclmulqdq$0, %xmm1, %xmm2, %xmm1\n\t" + " .byte 0xc5,0xf1,0xef,0x08 # vpxor (%eax), %xmm1, %xmm1\n\t" + " .byte 0xc5,0xf1,0xef,0xcb # vpxor %xmm3, %xmm1, %xmm1\n\t" + " addl $16, %eax\n\t" + " decl %ecx\n\t" + " jne 5b\n" + "4: .byte 0xc4,0xe3,0x79,0x44,0xd1,0x01 # vpclmulqdq$1, %xmm1, %xmm0, %xmm2\n\t" + " .byte 0xc5,0xf9,0xdb,0xda # vpand %xmm2, %xmm0, %xmm3\n\t" + " .byte 0xc4,0xe3,0x79,0x44,0xc3,0x01 # vpclmulqdq$1, %xmm3, %xmm0, %xmm0\n\t" + " .byte 0xc5,0xf9,0x7e,0xc0 # vmovd %xmm0, %eax\n\t" + " .byte 0xc4,0xe3,0x79,0x16,0xc9,0x02 # vpextrd$2, %xmm1, %ecx\n\t" + " xorl %eax, %ecx\n\t" + " .byte 0xc4,0xe3,0x79,0x16,0xd0,0x01 # vpextrd$1, %xmm2, %eax\n\t" + " xorl %ecx, %eax\n\t" + " .byte 0xc4,0xe3,0x79,0x16,0xc2,0x01 # vpextrd$1, %xmm0, %edx\n\t" + " .byte 0xc4,0xe3,0x79,0x16,0xc9,0x03 # vpextrd$3, %xmm1, %ecx\n\t" + " xorl %edx, %ecx\n\t" + " .byte 0xc4,0xe3,0x79,0x16,0xd2,0x02 # vpextrd$2, %xmm2, %edx\n\t" + " xorl %ecx, %edx\n\t" + " popl %esi\n\t" + " popl %edi\n\t" + " popl %ebp\n\t" + " ret\n" + ASM_SUFFIX + ); +# endif +# else /* architecture type */ +/* Not intel, not that the C intrinsics will compile anywhere else, + * but it will be a slightly better error message. + */ +# define NO_ASM 1 +# endif +#endif /* NO_ASM */ + +#ifndef NO_ASM +/* Declaration for use below. */ +uint64_t kernel(uint32_t c, unsigned char * buf, int len_128bit, struct crc_by128_K * K); +#else +#pragma message("Compiling 'kernel' from C source with intrinsics") +#include +#include + +union u { + __m128i v; + struct { + uint64_t lo; + uint64_t hi; + }; +}; + +/** + * Assume c is existing crc, + * buf is 16-byte-aligned, + * len is a multiple of 16 greater than zero. + */ +uint64_t kernel(uint32_t c, unsigned char * buf, int len_128bit, + struct crc_by128_K * K) { + + __m128i * b = (__m128i *) buf; + int i = 0; + + /* 128 bit constants and variables. */ + __m128i K_544_480, K_160_96, K_M_64, + x0, x1, x2, x3, + x0a, x1a, x2a, x3a, + x0b, x1b, x2b, x3b; + + /* Use these to move data between xmm registers and "normal" registers. */ + union u ut0, ut1, ut2, ut3; + + K_544_480 = * (__m128i *) & (K -> xtt544); + K_160_96 = * (__m128i *) & (K -> xtt160); + K_M_64 = * (__m128i *) & (K -> mask); + + /* Incorporate existing CRC into first item */ + ut0.v = b[0]; + ut0.lo ^= c; + x0 = ut0.v; + + if (len_128bit >= 4) { + /* Written as a slightly pipelined loop. */ + + x1 = b[1]; + x2 = b[2]; + x3 = b[3]; + + /* Iterate once if len_128bit is between 8 and 11 + * 4 < 8-3 < 11 - 3 + * 8 !< 11 - 3 < 12 - 3. + * + * 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 + * + */ + for (i = 4; i < len_128bit - 3 ; i+= 4) { + /* Each iteration of this loop folds the 512 bits of polynomial + * in x0-x3 with the data in b[i]..b[i+3]. 
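+              * The low and high halves of each x_j are folded forward with the
+              * x**544 and x**480 constants (clmul selectors 0x00 and 0x11) and
+              * xored into the 128-bit block just loaded from b[i+j].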
+ */ + x0a = b[i]; + x1a = b[i+1]; + x2a = b[i+2]; + x3a = b[i+3]; + + x0b = _mm_clmulepi64_si128(K_544_480, x0, 0x00); + x0 = _mm_clmulepi64_si128(K_544_480, x0, 0x11); + x1b = _mm_clmulepi64_si128(K_544_480, x1, 0x00); + x1 = _mm_clmulepi64_si128(K_544_480, x1, 0x11); + + x2b = _mm_clmulepi64_si128(K_544_480, x2, 0x00); + x2 = _mm_clmulepi64_si128(K_544_480, x2, 0x11); + x3b = _mm_clmulepi64_si128(K_544_480, x3, 0x00); + x3 = _mm_clmulepi64_si128(K_544_480, x3, 0x11); + + // x0 ^= x0a ^ x0b; + x0 = _mm_xor_si128(x0, x0a); + x0 = _mm_xor_si128(x0, x0b); + // x1 ^= x1a ^ x1b; + x1 = _mm_xor_si128(x1, x1a); + x1 = _mm_xor_si128(x1, x1b); + // x2 ^= x2a ^ x2b; + x2 = _mm_xor_si128(x2, x2a); + x2 = _mm_xor_si128(x2, x2b); + // x3 ^= x3a ^ x3b; + x3 = _mm_xor_si128(x3, x3a); + x3 = _mm_xor_si128(x3, x3b); + } + /* x0 - x3 contains 4 x 128 bits of accumulated result. + * 0-3 hexads potentially remain in [i,len_128bit) entries. + * Assume trailing bytes beyond that are handled by our caller. + */ + x0a = _mm_clmulepi64_si128(K_160_96, x0, 0x00); + x0b = _mm_clmulepi64_si128(K_160_96, x0, 0x11); + x1 = _mm_xor_si128(x1, x0a); + x1 = _mm_xor_si128(x1, x0b); + x0a = _mm_clmulepi64_si128(K_160_96, x1, 0x00); + x0b = _mm_clmulepi64_si128(K_160_96, x1, 0x11); + x2 = _mm_xor_si128(x2, x0a); + x2 = _mm_xor_si128(x2, x0b); + x0a = _mm_clmulepi64_si128(K_160_96, x2, 0x00); + x0b = _mm_clmulepi64_si128(K_160_96, x2, 0x11); + x3 = _mm_xor_si128(x3, x0a); + x3 = _mm_xor_si128(x3, x0b); + } else { + /* Loaded 128 bits already into x0. + */ + x3 = x0; + i = 1; + } + + /* x3 is now 128-bit result. + * Fold 0-3 128-bit chunks into x3. + */ + for (; i < len_128bit; i++) { + x0 = b[i]; // data to fold + // fold x3 down by 128 to align with data. + x0a = _mm_clmulepi64_si128(K_160_96, x3, 0x00); + x0b = _mm_clmulepi64_si128(K_160_96, x3, 0x11); + x3 = _mm_xor_si128(x0, x0a); + x3 = _mm_xor_si128(x3, x0b); + // x3 is now aligned with data we just loaded. + } + + /* + * No more 128bits remain. + * Fold x3 down into 32 bits. + */ + { + ut0.v = x3; + uint64_t w; + uint64_t y = ut0.hi; // 64 low-order terms of polynomial into y. + + /* polynomial term order: + * high -> low + * bit number order + * 0 -> 127 + * + * input, from which y was just extracted. + * w0 w1 y0 y1 + * w0:w1 * x64 yields 96 bits. + * p0:p1:p2:__ (aligned wrong, store to extract p1 and p2) + * p0:p1:__:__ & ff:00:__:__ (mask to get rid of p1) + * p0:00:__:__ + * p0:00 * x64 (times x64 yields 64 bits) + * r0:r1 store and xor. + */ + + x0 = _mm_clmulepi64_si128(K_M_64, x3, 0x01); + ut1.v = x0; + w = (ut1.lo >> 32) + (ut1.hi << 32); // extract low-poly 64 bits. + x0 = _mm_and_si128(K_M_64, x0); // mask away what we just extracted.. + x0 = _mm_clmulepi64_si128(K_M_64, x0, 0x01); + w ^= y; + ut2.v = x0; + w ^= ut2.lo; + + return w; + } +} +#endif /* NO_ASM */ + +uint32_t fastcrc32(jint crc, Bytef * buf, jint len) { + const unsigned long FAR * timesXtoThe32 = crc_table; + intptr_t ibuf = (intptr_t) buf; + int log_align = 4; + int align = 1 << log_align; + int mask = align - 1; + int islop = (align - ibuf) & mask; + uint32_t c = ~crc; + int i = 0; + + if (len - islop >= align) { + /* Handle bytes preceding 16-byte alignment. */ + for (i = 0; i < islop; i++ ) { + uint32_t x0 = buf[i]; + x0 = timesXtoThe32[(x0 ^ c) & 0xFF]; + c = x0 ^ (c >> 8); + } + buf += i; + len -= i; + + jint len_128bit = len >> log_align; + + if (len_128bit > 0) { + uint64_t w = kernel(c, buf, len_128bit, K_struct); + /* + * 8 8-bit folds to compute 32-bit CRC. 
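+          * The kernel leaves 64 bits of unreduced polynomial in w; each step
+          * folds the low byte of w through the table, so eight steps reduce it
+          * to the 32-bit CRC register value (complemented on return below).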
+ */ + w = timesXtoThe32[w & 0xFF] ^ (w >> 8); + w = timesXtoThe32[w & 0xFF] ^ (w >> 8); + w = timesXtoThe32[w & 0xFF] ^ (w >> 8); + w = timesXtoThe32[w & 0xFF] ^ (w >> 8); + w = timesXtoThe32[w & 0xFF] ^ (w >> 8); + w = timesXtoThe32[w & 0xFF] ^ (w >> 8); + w = timesXtoThe32[w & 0xFF] ^ (w >> 8); + w = timesXtoThe32[w & 0xFF] ^ (w >> 8); + c = (uint32_t) w; + i = len_128bit << log_align; + } else { + i = 0; + } + } + /* Handle short CRC and tail of long CRC */ + for (; i < len; i++) { + uint32_t x0 = buf[i]; + x0 = timesXtoThe32[(x0 ^ c) & 0xFF]; + c = x0 ^ (c >> 8); + } + return ~c; +} +#endif --- old/test/java/util/zip/TimeChecksum.java 2013-05-16 08:24:43.000000000 -0400 +++ new/test/java/util/zip/TimeChecksum.java 2013-05-16 08:24:43.000000000 -0400 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2013, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -120,6 +120,7 @@ System.out.println("---------- Adler32 ----------"); System.out.print("Warmup..."); time(adler32, data, iters, len); + time(adler32, data, 2*iters, 16); // warmup short case, too time(adler32, ByteBuffer.wrap(data), iters); buf = ByteBuffer.allocateDirect(len); buf.put(data, 0, len); @@ -162,6 +163,7 @@ System.out.println("\n---------- CRC32 ----------"); System.out.print("Warmup..."); time(crc32, data, iters, len); + time(crc32, data, 2*iters, 16); // warmup short case, too time(crc32, ByteBuffer.wrap(data), iters); buf = ByteBuffer.allocateDirect(len); buf.put(data, 0, len); --- /dev/null 2013-05-16 08:24:44.000000000 -0400 +++ new/test/java/util/zip/CRCandAdlerTest.java 2013-05-16 08:24:44.000000000 -0400 @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + @test + @bug 7088419 + @summary Ensure that the byte-at-a-time, byte array, and DirectByteBuffer + methods of CRC32 and Adler32 are consistent across a range of inputs. 
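+  Each (offset, length) case is checksummed four ways (bulk array update,
+  byte-at-a-time, byte-at-a-time with a getValue() flush, and via a direct
+  ByteBuffer) and the results are cross-checked.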
+ */ + +import java.nio.ByteBuffer; +import java.util.zip.Adler32; +import java.util.zip.CRC32; +import java.util.zip.Checksum; + +public class CRCandAdlerTest { + + public static void main(String[] args) throws Exception { + + byte[] b = initializedBytes(4096 * 4096); + + { + CRC32 crc1 = new CRC32(); + CRC32 crc2 = new CRC32(); + CRC32 crc3 = new CRC32(); + CRC32 crc4 = new CRC32(); + + crc1.update(b, 0, b.length); + updateSerial(crc2, b, 0, b.length); + updateDirect(crc3, b, 0, b.length); + updateSerialSlow(crc4, b, 0, b.length); + + check(crc1, crc2); + check(crc3, crc4); + check(crc1, crc3); + + crc1.update(17); + crc2.update(17); + crc3.update(17); + crc4.update(17); + + crc1.update(b, 0, b.length); + updateSerial(crc2, b, 0, b.length); + updateDirect(crc3, b, 0, b.length); + updateSerialSlow(crc4, b, 0, b.length); + + check(crc1, crc2); + check(crc3, crc4); + check(crc1, crc3); + + report("finished huge crc", crc1, crc2, crc3, crc4); + + for (int i = 0; i < 256; i++) { + for (int j = 0; j < 256; j += 1) { + crc1.update(b, i, j); + updateSerial(crc2, b, i, j); + updateDirect(crc3, b, i, j); + updateSerialSlow(crc4, b, i, j); + + check(crc1, crc2); + check(crc3, crc4); + check(crc1, crc3); + + } + } + + report("finished small survey crc", crc1, crc2, crc3, crc4); + + for (int i = 0; i < 128; i+= 5) { + for (int j = 1024 * 1024; j < 1024 * 1024 + 128; j += 7) { + crc1.update(b, i, j); + updateSerial(crc2, b, i, j); + updateDirect(crc3, b, i, j); + updateSerialSlow(crc4, b, i, j); + + check(crc1, crc2); + check(crc3, crc4); + check(crc1, crc3); + } + } + + report("finished large survey crc", crc1, crc2, crc3, crc4); + } + + { + Adler32 crc1 = new Adler32(); + Adler32 crc2 = new Adler32(); + Adler32 crc3 = new Adler32(); + Adler32 crc4 = new Adler32(); + // Test long CRC. 
+ + crc1.update(b, 0, b.length); + updateSerial(crc2, b, 0, b.length); + updateDirect(crc3, b, 0, b.length); + updateSerialSlow(crc4, b, 0, b.length); + + check(crc1, crc2); + check(crc3, crc4); + check(crc1, crc3); + + crc1.update(17); + crc2.update(17); + crc3.update(17); + crc4.update(17); + + crc1.update(b, 0, b.length); + updateSerial(crc2, b, 0, b.length); + updateDirect(crc3, b, 0, b.length); + updateSerialSlow(crc4, b, 0, b.length); + + check(crc1, crc2); + check(crc3, crc4); + check(crc1, crc3); + + report("finished huge adler32", crc1, crc2, crc3, crc4); + + for (int i = 0; i < 256; i++) { + for (int j = 0; j < 256; j += 1) { + crc1.update(b, i, j); + updateSerial(crc2, b, i, j); + updateDirect(crc3, b, i, j); + updateSerialSlow(crc4, b, i, j); + + check(crc1, crc2); + check(crc3, crc4); + check(crc1, crc3); + + } + } + + report("finished small survey adler32", crc1, crc2, crc3, crc4); + + for (int i = 0; i < 128; i+= 5) { + for (int j = 1024 * 1024; j < 1024 * 1024 + 128; j += 7) { + crc1.update(b, i, j); + updateSerial(crc2, b, i, j); + updateDirect(crc3, b, i, j); + updateSerialSlow(crc4, b, i, j); + + check(crc1, crc2); + check(crc3, crc4); + check(crc1, crc3); + } + } + + report("finished large survey adler32", crc1, crc2, crc3, crc4); + } + } + + + private static void report(String s, Checksum crc1, Checksum crc2, + Checksum crc3, Checksum crc4) { + System.out.println(s + ", crc1 = " + crc1.getValue() + + ", crc2 = " + crc2.getValue()+ + ", crc3 = " + crc3.getValue()+ + ", crc4 = " + crc4.getValue()); + } + + private static void check(Checksum crc1, Checksum crc2) throws Exception { + if (crc1.getValue() != crc2.getValue()) { + String s = "value 1 = " + crc1.getValue() + ", value 2 = " + crc2.getValue(); + System.err.println(s); + throw new Exception(s); + } + } + + private static byte[] initializedBytes(int M) { + byte[] bytes = new byte[M]; + for (int i = 0; i < bytes.length; i++) { + bytes[i] = (byte) i; + } + return bytes; + } + + private static void updateSerial(Checksum crc, byte[] b, int start, int length) { + for (int i = 0; i < length; i++) + crc.update(b[i+start]); + } + + private static void updateSerialSlow(Checksum crc, byte[] b, int start, int length) { + for (int i = 0; i < length; i++) + crc.update(b[i+start]); + crc.getValue(); + } + + private static void updateDirect(CRC32 crc3, byte[] b, int start, int length) { + ByteBuffer buf = ByteBuffer.allocateDirect(length); + buf.put(b, start, length); + buf.flip(); + crc3.update(buf); + } + + private static void updateDirect(Adler32 crc3, byte[] b, int start, int length) { + ByteBuffer buf = ByteBuffer.allocateDirect(length); + buf.put(b, start, length); + buf.flip(); + crc3.update(buf); + } + +}
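For reference, here is a minimal standalone sketch (not part of the patch) that
checks the split/combine identity the Adler32 changes rely on, using only the
public java.util.zip.Adler32 API. The class name, seed, and split point are
illustrative.

import java.util.Random;
import java.util.zip.Adler32;

public class AdlerCombineDemo {
    private static final int MOD = 65521;

    // A(U||V) = A(U) + A(V) - 1 and B(U||V) = B(U) + B(V) + |V| * (A(U) - 1),
    // all mod 65521, as quoted in the Adler32.java comment above.
    static long combine(long adlerU, long adlerV, long lenV) {
        long aU = adlerU & 0xffff, bU = adlerU >>> 16;
        long aV = adlerV & 0xffff, bV = adlerV >>> 16;
        long a = (aU + aV + MOD - 1) % MOD;
        long b = (bU + bV + (lenV % MOD) * ((aU + MOD - 1) % MOD)) % MOD;
        return (b << 16) | a;
    }

    public static void main(String[] args) {
        byte[] data = new byte[100_000];
        new Random(42).nextBytes(data);
        int split = 37_813;                               // arbitrary split point

        Adler32 whole = new Adler32();                    // checksum of U || V in one pass
        whole.update(data, 0, data.length);

        Adler32 left = new Adler32();                     // checksum of the prefix U
        left.update(data, 0, split);

        Adler32 right = new Adler32();                    // checksum of the suffix V, from the initial value 1
        right.update(data, split, data.length - split);

        long combined = combine(left.getValue(), right.getValue(), data.length - split);
        System.out.println(whole.getValue() == combined); // expected: true
    }
}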