package org.openjdk;

import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.CompilerControl;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import sun.misc.Unsafe;

import java.lang.reflect.Field;
import java.util.concurrent.TimeUnit;

@State(Scope.Thread)
@Warmup(iterations = 5, time = 100, timeUnit = TimeUnit.MILLISECONDS)
@Measurement(iterations = 5, time = 100, timeUnit = TimeUnit.MILLISECONDS)
@Fork(5)
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MICROSECONDS)
public class Misaligned {

    /*
        "Performance Horror Stories" issue #(N+1): "Misaligned accesses"
          aleksey.shipilev@oracle.com, @shipilev

        It is "common wisdom" (tm) that misaligned accesses carry performance penalties.
        Let's try to quantify these costs. The benchmark code is below, and the explanation
        for the benchmark choices is inlined there.

        The results and discussion follow the benchmark code.
    */

    /** Unsafe instance used for all raw memory accesses; assigned reflectively in the static initializer. */
    private static final Unsafe U;

    /** Base address of the aligned walk; init() sets it to a multiple of CACHE_LINE_MAX. */
    private long aligned;
    /** Base address of the misaligned walk; init() sets it one byte off a CACHE_LINE_MAX boundary. */
    private long misaligned;

    // Grab the Unsafe singleton through its private static "theUnsafe" field.
    // Any failure is rethrown as IllegalStateException, which aborts class initialization.
    static {
        Unsafe instance;
        try {
            Field theUnsafe = Unsafe.class.getDeclaredField("theUnsafe");
            theUnsafe.setAccessible(true);
            // Static field, hence the null receiver.
            instance = (Unsafe) theUnsafe.get(null);
        } catch (Exception e) {
            throw new IllegalStateException(e);
        }
        U = instance;
    }

    /**
     * Total chunk size we are walking, in bytes. It is also the number of accesses
     * each benchmark invocation performs. Must be a power of two: init() rejects
     * other values, because offsets are wrapped with {@code size - 1} as a bit mask.
     */
    @Param({"4096", "65536", "1048576", "16777216"})
    private int size;

    /**
     * Should the test cross the cache line or not? When true, init() places
     * $misaligned one byte before a CACHE_LINE_MAX boundary, so each 4-byte access
     * straddles that boundary; when false, one byte after it. The $aligned base
     * address does not depend on this setting.
     */
    @Param({"false", "true"})
    private boolean crossCL;

    private int sizeMask;

    /**
     * Some large power of 2. Any cache line size that is a power of 2 and not
     * greater than CACHE_LINE_MAX is covered by the code, because an address
     * aligned to CACHE_LINE_MAX is also aligned to every smaller power of 2.
     */
    private static final int CACHE_LINE_MAX = 256;

    /**
     * Walking stride for inlined PRNG. We walk the chunk randomly to avoid
     * streaming reads. The stride is a multiple of CACHE_LINE_MAX, so every
     * visited offset keeps the base address's position relative to a cache line
     * boundary.
     */
    private static final int OFFSET_ADD = CACHE_LINE_MAX * 1337;

    /**
     * Allocates the off-heap chunk, derives the {@code aligned} and {@code misaligned}
     * base addresses from it, and verifies the alignment of every address the
     * benchmarks are going to touch.
     *
     * The chunk is not freed anywhere in this class; it stays allocated until the
     * JVM exits.
     *
     * @throws IllegalStateException if {@code size} is not a positive power of two,
     *         or if a computed address does not have the expected alignment
     */
    @Setup
    public void init() {
        // Validate before allocating, so that a bad parameter does not leave native memory behind.
        // Zero and negative values are rejected as well: they are not powers of two either.
        if (size <= 0 || (size & (size - 1)) != 0) {
            throw new IllegalStateException("Size is not a power of two:" + size);
        }

        sizeMask = (size - 1);

        /*
         * Space needed past the raw address: up to CACHE_LINE_MAX to round up to $aligned,
         * CACHE_LINE_MAX + 1 more to reach $misaligned, up to ($size - 1) of walking offset,
         * and 4 bytes for the int access itself. A single spare CACHE_LINE_MAX is not enough
         * for that (the last misaligned access could run past the end of the allocation),
         * so reserve three. The sum is done in long to rule out int overflow.
         */
        long addr = U.allocateMemory((long) size + 3L * CACHE_LINE_MAX);

        /*
         * Whatever the actual cache line size is, $aligned always starts at a cache line
         * boundary. Depending on $crossCL setting, we either step into the cache line,
         * thus making sure $misaligned is always within the cache line, or step out the
         * cache line for $misaligned.
         */
        aligned = (addr & ~(CACHE_LINE_MAX - 1)) + CACHE_LINE_MAX;
        misaligned = aligned + CACHE_LINE_MAX;

        if (crossCL) {
            misaligned -= 1;
        } else {
            misaligned += 1;
        }

        if (aligned % CACHE_LINE_MAX != 0) {
            throw new IllegalStateException("Base address is not aligned");
        }

        // Dry-run the exact walk the benchmarks perform and check each visited address.
        int off = 0;
        for (int c = 0; c < size; c++) {
            off = (off + OFFSET_ADD) & sizeMask;
            if ((aligned + off) % 4 != 0) {
                throw new IllegalStateException("Aligned address is not really aligned");
            }
            if ((misaligned + off) % 4 == 0) {
                throw new IllegalStateException("Misaligned address is really aligned");
            }
        }
    }

    /**
     * Performs {@code size} 4-byte reads, hopping through the chunk that starts
     * at the {@code aligned} base address.
     */
    @Benchmark
    public void read_aligned() {
        // Snapshot the state fields; the loop works on locals only.
        final long start = aligned;
        final int mask = sizeMask;
        final int count = size;

        int cursor = 0;
        int done = 0;
        while (done < count) {
            // Advance by the fixed stride and wrap around inside the chunk.
            cursor = (cursor + OFFSET_ADD) & mask;
            doReadWith(start + cursor);
            done++;
        }
    }

    /**
     * Performs {@code size} 4-byte reads, hopping through the chunk that starts
     * at the {@code misaligned} base address.
     */
    @Benchmark
    public void read_misaligned() {
        // Snapshot the state fields; the loop works on locals only.
        final long start = misaligned;
        final int mask = sizeMask;
        final int count = size;

        int cursor = 0;
        int done = 0;
        while (done < count) {
            // Advance by the fixed stride and wrap around inside the chunk.
            cursor = (cursor + OFFSET_ADD) & mask;
            doReadWith(start + cursor);
            done++;
        }
    }

    /**
     * Performs {@code size} compare-and-swap increments, hopping through the chunk
     * that starts at the {@code aligned} base address.
     */
    @Benchmark
    public void cas_aligned() {
        // Snapshot the state fields; the loop works on locals only.
        final long start = aligned;
        final int mask = sizeMask;
        final int count = size;

        int cursor = 0;
        int done = 0;
        while (done < count) {
            // Advance by the fixed stride and wrap around inside the chunk.
            cursor = (cursor + OFFSET_ADD) & mask;
            doCasWith(start + cursor);
            done++;
        }
    }

    /**
     * Performs {@code size} compare-and-swap increments, hopping through the chunk
     * that starts at the {@code misaligned} base address.
     */
    @Benchmark
    public void cas_misaligned() {
        // Snapshot the state fields; the loop works on locals only.
        final long start = misaligned;
        final int mask = sizeMask;
        final int count = size;

        int cursor = 0;
        int done = 0;
        while (done < count) {
            // Advance by the fixed stride and wrap around inside the chunk.
            cursor = (cursor + OFFSET_ADD) & mask;
            doCasWith(start + cursor);
            done++;
        }
    }

    /**
     * Performs {@code size} 4-byte writes, hopping through the chunk that starts
     * at the {@code aligned} base address.
     */
    @Benchmark
    public void write_aligned() {
        // Snapshot the state fields; the loop works on locals only.
        final long start = aligned;
        final int mask = sizeMask;
        final int count = size;

        int cursor = 0;
        int done = 0;
        while (done < count) {
            // Advance by the fixed stride and wrap around inside the chunk.
            cursor = (cursor + OFFSET_ADD) & mask;
            doWriteWith(start + cursor);
            done++;
        }
    }

    /**
     * Performs {@code size} 4-byte writes, hopping through the chunk that starts
     * at the {@code misaligned} base address.
     */
    @Benchmark
    public void write_misaligned() {
        // Snapshot the state fields; the loop works on locals only.
        final long start = misaligned;
        final int mask = sizeMask;
        final int count = size;

        int cursor = 0;
        int done = 0;
        while (done < count) {
            // Advance by the fixed stride and wrap around inside the chunk.
            cursor = (cursor + OFFSET_ADD) & mask;
            doWriteWith(start + cursor);
            done++;
        }
    }

    /**
     * Reads one int from the given absolute address.
     *
     * Inlining is deliberately disabled: once inlined, the compiler would be
     * free to drop the memory read altogether (dead-code elimination), since the
     * callers ignore the result. Keeping the call opaque also avoids weird loop
     * unrolling and ops coalescing effects.
     *
     * @param addr absolute memory address to read from
     * @return the int value found at that address
     */
    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    private int doReadWith(long addr) {
        final int loaded = U.getInt(addr);
        return loaded;
    }

    /**
     * Writes the constant int 42 to the given absolute address.
     *
     * Inlining is deliberately disabled to avoid weird loop unrolling and ops
     * coalescing effects, and to keep the write benchmarks comparable with the
     * read benchmarks, which go through a non-inlined call as well.
     *
     * @param addr absolute memory address to write to
     */
    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    private void doWriteWith(long addr) {
        final int payload = 42;
        U.putInt(addr, payload);
    }

    /**
     * Atomically increments the int at the given absolute address using a
     * volatile read followed by a compare-and-swap, retrying until the swap succeeds.
     *
     * Inlining is deliberately disabled to avoid weird loop unrolling and ops
     * coalescing effects, and to keep the CAS benchmarks comparable with the read
     * benchmarks. Note that in JDK 8 there is a getAndAddInt() method that gets
     * intrinsified to XADD on x86; we are better off doing the plain CAS to avoid
     * that optimization here.
     *
     * @param addr absolute memory address of the int to increment
     * @return the value observed just before the successful increment
     */
    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    private int doCasWith(long addr) {
        while (true) {
            final int observed = U.getIntVolatile(null, addr);
            if (U.compareAndSwapInt(null, addr, observed, observed + 1)) {
                return observed;
            }
            // Swap failed: the value changed between the read and the CAS, so go around again.
        }
    }

    /*
    The interpretations and results below were validated by looking at generated code (with -prof perfasm),
    and hardware counters (-prof perf). If you are playing with nano-benchmarks like these, you *have* to
    do the same.

    =====================================================================================================

    PART I. READ TESTS

    First, read_* tests. Those are simpler and provide basic insights into the misaligned accesses behaviors.

    x86_64, i7-4790K (Haswell, 2014) @ 4.0 GHz, running 8u40, Linux x86_64:

        Benchmark                   (crossCL)    (size)  Mode  Cnt          Score      Error  Units

        Misaligned.read_aligned         false      4096  avgt   25         8.755 ±     0.086  us/op
        Misaligned.read_aligned         false     65536  avgt   25       139.500 ±     1.312  us/op
        Misaligned.read_aligned         false   1048576  avgt   25      2349.098 ±    35.448  us/op
        Misaligned.read_aligned         false  16777216  avgt   25     91687.487 ±   202.527  us/op

        Misaligned.read_aligned          true      4096  avgt   25         8.728 ±     0.056  us/op
        Misaligned.read_aligned          true     65536  avgt   25       139.393 ±     2.506  us/op
        Misaligned.read_aligned          true   1048576  avgt   25      2328.423 ±     7.654  us/op
        Misaligned.read_aligned          true  16777216  avgt   25     91699.157 ±   186.216  us/op

        Misaligned.read_misaligned      false      4096  avgt   25         8.720 ±     0.056  us/op
        Misaligned.read_misaligned      false     65536  avgt   25       139.228 ±     0.479  us/op
        Misaligned.read_misaligned      false   1048576  avgt   25      2331.398 ±    10.531  us/op
        Misaligned.read_misaligned      false  16777216  avgt   25     91600.554 ±   223.064  us/op

        Misaligned.read_misaligned       true      4096  avgt   25         9.054 ±     0.103  us/op
        Misaligned.read_misaligned       true     65536  avgt   25       143.906 ±     0.980  us/op
        Misaligned.read_misaligned       true   1048576  avgt   25      2824.426 ±    30.592  us/op
        Misaligned.read_misaligned       true  16777216  avgt   25    103685.366 ±   349.994  us/op

    These results may be interpreted as follows:
        * aligned reads perform the same regardless of the $crossCL setting, as they should;
        * misaligned reads that stay within the cache line show no penalty;
        * misaligned reads that cross the cache line are slower, most visibly at larger sizes;
          that is explained by CPUs tracking cache coherency in cache-line chunks, so
          cross-cache-line reads have a *worse* cache footprint, since they access and evict
          *two* cache lines on each read;

    -----------------------------------------------------------------------------------------------------

    x86, Atom Z530 (Silverthorne, 2008) @ 1.60GHz, running 8u40, Linux i586:

        Benchmark                   (crossCL)    (size)  Mode  Cnt         Score        Error  Units

        Misaligned.read_aligned         false      4096  avgt   25        86.965 ±      3.849  us/op
        Misaligned.read_aligned         false     65536  avgt   25      2351.004 ±     58.902  us/op
        Misaligned.read_aligned         false   1048576  avgt   25     37352.877 ±    740.877  us/op
        Misaligned.read_aligned         false  16777216  avgt   25   2340603.212 ±  63923.937  us/op

        Misaligned.read_aligned          true      4096  avgt   25        86.204 ±      2.773  us/op
        Misaligned.read_aligned          true     65536  avgt   25      2345.983 ±     50.380  us/op
        Misaligned.read_aligned          true   1048576  avgt   25     37351.358 ±    712.102  us/op
        Misaligned.read_aligned          true  16777216  avgt   25   2341722.719 ±  64223.248  us/op

        Misaligned.read_misaligned      false      4096  avgt   25        86.296 ±      2.882  us/op
        Misaligned.read_misaligned      false     65536  avgt   25      2346.996 ±     48.296  us/op
        Misaligned.read_misaligned      false   1048576  avgt   25     37395.121 ±    684.096  us/op
        Misaligned.read_misaligned      false  16777216  avgt   25   2296262.297 ±   4641.360  us/op

        Misaligned.read_misaligned       true      4096  avgt   25       124.400 ±      4.495  us/op
        Misaligned.read_misaligned       true     65536  avgt   25      4134.313 ±     44.997  us/op
        Misaligned.read_misaligned       true   1048576  avgt   25     49512.139 ±   1681.804  us/op
        Misaligned.read_misaligned       true  16777216  avgt   25   4532555.518 ± 171744.886  us/op

    Interpretation:
        * The results are similar to Haswell, but...
        * The cost of cross-cache-line misaligned read is now visible even at small sizes
        * The performance hit when we don't fit into cache is much worse.

    -----------------------------------------------------------------------------------------------------

    ARMv7, Exynos 4412 Prime (Cortex-A9, 2012) @ 1.7 GHz, JDK 8 EA, Arch Linux (ARM):

        Benchmark                   (crossCL)    (size)  Mode  Cnt       Score       Error  Units

        Misaligned.read_aligned         false      4096  avgt   25       46.924 ±    0.414  us/op
        Misaligned.read_aligned         false     65536  avgt   25      732.457 ±    3.646  us/op
        Misaligned.read_aligned         false   1048576  avgt   25    17549.447 ±   18.741  us/op
        Misaligned.read_aligned         false  16777216  avgt   25   311406.725 ± 4615.227  us/op

        Misaligned.read_aligned          true      4096  avgt   25       47.263 ±    0.850  us/op
        Misaligned.read_aligned          true     65536  avgt   25      733.148 ±    2.686  us/op
        Misaligned.read_aligned          true   1048576  avgt   25    17555.494 ±   15.131  us/op
        Misaligned.read_aligned          true  16777216  avgt   25   316229.064 ± 6874.816  us/op

        Misaligned.read_misaligned      false      4096  avgt   25       50.357 ±    0.463  us/op
        Misaligned.read_misaligned      false     65536  avgt   25      793.118 ±    4.794  us/op
        Misaligned.read_misaligned      false   1048576  avgt   25    18865.431 ±  284.503  us/op
        Misaligned.read_misaligned      false  16777216  avgt   25   331968.092 ± 5213.652  us/op

        Misaligned.read_misaligned       true      4096  avgt   25       59.035 ±    0.241  us/op
        Misaligned.read_misaligned       true     65536  avgt   25      948.751 ±   16.876  us/op
        Misaligned.read_misaligned       true   1048576  avgt   25    22746.179 ±  388.915  us/op
        Misaligned.read_misaligned       true  16777216  avgt   25   386329.428 ± 3644.090  us/op

    These results may be interpreted as follows:
        * aligned reads perform the same regardless of the $crossCL setting, as they should;
        * misaligned reads within the cache line now take a small hit;
        * again, cross-cache-line reads take a hit similar to the one seen on x86;

    Aside: ARM hardware cannot do misaligned accesses, so the kernel has to assist with them; see:
       https://www.kernel.org/doc/Documentation/arm/mem_alignment

    =====================================================================================================

    PART II. WRITE TESTS

    x86_64, i7-4790K (Haswell, 2014) @ 4.0 GHz, running 8u40, Linux x86_64:

        Benchmark                    (crossCL)    (size)  Mode  Cnt       Score      Error  Units

        Misaligned.write_aligned         false      4096  avgt   25       8.965 ±    0.760  us/op
        Misaligned.write_aligned         false     65536  avgt   25     311.255 ±   12.849  us/op
        Misaligned.write_aligned         false   1048576  avgt   25   11045.313 ±  101.177  us/op
        Misaligned.write_aligned         false  16777216  avgt   25  211496.696 ± 2995.297  us/op

        Misaligned.write_aligned          true      4096  avgt   25       8.872 ±    0.176  us/op
        Misaligned.write_aligned          true     65536  avgt   25     305.690 ±    3.549  us/op
        Misaligned.write_aligned          true   1048576  avgt   25   11180.258 ±  127.326  us/op
        Misaligned.write_aligned          true  16777216  avgt   25  211525.284 ± 3115.075  us/op

        Misaligned.write_misaligned      false      4096  avgt   25       8.767 ±    0.086  us/op
        Misaligned.write_misaligned      false     65536  avgt   25     308.465 ±    3.755  us/op
        Misaligned.write_misaligned      false   1048576  avgt   25   11079.203 ±   47.419  us/op
        Misaligned.write_misaligned      false  16777216  avgt   25  211450.957 ± 3006.821  us/op

        Misaligned.write_misaligned       true      4096  avgt   25       9.292 ±    0.060  us/op
        Misaligned.write_misaligned       true     65536  avgt   25     413.991 ±    2.174  us/op
        Misaligned.write_misaligned       true   1048576  avgt   25   13051.035 ±  170.585  us/op
        Misaligned.write_misaligned       true  16777216  avgt   25  247012.148 ± 2081.628  us/op

    These results are explained exactly like the read tests: misaligned tests perform the same,
    cross-cache-line accesses produce more memory traffic.

    -----------------------------------------------------------------------------------------------------

    x86, Atom Z530 (Silverthorne, 2008) @ 1.60GHz, running 8u40, Linux i586:

        Benchmark                    (crossCL)    (size)  Mode  Cnt        Score       Error  Units

        Misaligned.write_aligned         false      4096  avgt   25       90.347 ±     4.916  us/op
        Misaligned.write_aligned         false     65536  avgt   25     2134.940 ±    74.604  us/op
        Misaligned.write_aligned         false   1048576  avgt   25    87249.811 ±  3066.937  us/op
        Misaligned.write_aligned         false  16777216  avgt   25  1246839.018 ±  8295.413  us/op

        Misaligned.write_aligned          true      4096  avgt   25       87.664 ±     3.186  us/op
        Misaligned.write_aligned          true     65536  avgt   25     2117.784 ±    53.355  us/op
        Misaligned.write_aligned          true   1048576  avgt   25    87156.902 ±  3914.265  us/op
        Misaligned.write_aligned          true  16777216  avgt   25  1257505.324 ± 10123.354  us/op

        Misaligned.write_misaligned      false      4096  avgt   25       87.970 ±     3.888  us/op
        Misaligned.write_misaligned      false     65536  avgt   25     2110.528 ±    38.906  us/op
        Misaligned.write_misaligned      false   1048576  avgt   25    86155.333 ±  1171.532  us/op
        Misaligned.write_misaligned      false  16777216  avgt   25  1258761.385 ±  9412.386  us/op

        Misaligned.write_misaligned       true      4096  avgt   25      125.989 ±     3.079  us/op
        Misaligned.write_misaligned       true     65536  avgt   25     3237.641 ±    81.833  us/op
        Misaligned.write_misaligned       true   1048576  avgt   25   211490.981 ±  7147.815  us/op
        Misaligned.write_misaligned       true  16777216  avgt   25  3115393.442 ± 14133.420  us/op

    Interpretation:
        * The results are similar to Haswell, but...
        * The cost of cross-cache-line misaligned write is now visible even at small sizes;
        * The performance hit when we don't fit into cache is much worse.

    -----------------------------------------------------------------------------------------------------

    ARMv7, Exynos 4412 Prime (Cortex-A9, 2012) @ 1.7 GHz, JDK 8 EA, Arch Linux (ARM):

        Benchmark                    (crossCL)    (size)  Mode  Cnt        Score      Error  Units

        Misaligned.write_aligned         false      4096  avgt   25       47.688 ±    1.186  us/op
        Misaligned.write_aligned         false     65536  avgt   25      740.984 ±    1.665  us/op
        Misaligned.write_aligned         false   1048576  avgt   25    22083.708 ± 3477.479  us/op
        Misaligned.write_aligned         false  16777216  avgt   25  1282259.126 ±  917.745  us/op

        Misaligned.write_aligned          true      4096  avgt   25       47.612 ±    1.144  us/op
        Misaligned.write_aligned          true     65536  avgt   25      742.014 ±    2.182  us/op
        Misaligned.write_aligned          true   1048576  avgt   25    21649.346 ± 2383.012  us/op
        Misaligned.write_aligned          true  16777216  avgt   25  1282411.425 ±  952.226  us/op

        Misaligned.write_misaligned      false      4096  avgt   25       46.957 ±    0.170  us/op
        Misaligned.write_misaligned      false     65536  avgt   25      745.223 ±    2.648  us/op
        Misaligned.write_misaligned      false   1048576  avgt   25    24553.741 ± 3796.744  us/op
        Misaligned.write_misaligned      false  16777216  avgt   25  1282430.109 ±  610.716  us/op

        Misaligned.write_misaligned       true      4096  avgt   25       57.260 ±    0.382  us/op
        Misaligned.write_misaligned       true     65536  avgt   25      909.365 ±    6.851  us/op
        Misaligned.write_misaligned       true   1048576  avgt   25    25885.933 ± 1653.189  us/op
        Misaligned.write_misaligned       true  16777216  avgt   25  2342911.594 ± 1039.060  us/op

    These results are explained exactly like the read tests: misaligned tests perform slightly slower,
    cross-cache-line misaligned accesses produce more memory traffic.

    =====================================================================================================

    PART III. COMPARE-AND-SWAP TESTS

    Finally, on to interlocked accesses, where things go haywire.

    x86_64, i7-4790K (Haswell, 2014) @ 4.0 GHz, running 8u40, Linux x86_64:

        Benchmark                   (crossCL)    (size)  Mode  Cnt         Score       Error  Units
        Misaligned.cas_aligned          false      4096  avgt   25        26.976 ±     0.101  us/op
        Misaligned.cas_aligned          false     65536  avgt   25       438.581 ±     1.975  us/op
        Misaligned.cas_aligned          false   1048576  avgt   25      7611.854 ±    55.586  us/op
        Misaligned.cas_aligned          false  16777216  avgt   25    324815.252 ±  4148.163  us/op

        Misaligned.cas_aligned           true      4096  avgt   25        27.121 ±     0.289  us/op
        Misaligned.cas_aligned           true     65536  avgt   25       458.464 ±    29.846  us/op
        Misaligned.cas_aligned           true   1048576  avgt   25      7681.182 ±   352.401  us/op
        Misaligned.cas_aligned           true  16777216  avgt   25    328297.240 ± 18495.807  us/op

        Misaligned.cas_misaligned       false      4096  avgt   25        26.957 ±     0.097  us/op
        Misaligned.cas_misaligned       false     65536  avgt   25       444.175 ±     3.015  us/op
        Misaligned.cas_misaligned       false   1048576  avgt   25      7634.348 ±    51.966  us/op
        Misaligned.cas_misaligned       false  16777216  avgt   25    321333.811 ±  1993.466  us/op

        Misaligned.cas_misaligned        true      4096  avgt   25      3120.221 ±    29.095  us/op
        Misaligned.cas_misaligned        true     65536  avgt   25     49793.036 ±   455.091  us/op
        Misaligned.cas_misaligned        true   1048576  avgt   25    784908.241 ±  4880.751  us/op
        Misaligned.cas_misaligned        true  16777216  avgt   25  12488689.288 ± 37988.009  us/op

    These results may be interpreted as follows:
        * aligned CASes perform the same regardless of the $crossCL setting, as they should;
        * misaligned CASes within the cache line perform the same as the aligned ones;
        * now, misaligned CASes crossing the cache line experience huge slowdowns, even at
          small sizes; in fact, the slowdowns are on the order of 100x, which means about
          1 us per access!

    -----------------------------------------------------------------------------------------------------

    x86, Atom Z530 (Silverthorne, 2008) @ 1.60GHz, running 8u40, Linux i586:

        Benchmark                   (crossCL)    (size)  Mode  Cnt         Score        Error  Units

        Misaligned.cas_aligned          false      4096  avgt   25       150.718 ±      8.568  us/op
        Misaligned.cas_aligned          false     65536  avgt   25      3478.051 ±    163.594  us/op
        Misaligned.cas_aligned          false   1048576  avgt   25    220773.231 ±   3336.234  us/op
        Misaligned.cas_aligned          false  16777216  avgt   25   3459877.420 ±  18434.285  us/op

        Misaligned.cas_aligned           true      4096  avgt   25       145.297 ±      4.986  us/op
        Misaligned.cas_aligned           true     65536  avgt   25      3405.696 ±     72.922  us/op
        Misaligned.cas_aligned           true   1048576  avgt   25    221868.942 ±   2114.731  us/op
        Misaligned.cas_aligned           true  16777216  avgt   25   3451625.551 ±   3404.674  us/op

        Misaligned.cas_misaligned       false      4096  avgt   25       144.951 ±      4.454  us/op
        Misaligned.cas_misaligned       false     65536  avgt   25      3425.299 ±    102.673  us/op
        Misaligned.cas_misaligned       false   1048576  avgt   25    222924.231 ±   2813.820  us/op
        Misaligned.cas_misaligned       false  16777216  avgt   25   3464066.536 ±  16515.780  us/op

        Misaligned.cas_misaligned        true      4096  avgt   25      2546.885 ±     49.823  us/op
        Misaligned.cas_misaligned        true     65536  avgt   25     41499.723 ±    629.741  us/op
        Misaligned.cas_misaligned        true   1048576  avgt   25    672641.226 ±   3051.502  us/op
        Misaligned.cas_misaligned        true  16777216  avgt   25  10643164.064 ±  10521.791  us/op

    The results are similar to Haswell.

    -----------------------------------------------------------------------------------------------------

    ARMv7, Exynos 4412 Prime (Cortex-A9, 2012) @ 1.7 GHz, JDK 8 EA, Arch Linux (ARM):

        Benchmark                   (crossCL)    (size)  Mode  Cnt        Score      Error  Units

        Misaligned.cas_aligned          false      4096  avgt   25      142.309 ±    0.150  us/op
        Misaligned.cas_aligned          false     65536  avgt   25     3071.501 ±    3.877  us/op
        Misaligned.cas_aligned          false   1048576  avgt   25    80511.210 ± 7474.018  us/op
        Misaligned.cas_aligned          false  16777216  avgt   25  2870433.376 ± 4760.159  us/op

        Misaligned.cas_aligned           true      4096  avgt   25      142.108 ±    0.399  us/op
        Misaligned.cas_aligned           true     65536  avgt   25     3073.019 ±    2.462  us/op
        Misaligned.cas_aligned           true   1048576  avgt   25    89758.528 ± 4102.178  us/op
        Misaligned.cas_aligned           true  16777216  avgt   25  2862898.786 ± 3203.916  us/op

    cas_misaligned tests all failed with SIGBUS caused by unfixable misaligned access:

        #
        # A fatal error has been detected by the Java Runtime Environment:
        #
        #  SIGBUS (0x7) at pc=0xb3f00630, pid=32042, tid=2443179104
        #
        # JRE version: Java(TM) SE Runtime Environment (8.0-b132) (build 1.8.0-b132)
        # Java VM: Java HotSpot(TM) Server VM (25.0-b70 mixed mode linux-arm )
        # Problematic frame:
        # v  ~StubRoutines::atomic_cmpxchg

    =====================================================================================================

    CONCLUSION: Misaligned accesses are still bad. (Off-heap developers,) beware.

    JVMs (at least HotSpot) will align fields and array elements to their natural alignment, so this
    is an issue only if you are doing naked memory accesses on your own. See these JOL examples:
      http://hg.openjdk.java.net/code-tools/jol/file/tip/jol-samples/src/main/java/org/openjdk/jol/samples/JOLSample_02_Alignment.java
      http://hg.openjdk.java.net/code-tools/jol/file/tip/jol-samples/src/main/java/org/openjdk/jol/samples/JOLSample_03_Packing.java

   */

}