package org.openjdk;

import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.CompilerControl;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import sun.misc.Unsafe;

import java.lang.reflect.Field;
import java.util.concurrent.TimeUnit;

@State(Scope.Thread)
@Warmup(iterations = 5, time = 100, timeUnit = TimeUnit.MILLISECONDS)
@Measurement(iterations = 5, time = 100, timeUnit = TimeUnit.MILLISECONDS)
@Fork(5)
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MICROSECONDS)
public class Misaligned {

    /*
      "Performance Horror Stories" issue #(N+1): "Misaligned accesses"
      aleksey.shipilev@oracle.com, @shipilev

      It is a "common wisdom" (tm) that misaligned accesses have the associated performance penalties.
      Let's try to quantify these costs.

      The benchmark code is below, and the explanation for the benchmark choices is inlined there.
      The results and discussion follow the benchmark code.
    */

    /** Unsafe instance, obtained reflectively from the "theUnsafe" field. */
    private static final Unsafe U;

    /** Base address of the aligned walk; a multiple of CACHE_LINE_MAX. */
    private long aligned;

    /** Base address of the misaligned walk; one byte before or after a CACHE_LINE_MAX boundary. */
    private long misaligned;

    static {
        try {
            Field field = Unsafe.class.getDeclaredField("theUnsafe");
            field.setAccessible(true);
            U = (Unsafe) field.get(null);
        } catch (Exception e) {
            throw new IllegalStateException(e);
        }
    }

    /**
     * Total chunk size we are walking.
     */
    @Param({"4096", "65536", "1048576", "16777216"})
    private int size;

    /**
     * Should the test cross the cache line or not?
     */
    @Param({"false", "true"})
    private boolean crossCL;

    /** (size - 1); wraps the walking offset into the chunk. Valid because size is a power of two. */
    private int sizeMask;

    /**
     * Some large power of 2. Any cache line size that's power of 2 and less
     * than CACHE_LINE_MAX would be covered by the code.
     */
    private static final int CACHE_LINE_MAX = 256;

    /**
     * Walking stride for inlined PRNG. We walk the chunk randomly to avoid
     * streaming reads.
     */
    private static final int OFFSET_ADD = CACHE_LINE_MAX * 1337;

    /**
     * Extra bytes allocated on top of $size. $aligned may sit up to CACHE_LINE_MAX past
     * the raw allocation address, $misaligned sits up to (CACHE_LINE_MAX + 1) past $aligned,
     * and the last 4-byte access starts at most (size - 1) past the base. Three cache-line
     * maxima cover all of that, so no access can run past the end of the allocation.
     */
    private static final int ALLOCATION_PADDING = 3 * CACHE_LINE_MAX;

    /**
     * Validates $size, allocates the off-heap chunk, and derives both base addresses.
     *
     * The allocated memory is never released by this class; it lives until the JVM exits.
     *
     * @throws IllegalStateException if $size is not a positive power of two, or if the
     *         computed addresses do not have the expected (mis)alignment
     */
    @Setup
    public void init() {
        // Validate before allocating, so a bad parameter does not leave memory behind.
        if (size <= 0 || (size & (size - 1)) != 0) {
            throw new IllegalStateException("Size is not a power of two:" + size);
        }
        sizeMask = (size - 1);

        /*
         * Whatever the actual cache line size is, $aligned always starts at a cache line
         * boundary. Depending on $crossCL setting, we either step into the cache line,
         * thus making sure $misaligned is always within the cache line, or step out the
         * cache line for $misaligned.
         */
        long addr = U.allocateMemory((long) size + ALLOCATION_PADDING);
        aligned = (addr & ~(CACHE_LINE_MAX - 1)) + CACHE_LINE_MAX;
        misaligned = aligned + CACHE_LINE_MAX;
        if (crossCL) {
            misaligned -= 1;
        } else {
            misaligned += 1;
        }

        if (aligned % CACHE_LINE_MAX != 0) {
            throw new IllegalStateException("Base address is not aligned");
        }

        // Dry-run the exact walk the benchmarks perform and verify every visited address.
        int off = 0;
        for (int c = 0; c < size; c++) {
            off = (off + OFFSET_ADD) & sizeMask;
            if ((aligned + off) % 4 != 0) {
                throw new IllegalStateException("Aligned address is not really aligned");
            }
            if ((misaligned + off) % 4 == 0) {
                throw new IllegalStateException("Misaligned address is really aligned");
            }
        }
    }

    /*
     * All benchmarks below share the same shape: the fields are copied into locals,
     * then $size accesses are issued at base + off, where off advances by OFFSET_ADD
     * and wraps with the size mask. Only the base address and the access kind differ.
     * The loops are deliberately kept duplicated, so each benchmark body stays as is.
     */

    /** Walks the chunk doing plain int reads from the aligned base. */
    @Benchmark
    public void read_aligned() {
        int off = 0;
        int lSize = size;
        int lSizeMask = sizeMask;
        long base = aligned;
        for (int c = 0; c < lSize; c++) {
            off = (off + OFFSET_ADD) & lSizeMask;
            doReadWith(base + off);
        }
    }

    /** Walks the chunk doing plain int reads from the misaligned base. */
    @Benchmark
    public void read_misaligned() {
        int off = 0;
        int lSize = size;
        int lSizeMask = sizeMask;
        long base = misaligned;
        for (int c = 0; c < lSize; c++) {
            off = (off + OFFSET_ADD) & lSizeMask;
            doReadWith(base + off);
        }
    }

    /** Walks the chunk doing CAS increments at the aligned base. */
    @Benchmark
    public void cas_aligned() {
        int off = 0;
        int lSize = size;
        int lSizeMask = sizeMask;
        long base = aligned;
        for (int c = 0; c < lSize; c++) {
            off = (off + OFFSET_ADD) & lSizeMask;
            doCasWith(base + off);
        }
    }

    /** Walks the chunk doing CAS increments at the misaligned base. */
    @Benchmark
    public void cas_misaligned() {
        int off = 0;
        int lSize = size;
        int lSizeMask = sizeMask;
        long base = misaligned;
        for (int c = 0; c < lSize; c++) {
            off = (off + OFFSET_ADD) & lSizeMask;
            doCasWith(base + off);
        }
    }

    /** Walks the chunk doing plain int writes to the aligned base. */
    @Benchmark
    public void write_aligned() {
        int off = 0;
        int lSize = size;
        int lSizeMask = sizeMask;
        long base = aligned;
        for (int c = 0; c < lSize; c++) {
            off = (off + OFFSET_ADD) & lSizeMask;
            doWriteWith(base + off);
        }
    }

    /** Walks the chunk doing plain int writes to the misaligned base. */
    @Benchmark
    public void write_misaligned() {
        int off = 0;
        int lSize = size;
        int lSizeMask = sizeMask;
        long base = misaligned;
        for (int c = 0; c < lSize; c++) {
            off = (off + OFFSET_ADD) & lSizeMask;
            doWriteWith(base + off);
        }
    }

    /**
     * This method should not be inlined, because otherwise compilers
     * are free to eliminate the memory read (dead-code elimination).
     * It also helps to avoid weird loop unrolling and ops coalescing effects.
     *
     * @param addr absolute off-heap address to read an int from
     * @return the int value read
     */
    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    private int doReadWith(long addr) {
        return U.getInt(addr);
    }

    /**
     * This method should not be inlined to avoid weird loop unrolling
     * and ops coalescing effects. Breaking the inlining will also serve better
     * comparability against read benchmarks.
     *
     * @param addr absolute off-heap address to write the int constant 42 to
     */
    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    private void doWriteWith(long addr) {
        U.putInt(addr, 42);
    }

    /**
     * This method should not be inlined to avoid weird loop unrolling
     * and ops coalescing effects. Breaking the inlining will also serve better
     * comparability against read benchmarks. Note that in JDK 8, there is
     * a getAndAddInt() method that gets intrinsified to XADD on x86; but, we are
     * better off doing the plain CAS to avoid that optimization here.
     *
     * @param addr absolute off-heap address of the int to increment
     * @return the value observed before the successful increment
     */
    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    private int doCasWith(long addr) {
        int v;
        do {
            v = U.getIntVolatile(null, addr);
        } while (!U.compareAndSwapInt(null, addr, v, v + 1));
        return v;
    }

    /*
      The interpretations and results below were validated by looking at generated code (with -prof perfasm),
      and hardware counters (-prof perf). If you are playing with nano-benchmarks like these, you *have* to
      do the same.

      =====================================================================================================
      PART I. READ TESTS

      First, read_* tests. Those are simpler and provide basic insights into the misaligned accesses
      behaviors.

      x86_64, i7-4790K (Haswell, 2014) @ 4.0 GHz, running 8u40, Linux x86_64:

      Benchmark                   (crossCL)    (size)  Mode  Cnt         Score         Error  Units
      Misaligned.read_aligned         false      4096  avgt   25         8.755 ±       0.086  us/op
      Misaligned.read_aligned         false     65536  avgt   25       139.500 ±       1.312  us/op
      Misaligned.read_aligned         false   1048576  avgt   25      2349.098 ±      35.448  us/op
      Misaligned.read_aligned         false  16777216  avgt   25     91687.487 ±     202.527  us/op
      Misaligned.read_aligned          true      4096  avgt   25         8.728 ±       0.056  us/op
      Misaligned.read_aligned          true     65536  avgt   25       139.393 ±       2.506  us/op
      Misaligned.read_aligned          true   1048576  avgt   25      2328.423 ±       7.654  us/op
      Misaligned.read_aligned          true  16777216  avgt   25     91699.157 ±     186.216  us/op
      Misaligned.read_misaligned      false      4096  avgt   25         8.720 ±       0.056  us/op
      Misaligned.read_misaligned      false     65536  avgt   25       139.228 ±       0.479  us/op
      Misaligned.read_misaligned      false   1048576  avgt   25      2331.398 ±      10.531  us/op
      Misaligned.read_misaligned      false  16777216  avgt   25     91600.554 ±     223.064  us/op
      Misaligned.read_misaligned       true      4096  avgt   25         9.054 ±       0.103  us/op
      Misaligned.read_misaligned       true     65536  avgt   25       143.906 ±       0.980  us/op
      Misaligned.read_misaligned       true   1048576  avgt   25      2824.426 ±      30.592  us/op
      Misaligned.read_misaligned       true  16777216  avgt   25    103685.366 ±     349.994  us/op

      These results may be interpreted as follows:
        * aligned reads do the same performance regardless of $crossCL setting, as it should be;
        * there is no effect for misaligned reads, within the cache line;
        * there is an effect at larger sizes; but that is explained by CPUs tracking the cache
          coherency in cache line chunks, and cross-cache line reads have *worse* cache footprint,
          since they access and evict *two* cache lines on each read;

      -----------------------------------------------------------------------------------------------------

      x86, Atom Z530 (Silverthorne, 2008) @ 1.60GHz, running 8u40, Linux i586:

      Benchmark                   (crossCL)    (size)  Mode  Cnt         Score         Error  Units
      Misaligned.read_aligned         false      4096  avgt   25        86.965 ±       3.849  us/op
      Misaligned.read_aligned         false     65536  avgt   25      2351.004 ±      58.902  us/op
      Misaligned.read_aligned         false   1048576  avgt   25     37352.877 ±     740.877  us/op
      Misaligned.read_aligned         false  16777216  avgt   25   2340603.212 ±   63923.937  us/op
      Misaligned.read_aligned          true      4096  avgt   25        86.204 ±       2.773  us/op
      Misaligned.read_aligned          true     65536  avgt   25      2345.983 ±      50.380  us/op
      Misaligned.read_aligned          true   1048576  avgt   25     37351.358 ±     712.102  us/op
      Misaligned.read_aligned          true  16777216  avgt   25   2341722.719 ±   64223.248  us/op
      Misaligned.read_misaligned      false      4096  avgt   25        86.296 ±       2.882  us/op
      Misaligned.read_misaligned      false     65536  avgt   25      2346.996 ±      48.296  us/op
      Misaligned.read_misaligned      false   1048576  avgt   25     37395.121 ±     684.096  us/op
      Misaligned.read_misaligned      false  16777216  avgt   25   2296262.297 ±    4641.360  us/op
      Misaligned.read_misaligned       true      4096  avgt   25       124.400 ±       4.495  us/op
      Misaligned.read_misaligned       true     65536  avgt   25      4134.313 ±      44.997  us/op
      Misaligned.read_misaligned       true   1048576  avgt   25     49512.139 ±    1681.804  us/op
      Misaligned.read_misaligned       true  16777216  avgt   25   4532555.518 ±  171744.886  us/op

      Interpretation:
        * The results are similar to Haswell, but...
        * The cost of cross-cache-line misaligned read is now visible even at small sizes
        * The performance hit when we don't fit into cache is much worse.

      -----------------------------------------------------------------------------------------------------

      ARMv7, Exynos 4412 Prime (Cortex-A9, 2012) @ 1.7 GHz, JDK 8 EA, Arch Linux (ARM):

      Benchmark                   (crossCL)    (size)  Mode  Cnt         Score         Error  Units
      Misaligned.read_aligned         false      4096  avgt   25        46.924 ±       0.414  us/op
      Misaligned.read_aligned         false     65536  avgt   25       732.457 ±       3.646  us/op
      Misaligned.read_aligned         false   1048576  avgt   25     17549.447 ±      18.741  us/op
      Misaligned.read_aligned         false  16777216  avgt   25    311406.725 ±    4615.227  us/op
      Misaligned.read_aligned          true      4096  avgt   25        47.263 ±       0.850  us/op
      Misaligned.read_aligned          true     65536  avgt   25       733.148 ±       2.686  us/op
      Misaligned.read_aligned          true   1048576  avgt   25     17555.494 ±      15.131  us/op
      Misaligned.read_aligned          true  16777216  avgt   25    316229.064 ±    6874.816  us/op
      Misaligned.read_misaligned      false      4096  avgt   25        50.357 ±       0.463  us/op
      Misaligned.read_misaligned      false     65536  avgt   25       793.118 ±       4.794  us/op
      Misaligned.read_misaligned      false   1048576  avgt   25     18865.431 ±     284.503  us/op
      Misaligned.read_misaligned      false  16777216  avgt   25    331968.092 ±    5213.652  us/op
      Misaligned.read_misaligned       true      4096  avgt   25        59.035 ±       0.241  us/op
      Misaligned.read_misaligned       true     65536  avgt   25       948.751 ±      16.876  us/op
      Misaligned.read_misaligned       true   1048576  avgt   25     22746.179 ±     388.915  us/op
      Misaligned.read_misaligned       true  16777216  avgt   25    386329.428 ±    3644.090  us/op

      These results may be interpreted as follows:
        * aligned reads do the same performance regardless of $crossCL setting, as it should be;
        * misaligned reads within the cache line are now experiencing a bit of a hit;
        * again, cross-cache line reads experience similar hit as x86 does;

      Aside: ARM hardware cannot do misaligned accesses, so kernel has to assist with it, see:
         https://www.kernel.org/doc/Documentation/arm/mem_alignment

      =====================================================================================================
      PART II. WRITE TESTS

      x86_64, i7-4790K (Haswell, 2014) @ 4.0 GHz, running 8u40, Linux x86_64:

      Benchmark                   (crossCL)    (size)  Mode  Cnt         Score         Error  Units
      Misaligned.write_aligned        false      4096  avgt   25         8.965 ±       0.760  us/op
      Misaligned.write_aligned        false     65536  avgt   25       311.255 ±      12.849  us/op
      Misaligned.write_aligned        false   1048576  avgt   25     11045.313 ±     101.177  us/op
      Misaligned.write_aligned        false  16777216  avgt   25    211496.696 ±    2995.297  us/op
      Misaligned.write_aligned         true      4096  avgt   25         8.872 ±       0.176  us/op
      Misaligned.write_aligned         true     65536  avgt   25       305.690 ±       3.549  us/op
      Misaligned.write_aligned         true   1048576  avgt   25     11180.258 ±     127.326  us/op
      Misaligned.write_aligned         true  16777216  avgt   25    211525.284 ±    3115.075  us/op
      Misaligned.write_misaligned     false      4096  avgt   25         8.767 ±       0.086  us/op
      Misaligned.write_misaligned     false     65536  avgt   25       308.465 ±       3.755  us/op
      Misaligned.write_misaligned     false   1048576  avgt   25     11079.203 ±      47.419  us/op
      Misaligned.write_misaligned     false  16777216  avgt   25    211450.957 ±    3006.821  us/op
      Misaligned.write_misaligned      true      4096  avgt   25         9.292 ±       0.060  us/op
      Misaligned.write_misaligned      true     65536  avgt   25       413.991 ±       2.174  us/op
      Misaligned.write_misaligned      true   1048576  avgt   25     13051.035 ±     170.585  us/op
      Misaligned.write_misaligned      true  16777216  avgt   25    247012.148 ±    2081.628  us/op

      These results are explained exactly like the read tests: misaligned tests perform the same,
      cross-cache-line accesses produce more memory traffic.

      -----------------------------------------------------------------------------------------------------

      x86, Atom Z530 (Silverthorne, 2008) @ 1.60GHz, running 8u40, Linux i586:

      Benchmark                   (crossCL)    (size)  Mode  Cnt         Score         Error  Units
      Misaligned.write_aligned        false      4096  avgt   25        90.347 ±       4.916  us/op
      Misaligned.write_aligned        false     65536  avgt   25      2134.940 ±      74.604  us/op
      Misaligned.write_aligned        false   1048576  avgt   25     87249.811 ±    3066.937  us/op
      Misaligned.write_aligned        false  16777216  avgt   25   1246839.018 ±    8295.413  us/op
      Misaligned.write_aligned         true      4096  avgt   25        87.664 ±       3.186  us/op
      Misaligned.write_aligned         true     65536  avgt   25      2117.784 ±      53.355  us/op
      Misaligned.write_aligned         true   1048576  avgt   25     87156.902 ±    3914.265  us/op
      Misaligned.write_aligned         true  16777216  avgt   25   1257505.324 ±   10123.354  us/op
      Misaligned.write_misaligned     false      4096  avgt   25        87.970 ±       3.888  us/op
      Misaligned.write_misaligned     false     65536  avgt   25      2110.528 ±      38.906  us/op
      Misaligned.write_misaligned     false   1048576  avgt   25     86155.333 ±    1171.532  us/op
      Misaligned.write_misaligned     false  16777216  avgt   25   1258761.385 ±    9412.386  us/op
      Misaligned.write_misaligned      true      4096  avgt   25       125.989 ±       3.079  us/op
      Misaligned.write_misaligned      true     65536  avgt   25      3237.641 ±      81.833  us/op
      Misaligned.write_misaligned      true   1048576  avgt   25    211490.981 ±    7147.815  us/op
      Misaligned.write_misaligned      true  16777216  avgt   25   3115393.442 ±   14133.420  us/op

      Interpretation:
        * The results are similar to Haswell, but...
        * The cost of cross-cache-line misaligned write is now visible even at small sizes;
        * The performance hit when we don't fit into cache is much worse.

      -----------------------------------------------------------------------------------------------------

      ARMv7, Exynos 4412 Prime (Cortex-A9, 2012) @ 1.7 GHz, JDK 8 EA, Arch Linux (ARM):

      Benchmark                   (crossCL)    (size)  Mode  Cnt         Score         Error  Units
      Misaligned.write_aligned        false      4096  avgt   25        47.688 ±       1.186  us/op
      Misaligned.write_aligned        false     65536  avgt   25       740.984 ±       1.665  us/op
      Misaligned.write_aligned        false   1048576  avgt   25     22083.708 ±    3477.479  us/op
      Misaligned.write_aligned        false  16777216  avgt   25   1282259.126 ±     917.745  us/op
      Misaligned.write_aligned         true      4096  avgt   25        47.612 ±       1.144  us/op
      Misaligned.write_aligned         true     65536  avgt   25       742.014 ±       2.182  us/op
      Misaligned.write_aligned         true   1048576  avgt   25     21649.346 ±    2383.012  us/op
      Misaligned.write_aligned         true  16777216  avgt   25   1282411.425 ±     952.226  us/op
      Misaligned.write_misaligned     false      4096  avgt   25        46.957 ±       0.170  us/op
      Misaligned.write_misaligned     false     65536  avgt   25       745.223 ±       2.648  us/op
      Misaligned.write_misaligned     false   1048576  avgt   25     24553.741 ±    3796.744  us/op
      Misaligned.write_misaligned     false  16777216  avgt   25   1282430.109 ±     610.716  us/op
      Misaligned.write_misaligned      true      4096  avgt   25        57.260 ±       0.382  us/op
      Misaligned.write_misaligned      true     65536  avgt   25       909.365 ±       6.851  us/op
      Misaligned.write_misaligned      true   1048576  avgt   25     25885.933 ±    1653.189  us/op
      Misaligned.write_misaligned      true  16777216  avgt   25   2342911.594 ±    1039.060  us/op

      These results are explained exactly like the read tests: misaligned tests perform slightly slower,
      cross-cache-line misaligned accesses produce more memory traffic.

      =====================================================================================================
      PART III. COMPARE-AND-SWAP TESTS

      In the end, onto interlocked accesses, where things get haywire.

      x86_64, i7-4790K (Haswell, 2014) @ 4.0 GHz, running 8u40, Linux x86_64:

      Benchmark                   (crossCL)    (size)  Mode  Cnt         Score         Error  Units
      Misaligned.cas_aligned          false      4096  avgt   25        26.976 ±       0.101  us/op
      Misaligned.cas_aligned          false     65536  avgt   25       438.581 ±       1.975  us/op
      Misaligned.cas_aligned          false   1048576  avgt   25      7611.854 ±      55.586  us/op
      Misaligned.cas_aligned          false  16777216  avgt   25    324815.252 ±    4148.163  us/op
      Misaligned.cas_aligned           true      4096  avgt   25        27.121 ±       0.289  us/op
      Misaligned.cas_aligned           true     65536  avgt   25       458.464 ±      29.846  us/op
      Misaligned.cas_aligned           true   1048576  avgt   25      7681.182 ±     352.401  us/op
      Misaligned.cas_aligned           true  16777216  avgt   25    328297.240 ±   18495.807  us/op
      Misaligned.cas_misaligned       false      4096  avgt   25        26.957 ±       0.097  us/op
      Misaligned.cas_misaligned       false     65536  avgt   25       444.175 ±       3.015  us/op
      Misaligned.cas_misaligned       false   1048576  avgt   25      7634.348 ±      51.966  us/op
      Misaligned.cas_misaligned       false  16777216  avgt   25    321333.811 ±    1993.466  us/op
      Misaligned.cas_misaligned        true      4096  avgt   25      3120.221 ±      29.095  us/op
      Misaligned.cas_misaligned        true     65536  avgt   25     49793.036 ±     455.091  us/op
      Misaligned.cas_misaligned        true   1048576  avgt   25    784908.241 ±    4880.751  us/op
      Misaligned.cas_misaligned        true  16777216  avgt   25  12488689.288 ±   37988.009  us/op

      These results may be interpreted as follows:
        * aligned CASes do the same performance regardless of $crossCL setting, as it should be;
        * misaligned CASes within the cache line perform the same as the aligned;
        * now, misaligned CASes crossing the cache line are experiencing huge slowdowns, even on small
          sizes; in fact, the slowdowns are in order of 100x, which means 1 us per access!

      -----------------------------------------------------------------------------------------------------

      x86, Atom Z530 (Silverthorne, 2008) @ 1.60GHz, running 8u40, Linux i586:

      Benchmark                   (crossCL)    (size)  Mode  Cnt         Score         Error  Units
      Misaligned.cas_aligned          false      4096  avgt   25       150.718 ±       8.568  us/op
      Misaligned.cas_aligned          false     65536  avgt   25      3478.051 ±     163.594  us/op
      Misaligned.cas_aligned          false   1048576  avgt   25    220773.231 ±    3336.234  us/op
      Misaligned.cas_aligned          false  16777216  avgt   25   3459877.420 ±   18434.285  us/op
      Misaligned.cas_aligned           true      4096  avgt   25       145.297 ±       4.986  us/op
      Misaligned.cas_aligned           true     65536  avgt   25      3405.696 ±      72.922  us/op
      Misaligned.cas_aligned           true   1048576  avgt   25    221868.942 ±    2114.731  us/op
      Misaligned.cas_aligned           true  16777216  avgt   25   3451625.551 ±    3404.674  us/op
      Misaligned.cas_misaligned       false      4096  avgt   25       144.951 ±       4.454  us/op
      Misaligned.cas_misaligned       false     65536  avgt   25      3425.299 ±     102.673  us/op
      Misaligned.cas_misaligned       false   1048576  avgt   25    222924.231 ±    2813.820  us/op
      Misaligned.cas_misaligned       false  16777216  avgt   25   3464066.536 ±   16515.780  us/op
      Misaligned.cas_misaligned        true      4096  avgt   25      2546.885 ±      49.823  us/op
      Misaligned.cas_misaligned        true     65536  avgt   25     41499.723 ±     629.741  us/op
      Misaligned.cas_misaligned        true   1048576  avgt   25    672641.226 ±    3051.502  us/op
      Misaligned.cas_misaligned        true  16777216  avgt   25  10643164.064 ±   10521.791  us/op

      The results are similar to Haswell.

      -----------------------------------------------------------------------------------------------------

      ARMv7, Exynos 4412 Prime (Cortex-A9, 2012) @ 1.7 GHz, JDK 8 EA, Arch Linux (ARM):

      Benchmark                   (crossCL)    (size)  Mode  Cnt         Score         Error  Units
      Misaligned.cas_aligned          false      4096  avgt   25       142.309 ±       0.150  us/op
      Misaligned.cas_aligned          false     65536  avgt   25      3071.501 ±       3.877  us/op
      Misaligned.cas_aligned          false   1048576  avgt   25     80511.210 ±    7474.018  us/op
      Misaligned.cas_aligned          false  16777216  avgt   25   2870433.376 ±    4760.159  us/op
      Misaligned.cas_aligned           true      4096  avgt   25       142.108 ±       0.399  us/op
      Misaligned.cas_aligned           true     65536  avgt   25      3073.019 ±       2.462  us/op
      Misaligned.cas_aligned           true   1048576  avgt   25     89758.528 ±    4102.178  us/op
      Misaligned.cas_aligned           true  16777216  avgt   25   2862898.786 ±    3203.916  us/op

      cas_misaligned tests all failed with SIGBUS caused by unfixable misaligned access:

        #
        # A fatal error has been detected by the Java Runtime Environment:
        #
        #  SIGBUS (0x7) at pc=0xb3f00630, pid=32042, tid=2443179104
        #
        # JRE version: Java(TM) SE Runtime Environment (8.0-b132) (build 1.8.0-b132)
        # Java VM: Java HotSpot(TM) Server VM (25.0-b70 mixed mode linux-arm )
        # Problematic frame:
        # v  ~StubRoutines::atomic_cmpxchg

      =====================================================================================================

      CONCLUSION:

      Misaligned accesses are still bad. (Off-heap developers,) beware.

      JVMs (at least HotSpot) will align fields and array elements to their natural alignment, so this
      is an issue only if you are doing the naked memory accesses on your own. See these JOL examples:
        http://hg.openjdk.java.net/code-tools/jol/file/tip/jol-samples/src/main/java/org/openjdk/jol/samples/JOLSample_02_Alignment.java
        http://hg.openjdk.java.net/code-tools/jol/file/tip/jol-samples/src/main/java/org/openjdk/jol/samples/JOLSample_03_Packing.java
    */
}