package org.openjdk; import org.openjdk.jmh.annotations.*; import org.openjdk.jmh.infra.Blackhole; import java.lang.invoke.MethodHandles; import java.lang.invoke.VarHandle; import java.util.concurrent.TimeUnit; @BenchmarkMode(Mode.AverageTime) @OutputTimeUnit(TimeUnit.NANOSECONDS) @Fork(3) @State(Scope.Benchmark) public class BooleanGetAndSet { private static final VarHandle VALUE; static { try { MethodHandles.Lookup l = MethodHandles.lookup(); VALUE = l.findVarHandle(BooleanGetAndSet.class, "value", int.class); } catch (ReflectiveOperationException e) { throw new Error(e); } } private volatile int value; @Benchmark public void casLoop(Blackhole bh) { bh.consume(doCASLoop(true)); bh.consume(doCASLoop(false)); } @Benchmark public void getAndSet(Blackhole bh) { bh.consume(doGetAndSet(true)); bh.consume(doGetAndSet(false)); } @Benchmark public void singleCas(Blackhole bh) { bh.consume(doSingleCAS(true)); bh.consume(doSingleCAS(false)); } @CompilerControl(CompilerControl.Mode.DONT_INLINE) private boolean doCASLoop(boolean newValue) { int prevInt; do { prevInt = value; } while (!VALUE.compareAndSet(this, prevInt, (newValue ? 1 : 0))); return (prevInt != 0); } @CompilerControl(CompilerControl.Mode.DONT_INLINE) private boolean doGetAndSet(boolean newValue) { int oldInt = (int)VALUE.getAndSet(this, (newValue ? 1 : 0)); return (oldInt != 0); } @CompilerControl(CompilerControl.Mode.DONT_INLINE) private boolean doSingleCAS(boolean newValue) { int newInt = newValue ? 1 : 0; return newValue ^ VALUE.compareAndSet(this, newInt ^ 1, newInt); } /* 1x4x2 i7-4790K, JDK 9b128, Linux x86_64: Single thread: Benchmark Mode Cnt Score Error Units BooleanGetAndSet.casLoop avgt 15 17.630 ± 0.118 ns/op BooleanGetAndSet.casLoop:·CPI avgt 3 0.702 ± 0.050 #/op BooleanGetAndSet.casLoop:·L1-dcache-load-misses avgt 3 0.107 ± 0.160 #/op BooleanGetAndSet.casLoop:·L1-dcache-loads avgt 3 29.911 ± 0.508 #/op BooleanGetAndSet.casLoop:·L1-dcache-stores avgt 3 17.100 ± 2.447 #/op BooleanGetAndSet.casLoop:·branch-misses avgt 3 0.044 ± 0.104 #/op BooleanGetAndSet.casLoop:·branches avgt 3 14.256 ± 2.050 #/op BooleanGetAndSet.casLoop:·cycles avgt 3 70.528 ± 8.979 #/op BooleanGetAndSet.casLoop:·instructions avgt 3 100.489 ± 6.688 #/op BooleanGetAndSet.getAndSet avgt 15 16.154 ± 0.044 ns/op <--- faster BooleanGetAndSet.getAndSet:·CPI avgt 3 0.770 ± 0.036 #/op BooleanGetAndSet.getAndSet:·L1-dcache-load-misses avgt 3 0.096 ± 0.272 #/op BooleanGetAndSet.getAndSet:·L1-dcache-loads avgt 3 27.876 ± 3.990 #/op BooleanGetAndSet.getAndSet:·L1-dcache-stores avgt 2 14.825 #/op BooleanGetAndSet.getAndSet:·branch-misses avgt 3 0.042 ± 0.062 #/op BooleanGetAndSet.getAndSet:·branches avgt 3 12.237 ± 0.130 #/op BooleanGetAndSet.getAndSet:·cycles avgt 3 64.545 ± 4.208 #/op BooleanGetAndSet.getAndSet:·instructions avgt 3 83.783 ± 1.634 #/op <--- because less instructions BooleanGetAndSet.singleCas avgt 15 17.065 ± 0.243 ns/op BooleanGetAndSet.singleCas:·CPI avgt 3 0.742 ± 0.386 #/op BooleanGetAndSet.singleCas:·L1-dcache-load-misses avgt 3 0.115 ± 0.377 #/op BooleanGetAndSet.singleCas:·L1-dcache-loads avgt 3 27.932 ± 10.922 #/op BooleanGetAndSet.singleCas:·L1-dcache-stores avgt 2 15.127 #/op BooleanGetAndSet.singleCas:·branch-misses avgt 3 0.045 ± 0.029 #/op BooleanGetAndSet.singleCas:·branches avgt 3 12.233 ± 4.223 #/op BooleanGetAndSet.singleCas:·cycles avgt 3 68.431 ± 14.530 #/op BooleanGetAndSet.singleCas:·instructions avgt 3 92.306 ± 31.171 #/op 8 threads: Benchmark Mode Cnt Score Error Units BooleanGetAndSet.casLoop avgt 15 871.000 ± 15.490 ns/op BooleanGetAndSet.casLoop:·CPI avgt 3 19.163 ± 20.658 #/op <--- completely dominated by CPI BooleanGetAndSet.casLoop:·L1-dcache-load-misses avgt 2 4.403 #/op BooleanGetAndSet.casLoop:·L1-dcache-loads avgt 3 47.645 ± 32.827 #/op BooleanGetAndSet.casLoop:·L1-dcache-stores avgt 2 21.565 #/op BooleanGetAndSet.casLoop:·branch-misses avgt 3 2.492 ± 1.866 #/op BooleanGetAndSet.casLoop:·branches avgt 3 24.407 ± 29.638 #/op BooleanGetAndSet.casLoop:·cycles avgt 3 3133.645 ± 3810.733 #/op BooleanGetAndSet.casLoop:·instructions avgt 3 163.498 ± 68.249 #/op BooleanGetAndSet.getAndSet avgt 15 307.957 ± 6.856 ns/op BooleanGetAndSet.getAndSet:·CPI avgt 3 11.386 ± 16.652 #/op <--- completely dominated by CPI BooleanGetAndSet.getAndSet:·L1-dcache-load-misses avgt 3 1.320 ± 1.308 #/op BooleanGetAndSet.getAndSet:·L1-dcache-loads avgt 3 28.944 ± 9.595 #/op BooleanGetAndSet.getAndSet:·L1-dcache-stores avgt 2 16.125 #/op BooleanGetAndSet.getAndSet:·branch-misses avgt 3 0.146 ± 0.495 #/op BooleanGetAndSet.getAndSet:·branches avgt 3 14.130 ± 2.526 #/op BooleanGetAndSet.getAndSet:·cycles avgt 3 1077.438 ± 1565.055 #/op BooleanGetAndSet.getAndSet:·instructions avgt 3 94.631 ± 9.399 #/op BooleanGetAndSet.singleCas avgt 15 314.693 ± 9.539 ns/op BooleanGetAndSet.singleCas:·CPI avgt 3 10.995 ± 9.417 #/op <--- completely dominated by CPI BooleanGetAndSet.singleCas:·L1-dcache-load-misses avgt 3 0.913 ± 2.806 #/op BooleanGetAndSet.singleCas:·L1-dcache-loads avgt 3 29.329 ± 13.102 #/op BooleanGetAndSet.singleCas:·L1-dcache-stores avgt 3 16.511 ± 9.062 #/op BooleanGetAndSet.singleCas:·L1-icache-load-misses avgt 3 0.127 ± 0.247 #/op BooleanGetAndSet.singleCas:·branch-misses avgt 3 0.137 ± 0.252 #/op BooleanGetAndSet.singleCas:·branches avgt 3 14.111 ± 1.456 #/op BooleanGetAndSet.singleCas:·cycles avgt 3 1113.898 ± 1135.182 #/op BooleanGetAndSet.singleCas:·instructions avgt 3 101.279 ± 23.011 #/op Single-threaded results are explained by better codegen in getAndSet case: doCASLoop: 7.78% 0x00007f01238277d3: test %edx,%edx 0x00007f01238277d5: setne %r11b 0x00007f01238277d9: movzbl %r11b,%r11d 0x00007f01238277dd: mov %r9d,%eax 3.08% 0.01% 0x00007f01238277e0: lock cmpxchg %r11d,0xc(%rsi) 50.10% 89.19% 0x00007f01238277e6: sete %r10b 0x00007f01238277ea: movzbl %r10b,%r10d 3.31% 5.51% 0x00007f01238277ee: test %r10d,%r10d ╭ 0x00007f01238277f1: je 0x00007f0123827808 0.37% 0.06% │ 0x00007f01238277f3: test %r9d,%r9d │ 0x00007f01238277f6: setne %al 0.01% 0.03% │ 0x00007f01238277f9: movzbl %al,%eax doGetAndSet: 1.70% 0x00007f2a5b1e5a4c: test %edx,%edx ; convert boolean -> int 0x00007f2a5b1e5a4e: setne %r10b 1.59% 0x00007f2a5b1e5a52: movzbl %r10b,%r10d 0x00007f2a5b1e5a56: xchg %r10d,0xc(%rsi) ; exchange! 60.54% 92.68% 0x00007f2a5b1e5a5a: test %r10d,%r10d ; convert int -> boolean 0.01% 0x00007f2a5b1e5a5d: setne %al 3.66% 2.65% 0x00007f2a5b1e5a60: movzbl %al,%eax doSingleCAS: 1.46% 0.02% 0x00007feb0a776e4c: test %edx,%edx ; convert boolean -> int 0.02% 0x00007feb0a776e4e: setne %r11b 1.41% 0x00007feb0a776e52: movzbl %r11b,%r11d 0.03% 0x00007feb0a776e56: mov %r11d,%eax 1.76% 0x00007feb0a776e59: xor $0x1,%eax ; xor 0.02% 0x00007feb0a776e5c: lock cmpxchg %r11d,0xc(%rsi) ; compare-and-set! 59.88% 96.85% 0x00007feb0a776e62: sete %r10b ; convert to boolean 0x00007feb0a776e66: movzbl %r10b,%r10d 3.45% 0.02% 0x00007feb0a776e6a: xor %r10d,%edx ; "boolean" xor 0x00007feb0a776e6d: mov %edx,%eax ; filter out the lowest bit (oops) 0x00007feb0a776e6f: and $0x1,%eax */ }