== Patches The patches used here are as follows: === Hotspot: diff -r e68e062ced07 src/share/vm/opto/library_call.cpp --- a/src/share/vm/opto/library_call.cpp Fri Mar 13 14:33:51 2015 +0100 +++ b/src/share/vm/opto/library_call.cpp Wed Mar 18 20:05:19 2015 +0300 @@ -1576,6 +1576,22 @@ if (is_store) { (void) store_to_memory(control(), adr, ch, T_CHAR, TypeAryPtr::BYTES, MemNode::unordered); } else { + RegionNode* bailout = new RegionNode(1); + record_for_igvn(bailout); + + // Version 1 + Node* len = _gvn.transform(new RShiftINode(load_array_length(value), intcon(1))); + generate_limit_guard(index, intcon(0), len, bailout); + + // Version 2 + //generate_limit_guard(offset, intcon(0), load_array_length(value), bailout); + + if (bailout->req() > 1) { + PreserveJVMState pjvms(this); + set_control(_gvn.transform(bailout)); + uncommon_trap(Deoptimization::Reason_intrinsic, + Deoptimization::Action_maybe_recompile); + } ch = make_load(control(), adr, TypeInt::CHAR, T_CHAR, MemNode::unordered); set_result(ch); } === JDK: diff -r fcd0cb8ca91d src/java.base/share/classes/java/lang/String.java --- a/src/java.base/share/classes/java/lang/String.java Mon Mar 16 19:18:51 2015 +0000 +++ b/src/java.base/share/classes/java/lang/String.java Wed Mar 18 20:05:53 2015 +0300 @@ -690,9 +690,6 @@ } return (char)(value[index] & 0xff); } - if (index < 0 || index >= StringUTF16.length(value)) { - throw new StringIndexOutOfBoundsException(index); - } return StringUTF16.getChar(value, index); } diff -r fcd0cb8ca91d src/java.base/share/classes/java/lang/StringUTF16.java --- a/src/java.base/share/classes/java/lang/StringUTF16.java Mon Mar 16 19:18:51 2015 +0000 +++ b/src/java.base/share/classes/java/lang/StringUTF16.java Wed Mar 18 20:05:53 2015 +0300 @@ -2,7 +2,6 @@ import java.util.Arrays; import java.util.Locale; -import java.util.Objects; import static java.lang.String.UTF16; final class StringUTF16 { @@ -586,6 +585,9 @@ public static char getChar(byte[] val, int index) { index <<= 1; + if 
(index < 0 || index >= val.length) { + throw new StringIndexOutOfBoundsException(index>>1); + } return (char)(((val[index++] & 0xff) << HI_BYTE_SHIFT) | ((val[index] & 0xff) << LO_BYTE_SHIFT)); } == Tests Tests are done with -XX:LoopUnrollLimit=1 to contrast charAt() costs. === String Density baseline: Benchmark (size) Mode Cnt Score Error Units CharAtBench.test_cmp2 4096 avgt 5 10434.204 ± 14.314 ns/op CharAtStreamBench.test_cmp2 4096 avgt 5 1652.096 ± 2.865 ns/op CharAtBench: 0.35% 0.35% 0x00007fb5d4ade460: mov %eax,-0x14000(%rsp) 9.35% 7.97% 0x00007fb5d4ade467: push %rbp 0x00007fb5d4ade468: sub $0x20,%rsp 0.68% 0.63% 0x00007fb5d4ade46c: movsbl 0x14(%rsi),%r10d ; get field $coder 8.05% 6.00% 0x00007fb5d4ade471: mov 0xc(%rsi),%r11d ; get field $value 0x00007fb5d4ade475: test %r10d,%r10d ; test $coder == 0, jump out 0x00007fb5d4ade478: je 0x00007fb5d4ade4a4 0.18% 0.35% 0x00007fb5d4ade47a: test %edx,%edx ; range check, (idx < 0) 0x00007fb5d4ade47c: jl 0x00007fb5d4ade4c1 0.49% 0.51% 0x00007fb5d4ade47e: mov 0xc(%r12,%r11,8),%ebp ; arraylength 11.55% 11.44% 0x00007fb5d4ade483: sar %ebp ; /2 5.54% 6.58% 0x00007fb5d4ade485: cmp %ebp,%edx ; range check, (idx >= arraylength/2) 0x00007fb5d4ade487: jge 0x00007fb5d4ade4d9 8.79% 9.50% 0x00007fb5d4ade489: mov %r11,%r10 ; unpack $value 0.12% 0.05% 0x00007fb5d4ade48c: shl $0x3,%r10 0.54% 0.74% 0x00007fb5d4ade490: shl %edx ; idx *= 2 0x00007fb5d4ade492: movzwl 0x10(%r10,%rdx,1),%eax ; get char 9.05% 10.44% 0x00007fb5d4ade498: add $0x20,%rsp 0.14% 0.18% 0x00007fb5d4ade49c: pop %rbp 0.54% 0.54% 0x00007fb5d4ade49d: test %eax,0xe569b5d(%rip) 4.63% 5.90% 0x00007fb5d4ade4a3: retq CharAtStreamBench: 30.10% 32.13% 0x00007f86e84ae6f0: mov %r9d,%eax 0.41% 0.33% 0x00007f86e84ae6f3: shl %eax 30.23% 27.21% 0x00007f86e84ae6f5: movzwl 0x10(%rdi,%rax,1),%eax 0.52% 0.23% 0x00007f86e84ae6fa: add %eax,%edx 32.17% 34.06% 0x00007f86e84ae6fc: inc %r9d 0.31% 0.21% 0x00007f86e84ae6ff: cmp %r8d,%r9d 0x00007f86e84ae702: jl 0x00007f86e84ae6f0 The original 
suggestion was to multiply $idx right away, and then use the non-sar-ed arraylength for the range check. === Version 1: Benchmark (size) Mode Cnt Score Error Units CharAtBench.test_cmp2 4096 avgt 5 10060.331 ± 7.993 ns/op CharAtStreamBench.test_cmp2 4096 avgt 5 1807.226 ± 1.656 ns/op CharAtBench: 0.14% 0.07% 0x00007f1270ade260: mov %eax,-0x14000(%rsp) 9.20% 5.52% 0x00007f1270ade267: push %rbp 0.02% 0x00007f1270ade268: sub $0x20,%rsp 0.12% 0.17% 0x00007f1270ade26c: movsbl 0x14(%rsi),%r10d ; get field $coder 9.30% 5.23% 0x00007f1270ade271: mov 0xc(%rsi),%r11d ; get field $value 0.02% 0x00007f1270ade275: test %r10d,%r10d ; test $coder == 0, jump out 0x00007f1270ade278: je 0x00007f1270ade2a2 0.20% 0.20% 0x00007f1270ade27a: mov 0xc(%r12,%r11,8),%r10d ; arraylength 8.69% 12.18% 0x00007f1270ade27f: sar %r10d ; /2 9.12% 10.48% 0x00007f1270ade282: cmp %edx,%r10d ; range check (0 <= idx < arraylength /= 2) 0x00007f1270ade285: jb 0x00007f1270ade2bf 8.30% 8.11% 0x00007f1270ade287: mov %r11,%r10 ; unpack $value 0.09% 0.03% 0x00007f1270ade28a: shl $0x3,%r10 0x00007f1270ade28e: shl %edx ; idx *= 2 0.99% 0.92% 0x00007f1270ade290: movzwl 0x10(%r10,%rdx,1),%eax ; get char 9.24% 12.27% 0x00007f1270ade296: add $0x20,%rsp 0x00007f1270ade29a: pop %rbp 0.02% 0x00007f1270ade29b: test %eax,0x10617d5f(%rip) 4.66% 5.20% 0x00007f1270ade2a1: retq While the CharAtBench performance has improved, probably because the two range checks have been merged into one, we are still doing the extra "sar" on arraylength. CharAtStreamBench: 13.85% 15.51% 0x00007fbc70ae2732: cmp %r11d,%esi ; <--- WTF are these? 0x00007fbc70ae2735: jb 0x00007fbc70ae277b ; ... 
14.50% 16.43% 0x00007fbc70ae2737: mov %r11d,%edi 13.49% 15.85% 0x00007fbc70ae273a: shl %edi 14.14% 13.64% 0x00007fbc70ae273c: movzwl 0x10(%r9,%rdi,1),%ecx 14.03% 10.37% 0x00007fbc70ae2742: add %ecx,%edx 13.09% 10.73% 0x00007fbc70ae2744: inc %r11d 13.04% 14.00% 0x00007fbc70ae2747: cmp %ebx,%r11d 0x00007fbc70ae274a: jl 0x00007fbc70ae2732 ...has some stray range checks that look suspiciously like the newly introduced "unsigned" range check, but are not hoisted out of the loop. This significantly penalizes the test. === Version 2: Benchmark (size) Mode Cnt Score Error Units CharAtBench.test_cmp2 4096 avgt 5 9660.682 ± 26.819 ns/op CharAtStreamBench.test_cmp2 4096 avgt 5 1807.266 ± 5.248 ns/op CharAtBench: 0.24% 0.48% 0x00007fa9b48c1f60: mov %eax,-0x14000(%rsp) 9.95% 7.83% 0x00007fa9b48c1f67: push %rbp 0.03% 0.02% 0x00007fa9b48c1f68: sub $0x20,%rsp 0.48% 0.48% 0x00007fa9b48c1f6c: movsbl 0x14(%rsi),%r10d ; get field $coder 10.11% 9.73% 0x00007fa9b48c1f71: mov 0xc(%rsi),%r11d ; get field $value 0.02% 0.02% 0x00007fa9b48c1f75: test %r10d,%r10d ; test $coder == 0, jump out 0x00007fa9b48c1f78: je 0x00007fa9b48c1fa3 0.48% 0.48% 0x00007fa9b48c1f7a: mov 0xc(%r12,%r11,8),%r10d ; array length 9.47% 10.32% 0x00007fa9b48c1f7f: mov %edx,%r8d ; <-- WTF is this 4.43% 4.58% 0x00007fa9b48c1f82: shl %r8d ; idx *= 2 0x00007fa9b48c1f85: cmp %r8d,%r10d ; range check (0 <= idx < arraylength) 0x00007fa9b48c1f88: jb 0x00007fa9b48c1fc0 5.24% 5.41% 0x00007fa9b48c1f8a: mov %r11,%r10 ; unpack $value reference 0.09% 0.05% 0x00007fa9b48c1f8d: shl $0x3,%r10 4.62% 4.23% 0x00007fa9b48c1f91: movzwl 0x10(%r10,%r8,1),%eax ; get char 3.05% 2.66% 0x00007fa9b48c1f97: add $0x20,%rsp 2.52% 2.83% 0x00007fa9b48c1f9b: pop %rbp 0.22% 0.10% 0x00007fa9b48c1f9c: test %eax,0xe46305e(%rip) 14.30% 14.83% 0x00007fa9b48c1fa2: retq Here, "sar" is gone, and we do exactly what was proposed. Notably, the CharAtBench performance has improved significantly. 
CharAtStreamBench: 12.79% 13.08% 0x00007f15f4ae1c80: mov %edi,%ecx 14.17% 13.69% 0x00007f15f4ae1c82: shl %ecx 13.43% 15.92% 0x00007f15f4ae1c84: cmp %ecx,%eax ; <--- WTF are these? 0x00007f15f4ae1c86: jb 0x00007f15f4ae1cc0 ; <--- 12.67% 12.05% 0x00007f15f4ae1c88: movzwl 0x10(%r11,%rcx,1),%ecx 13.41% 12.65% 0x00007f15f4ae1c8e: add %ecx,%edx 13.36% 12.79% 0x00007f15f4ae1c90: inc %edi 13.58% 13.36% 0x00007f15f4ae1c92: cmp %r10d,%edi 0x00007f15f4ae1c95: jl 0x00007f15f4ae1c80 ...still experiences the same problem. == Conclusion While the patch with Version 2 does what was suggested, it regresses the CharAtStreamBench performance (from about 1652 ns/op at baseline to about 1807 ns/op), because the new range check is not hoisted out of the loop.