--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/src/os_cpu/linux_arm/vm/linux_arm_64.s	2016-12-13 12:56:39.393102953 -0500
@@ -0,0 +1,542 @@
+#
+# Copyright (c) 2008, 2013, Oracle and/or its affiliates. All rights reserved.
+# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+#
+# This code is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License version 2 only, as
+# published by the Free Software Foundation.
+#
+# This code is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+# version 2 for more details (a copy is included in the LICENSE file that
+# accompanied this code).
+#
+# You should have received a copy of the GNU General Public License version
+# 2 along with this work; if not, write to the Free Software Foundation,
+# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+# Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+# or visit www.oracle.com if you need additional information or have any
+# questions.
+#
+
+        # TODO-AARCH64
+
+        # NOTE WELL! The _Copy functions are called directly
+        # from server-compiler-generated code via CallLeafNoFP,
+        # which means that they *must* either not use floating
+        # point or use it in the same manner as does the server
+        # compiler.
+
+        .globl _Copy_conjoint_bytes
+        .type _Copy_conjoint_bytes, %function
+        .globl _Copy_arrayof_conjoint_bytes
+        .type _Copy_arrayof_conjoint_bytes, %function
+        .globl _Copy_disjoint_words
+        .type _Copy_disjoint_words, %function
+        .globl _Copy_conjoint_words
+        .type _Copy_conjoint_words, %function
+        .globl _Copy_conjoint_jshorts_atomic
+        .type _Copy_conjoint_jshorts_atomic, %function
+        .globl _Copy_arrayof_conjoint_jshorts
+        .type _Copy_arrayof_conjoint_jshorts, %function
+        .globl _Copy_conjoint_jints_atomic
+        .type _Copy_conjoint_jints_atomic, %function
+        .globl _Copy_arrayof_conjoint_jints
+        .type _Copy_arrayof_conjoint_jints, %function
+        .globl _Copy_conjoint_jlongs_atomic
+        .type _Copy_conjoint_jlongs_atomic, %function
+        .globl _Copy_arrayof_conjoint_jlongs
+        .type _Copy_arrayof_conjoint_jlongs, %function
+
+        .text
+        .globl SpinPause
+        .type SpinPause, %function
+SpinPause:
+        yield
+        ret
+
+        # Support for void Copy::conjoint_bytes(void* from,
+        #                                       void* to,
+        #                                       size_t count)
+_Copy_conjoint_bytes:
+        hlt 1002
+
+        # Support for void Copy::arrayof_conjoint_bytes(void* from,
+        #                                               void* to,
+        #                                               size_t count)
+_Copy_arrayof_conjoint_bytes:
+        hlt 1003
+
+
+        # Support for void Copy::disjoint_words(void* from,
+        #                                       void* to,
+        #                                       size_t count)
+_Copy_disjoint_words:
+        # These and further memory prefetches may hit outside the array bounds.
+        # Experiments showed that prefetching inaccessible memory does not cause exceptions.
+        prfm pldl1keep, [x0, #0]
+        prfm pstl1keep, [x1, #0]
+        prfm pldl1keep, [x0, #64]
+        prfm pstl1keep, [x1, #64]
+
+        subs x18, x2, #128
+        b.ge dw_large
+
+dw_lt_128:
+        # Copy [x0, x0 + x2) to [x1, x1 + x2)
+
+        adr x15, dw_tail_table_base
+        and x16, x2, #~8
+
+        # Calculate the address to jump to and store it in x15:
+        # Each pair of instructions before dw_tail_table_base copies 16 bytes.
+        # x16 is the count of bytes to copy, aligned down by 16.
+        # So x16/16 pairs of instructions should be executed.
+        # Each pair takes 8 bytes, so x15 = dw_tail_table_base - (x16/16)*8 = x15 - x16/2
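+        #
+        # The same computation as a C sketch (commentary only, not assembled;
+        # "pairs" and "target" are illustrative names, not registers):
+        #   size_t pairs  = x16 / 16;                        // ldp/stp pairs to execute
+        #   void*  target = &&dw_tail_table_base - pairs*8;  // each pair is 8 bytes of code
+        #   goto *target;                                    // run only the last "pairs" pairs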
+        sub x15, x15, x16, lsr #1
+        prfm plil1keep, [x15]
+
+        add x17, x0, x2
+        add x18, x1, x2
+
+        # If x2 = x16 + 8, then copy 8 bytes and x16 bytes after that.
+        # Otherwise x2 = x16, so proceed to copy x16 bytes.
+        tbz x2, #3, dw_lt_128_even
+        ldr x3, [x0]
+        str x3, [x1]
+dw_lt_128_even:
+        # Copy [x17 - x16, x17) to [x18 - x16, x18)
+        # x16 is aligned by 16 and less than 128
+
+        # Execute (x16/16) ldp-stp pairs; each pair copies 16 bytes
+        br x15
+
+        ldp x3, x4, [x17, #-112]
+        stp x3, x4, [x18, #-112]
+        ldp x5, x6, [x17, #-96]
+        stp x5, x6, [x18, #-96]
+        ldp x7, x8, [x17, #-80]
+        stp x7, x8, [x18, #-80]
+        ldp x9, x10, [x17, #-64]
+        stp x9, x10, [x18, #-64]
+        ldp x11, x12, [x17, #-48]
+        stp x11, x12, [x18, #-48]
+        ldp x13, x14, [x17, #-32]
+        stp x13, x14, [x18, #-32]
+        ldp x15, x16, [x17, #-16]
+        stp x15, x16, [x18, #-16]
+dw_tail_table_base:
+        ret
+
+.p2align 6
+.rept 12
+        nop
+.endr
+dw_large:
+        # x18 >= 0;
+        # Copy [x0, x0 + x18 + 128) to [x1, x1 + x18 + 128)
+
+        ldp x3, x4, [x0], #64
+        ldp x5, x6, [x0, #-48]
+        ldp x7, x8, [x0, #-32]
+        ldp x9, x10, [x0, #-16]
+
+        # Before and after each iteration of the loop, registers x3-x10 contain [x0 - 64, x0),
+        # and x1 is the place to copy this data;
+        # x18 contains the number of bytes to be stored minus 128
+
+        # Exactly 16 instructions from p2align, so dw_loop starts from a cache line boundary
+        # Checking it explicitly by aligning with "hlt 1000" instructions
+.p2alignl 6, 0xd4407d00
+dw_loop:
+        prfm pldl1keep, [x0, #64]
+        # The next line actually hurt memory copy performance (for the interpreter) - see JDK-8078120
+        # prfm pstl1keep, [x1, #64]
+
+        subs x18, x18, #64
+
+        stp x3, x4, [x1, #0]
+        ldp x3, x4, [x0, #0]
+        stp x5, x6, [x1, #16]
+        ldp x5, x6, [x0, #16]
+        stp x7, x8, [x1, #32]
+        ldp x7, x8, [x0, #32]
+        stp x9, x10, [x1, #48]
+        ldp x9, x10, [x0, #48]
+
+        add x1, x1, #64
+        add x0, x0, #64
+
+        b.ge dw_loop
+
+        # 13 instructions from dw_loop, so the loop body fits into one cache line
+
+dw_loop_end:
+        adds x2, x18, #64
+
+        stp x3, x4, [x1], #64
+        stp x5, x6, [x1, #-48]
+        stp x7, x8, [x1, #-32]
+        stp x9, x10, [x1, #-16]
+
+        # x2 = x18 + 64; the 64 bytes just stored are accounted for, so x2 holds the exact number of bytes still to be stored
+
+        # If this number is not zero, also copy the remaining bytes
+        b.ne dw_lt_128
+        ret
+
+
+        # Support for void Copy::conjoint_words(void* from,
+        #                                       void* to,
+        #                                       size_t count)
+_Copy_conjoint_words:
+        subs x3, x1, x0
+        # hi condition is met <=> from < to
+        ccmp x2, x3, #0, hi
+        # hi condition is met <=> (from < to) and (to - from < count)
+        # otherwise _Copy_disjoint_words may be used, because it performs forward copying,
+        # so it also works when the ranges overlap but to <= from
+        b.ls _Copy_disjoint_words
+
+        # The overlapping case should be the rare one; it is not worth optimizing
+
+        ands x3, x2, #~8
+        # x3 is the count aligned down by 2*wordSize
+        add x0, x0, x2
+        add x1, x1, x2
+        sub x3, x3, #16
+        # Skip the loop if there are only 0 or 1 words to copy
+        b.eq cw_backward_loop_end
+
+        # x3 >= 0
+        # Copy [x0 - x3 - 16, x0) to [x1 - x3 - 16, x1) backward
+cw_backward_loop:
+        subs x3, x3, #16
+        ldp x4, x5, [x0, #-16]!
+        stp x4, x5, [x1, #-16]!
+        b.ge cw_backward_loop
+
+cw_backward_loop_end:
+        # Copy the remaining 0 or 1 words
+        tbz x2, #3, cw_finish
+        ldr x3, [x0, #-8]
+        str x3, [x1, #-8]
+
+cw_finish:
+        ret
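+
+        # The forward/backward dispatch above, as a C sketch (commentary only;
+        # "backward_copy" is an illustrative name for the loop above, and count
+        # is in bytes):
+        #   if (to > from && (size_t)((char*)to - (char*)from) < count)
+        #     backward_copy(from, to, count);          // destructive overlap
+        #   else
+        #     _Copy_disjoint_words(from, to, count);   // forward copy is safe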
+
+
+        # Support for void Copy::conjoint_jshorts_atomic(void* from,
+        #                                                void* to,
+        #                                                size_t count)
+_Copy_conjoint_jshorts_atomic:
+        add x17, x0, x2
+        add x18, x1, x2
+
+        subs x3, x1, x0
+        # after ccmp, the hi condition is met <=> (from < to) and (to - from < count)
+        ccmp x2, x3, #0, hi
+        b.hi cs_backward
+
+        subs x3, x2, #14
+        b.ge cs_forward_loop
+
+        # Copy x2 < 14 bytes from x0 to x1
+cs_forward_lt14:
+        ands x7, x2, #7
+        tbz x2, #3, cs_forward_lt8
+        ldrh w3, [x0, #0]
+        ldrh w4, [x0, #2]
+        ldrh w5, [x0, #4]
+        ldrh w6, [x0, #6]
+
+        strh w3, [x1, #0]
+        strh w4, [x1, #2]
+        strh w5, [x1, #4]
+        strh w6, [x1, #6]
+
+        # Copy x7 < 8 bytes from x17 - x7 to x18 - x7
+cs_forward_lt8:
+        b.eq cs_forward_0
+        cmp x7, #4
+        b.lt cs_forward_2
+        b.eq cs_forward_4
+
+cs_forward_6:
+        ldrh w3, [x17, #-6]
+        strh w3, [x18, #-6]
+cs_forward_4:
+        ldrh w4, [x17, #-4]
+        strh w4, [x18, #-4]
+cs_forward_2:
+        ldrh w5, [x17, #-2]
+        strh w5, [x18, #-2]
+cs_forward_0:
+        ret
+
+
+        # Copy [x0, x0 + x3 + 14) to [x1, x1 + x3 + 14)
+        # x3 >= 0
+.p2align 6
+cs_forward_loop:
+        subs x3, x3, #14
+
+        ldrh w4, [x0], #14
+        ldrh w5, [x0, #-12]
+        ldrh w6, [x0, #-10]
+        ldrh w7, [x0, #-8]
+        ldrh w8, [x0, #-6]
+        ldrh w9, [x0, #-4]
+        ldrh w10, [x0, #-2]
+
+        strh w4, [x1], #14
+        strh w5, [x1, #-12]
+        strh w6, [x1, #-10]
+        strh w7, [x1, #-8]
+        strh w8, [x1, #-6]
+        strh w9, [x1, #-4]
+        strh w10, [x1, #-2]
+
+        b.ge cs_forward_loop
+        # Exactly 16 instructions from cs_forward_loop, so the loop fits into one cache line
+
+        adds x2, x3, #14
+        # x2 bytes should be copied from x0 to x1
+        b.ne cs_forward_lt14
+        ret
+
+        # Very similar to forward copying
+cs_backward:
+        subs x3, x2, #14
+        b.ge cs_backward_loop
+
+cs_backward_lt14:
+        ands x7, x2, #7
+        tbz x2, #3, cs_backward_lt8
+
+        ldrh w3, [x17, #-8]
+        ldrh w4, [x17, #-6]
+        ldrh w5, [x17, #-4]
+        ldrh w6, [x17, #-2]
+
+        strh w3, [x18, #-8]
+        strh w4, [x18, #-6]
+        strh w5, [x18, #-4]
+        strh w6, [x18, #-2]
+
+cs_backward_lt8:
+        b.eq cs_backward_0
+        cmp x7, #4
+        b.lt cs_backward_2
+        b.eq cs_backward_4
+
+cs_backward_6:
+        ldrh w3, [x0, #4]
+        strh w3, [x1, #4]
+
+cs_backward_4:
+        ldrh w4, [x0, #2]
+        strh w4, [x1, #2]
+
+cs_backward_2:
+        ldrh w5, [x0, #0]
+        strh w5, [x1, #0]
+
+cs_backward_0:
+        ret
+
+
+.p2align 6
+cs_backward_loop:
+        subs x3, x3, #14
+
+        ldrh w4, [x17, #-14]!
+        ldrh w5, [x17, #2]
+        ldrh w6, [x17, #4]
+        ldrh w7, [x17, #6]
+        ldrh w8, [x17, #8]
+        ldrh w9, [x17, #10]
+        ldrh w10, [x17, #12]
+
+        strh w4, [x18, #-14]!
+        strh w5, [x18, #2]
+        strh w6, [x18, #4]
+        strh w7, [x18, #6]
+        strh w8, [x18, #8]
+        strh w9, [x18, #10]
+        strh w10, [x18, #12]
+
+        b.ge cs_backward_loop
+        adds x2, x3, #14
+        b.ne cs_backward_lt14
+        ret
+
+
+        # Support for void Copy::arrayof_conjoint_jshorts(void* from,
+        #                                                 void* to,
+        #                                                 size_t count)
+_Copy_arrayof_conjoint_jshorts:
+        hlt 1007
+
+
+        # Support for void Copy::conjoint_jlongs_atomic(jlong* from,
+        #                                               jlong* to,
+        #                                               size_t count)
+_Copy_conjoint_jlongs_atomic:
+_Copy_arrayof_conjoint_jlongs:
+        hlt 1009
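+
+        # Reference semantics for the stub above, which still traps via hlt
+        # (a C sketch, commentary only; count is assumed to be in bytes, as in
+        # the implemented routines, and the real stub must use single 64-bit
+        # loads and stores to preserve element-level atomicity):
+        #   void conjoint_jlongs_atomic(jlong* from, jlong* to, size_t count) {
+        #     size_t n = count / sizeof(jlong);
+        #     if (to < from)
+        #       for (size_t i = 0; i < n; i++) to[i] = from[i];   // forward
+        #     else
+        #       for (size_t i = n; i-- > 0; )  to[i] = from[i];   // backward
+        #   }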
+
+
+        # Support for void Copy::conjoint_jints_atomic(void* from,
+        #                                              void* to,
+        #                                              size_t count)
+_Copy_conjoint_jints_atomic:
+_Copy_arrayof_conjoint_jints:
+        # These and further memory prefetches may hit outside the array bounds.
+        # Experiments showed that prefetching inaccessible memory does not cause exceptions.
+        prfm pldl1keep, [x0, #0]
+        prfm pstl1keep, [x1, #0]
+        prfm pldl1keep, [x0, #32]
+        prfm pstl1keep, [x1, #32]
+
+        subs x3, x1, x0
+        # hi condition is met <=> from < to
+        ccmp x2, x3, #0, hi
+        # hi condition is met <=> (from < to) and (to - from < count)
+        b.hi ci_backward
+
+        subs x18, x2, #64
+        b.ge ci_forward_large
+
+ci_forward_lt_64:
+        # Copy [x0, x0 + x2) to [x1, x1 + x2)
+
+        adr x15, ci_forward_tail_table_base
+        and x16, x2, #~4
+
+        # Calculate the address to jump to and store it in x15:
+        # Each pair of instructions before ci_forward_tail_table_base copies 8 bytes.
+        # x16 is the count of bytes to copy, aligned down by 8.
+        # So x16/8 pairs of instructions should be executed.
+        # Each pair takes 8 bytes, so x15 = ci_forward_tail_table_base - (x16/8)*8 = x15 - x16
+        sub x15, x15, x16
+        prfm plil1keep, [x15]
+
+        add x17, x0, x2
+        add x18, x1, x2
+
+        # If x2 = x16 + 4, then copy 4 bytes and x16 bytes after that.
+        # Otherwise x2 = x16, so proceed to copy x16 bytes.
+        tbz x2, #2, ci_forward_lt_64_even
+        ldr w3, [x0]
+        str w3, [x1]
+ci_forward_lt_64_even:
+        # Copy [x17 - x16, x17) to [x18 - x16, x18)
+        # x16 is aligned by 8 and less than 64
+
+        # Execute (x16/8) ldp-stp pairs; each pair copies 8 bytes
+        br x15
+
+        ldp w3, w4, [x17, #-56]
+        stp w3, w4, [x18, #-56]
+        ldp w5, w6, [x17, #-48]
+        stp w5, w6, [x18, #-48]
+        ldp w7, w8, [x17, #-40]
+        stp w7, w8, [x18, #-40]
+        ldp w9, w10, [x17, #-32]
+        stp w9, w10, [x18, #-32]
+        ldp w11, w12, [x17, #-24]
+        stp w11, w12, [x18, #-24]
+        ldp w13, w14, [x17, #-16]
+        stp w13, w14, [x18, #-16]
+        ldp w15, w16, [x17, #-8]
+        stp w15, w16, [x18, #-8]
+ci_forward_tail_table_base:
+        ret
+
+.p2align 6
+.rept 12
+        nop
+.endr
+ci_forward_large:
+        # x18 >= 0;
+        # Copy [x0, x0 + x18 + 64) to [x1, x1 + x18 + 64)
+
+        ldp w3, w4, [x0], #32
+        ldp w5, w6, [x0, #-24]
+        ldp w7, w8, [x0, #-16]
+        ldp w9, w10, [x0, #-8]
+
+        # Before and after each iteration of the loop, registers w3-w10 contain [x0 - 32, x0),
+        # and x1 is the place to copy this data;
+        # x18 contains the number of bytes to be stored minus 64
+
+        # Exactly 16 instructions from p2align, so ci_forward_loop starts from a cache line boundary
+        # Checking it explicitly by aligning with "hlt 1000" instructions
+.p2alignl 6, 0xd4407d00
+ci_forward_loop:
+        prfm pldl1keep, [x0, #32]
+        prfm pstl1keep, [x1, #32]
+
+        subs x18, x18, #32
+
+        stp w3, w4, [x1, #0]
+        ldp w3, w4, [x0, #0]
+        stp w5, w6, [x1, #8]
+        ldp w5, w6, [x0, #8]
+        stp w7, w8, [x1, #16]
+        ldp w7, w8, [x0, #16]
+        stp w9, w10, [x1, #24]
+        ldp w9, w10, [x0, #24]
+
+        add x1, x1, #32
+        add x0, x0, #32
+
+        b.ge ci_forward_loop
+
+        # 14 instructions from ci_forward_loop, so the loop body fits into one cache line
+
+ci_forward_loop_end:
+        adds x2, x18, #32
+
+        stp w3, w4, [x1], #32
+        stp w5, w6, [x1, #-24]
+        stp w7, w8, [x1, #-16]
+        stp w9, w10, [x1, #-8]
+
+        # x2 = x18 + 32; the 32 bytes just stored are accounted for, so x2 holds the exact number of bytes still to be stored
+
+        # If this number is not zero, also copy the remaining bytes
+        b.ne ci_forward_lt_64
+        ret
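+
+        # The backward path below, as a C sketch (commentary only; count is in
+        # bytes and a multiple of jintSize, matching the comments below):
+        #   jint* s = (jint*)((char*)from + count);
+        #   jint* d = (jint*)((char*)to   + count);
+        #   while (s != (jint*)from)
+        #     *--d = *--s;                 // copy high-to-low, 32 bits at a time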
+
+ci_backward:
+
+        # The overlapping case should be the rare one; it is not worth optimizing
+
+        ands x3, x2, #~4
+        # x3 is the count aligned down by 2*jintSize
+        add x0, x0, x2
+        add x1, x1, x2
+        sub x3, x3, #8
+        # Skip the loop if there are only 0 or 1 jints to copy
+        b.eq ci_backward_loop_end
+
+        # x3 >= 0
+        # Copy [x0 - x3 - 8, x0) to [x1 - x3 - 8, x1) backward
+ci_backward_loop:
+        subs x3, x3, #8
+        ldp w4, w5, [x0, #-8]!
+        stp w4, w5, [x1, #-8]!
+        b.ge ci_backward_loop
+
+ci_backward_loop_end:
+        # Copy the remaining 0 or 1 jints
+        tbz x2, #2, ci_backward_finish
+        ldr w3, [x0, #-4]
+        str w3, [x1, #-4]
+
+ci_backward_finish:
+        ret
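+
+        # A minimal host-side sanity check, useful while filling in the
+        # remaining hlt stubs (a C sketch, commentary only; "copy_fn" stands
+        # for any routine above, with count in bytes):
+        #   unsigned char src[256], dst[256];
+        #   for (size_t n = 0; n <= 128; n += 8) {
+        #     for (size_t i = 0; i < n; i++) src[i] = (unsigned char)i;
+        #     copy_fn(src, dst, n);
+        #     assert(memcmp(src, dst, n) == 0);
+        #   }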