--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/src/os_cpu/linux_arm/vm/linux_arm_64.s	2016-12-13 12:56:39.393102953 -0500
@@ -0,0 +1,542 @@
+#
+# Copyright (c) 2008, 2013, Oracle and/or its affiliates. All rights reserved.
+# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+#
+# This code is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License version 2 only, as
+# published by the Free Software Foundation.
+#
+# This code is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+# version 2 for more details (a copy is included in the LICENSE file that
+# accompanied this code).
+#
+# You should have received a copy of the GNU General Public License version
+# 2 along with this work; if not, write to the Free Software Foundation,
+# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+# Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+# or visit www.oracle.com if you need additional information or have any
+# questions.
+#
+
+        # TODO-AARCH64
+
+        # NOTE WELL! The _Copy functions are called directly
+        # from server-compiler-generated code via CallLeafNoFP,
+        # which means that they *must* either not use floating
+        # point or use it in the same manner as does the server
+        # compiler.
+
+        .globl _Copy_conjoint_bytes
+        .type _Copy_conjoint_bytes, %function
+        .globl _Copy_arrayof_conjoint_bytes
+        .type _Copy_arrayof_conjoint_bytes, %function
+        .globl _Copy_disjoint_words
+        .type _Copy_disjoint_words, %function
+        .globl _Copy_conjoint_words
+        .type _Copy_conjoint_words, %function
+        .globl _Copy_conjoint_jshorts_atomic
+        .type _Copy_conjoint_jshorts_atomic, %function
+        .globl _Copy_arrayof_conjoint_jshorts
+        .type _Copy_arrayof_conjoint_jshorts, %function
+        .globl _Copy_conjoint_jints_atomic
+        .type _Copy_conjoint_jints_atomic, %function
+        .globl _Copy_arrayof_conjoint_jints
+        .type _Copy_arrayof_conjoint_jints, %function
+        .globl _Copy_conjoint_jlongs_atomic
+        .type _Copy_conjoint_jlongs_atomic, %function
+        .globl _Copy_arrayof_conjoint_jlongs
+        .type _Copy_arrayof_conjoint_jlongs, %function
+
+        .text
+        .globl SpinPause
+        .type SpinPause, %function
+SpinPause:
+        yield
+        ret
+
+        # Support for void Copy::conjoint_bytes(void* from,
+        #                                       void* to,
+        #                                       size_t count)
+_Copy_conjoint_bytes:
+        hlt 1002
+
+        # Support for void Copy::arrayof_conjoint_bytes(void* from,
+        #                                               void* to,
+        #                                               size_t count)
+_Copy_arrayof_conjoint_bytes:
+        hlt 1003
+
+
+        # Support for void Copy::disjoint_words(void* from,
+        #                                       void* to,
+        #                                       size_t count)
+_Copy_disjoint_words:
+        # These and further memory prefetches may hit outside the array bounds.
+        # Experiments showed that prefetching inaccessible memory does not cause exceptions.
+        prfm pldl1keep, [x0, #0]
+        prfm pstl1keep, [x1, #0]
+        prfm pldl1keep, [x0, #64]
+        prfm pstl1keep, [x1, #64]
+
+        subs x18, x2, #128
+        b.ge dw_large
+
+dw_lt_128:
+        # Copy [x0, x0 + x2) to [x1, x1 + x2)
+
+        adr x15, dw_tail_table_base
+        and x16, x2, #~8
+
+        # Calculate the address to jump to and store it in x15:
+        # Each pair of instructions before dw_tail_table_base copies 16 bytes.
+        # x16 is the count of bytes to copy, aligned down by 16.
+        # So x16/16 pairs of instructions should be executed.
+        # Each pair takes 8 bytes, so x15 = dw_tail_table_base - (x16/16)*8 = x15 - x16/2
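+        #
+        # The same computation as a C sketch (commentary only, not assembled;
+        # "pairs" and "target" are illustrative names, not registers):
+        #   size_t pairs  = x16 / 16;                        // ldp/stp pairs to execute
+        #   void*  target = &&dw_tail_table_base - pairs*8;  // each pair is 8 bytes of code
+        #   goto *target;                                    // run only the last "pairs" pairs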
+        sub x15, x15, x16, lsr #1
+        prfm plil1keep, [x15]
+
+        add x17, x0, x2
+        add x18, x1, x2
+
+        # If x2 = x16 + 8, then copy 8 bytes and x16 bytes after that.
+        # Otherwise x2 = x16, so proceed to copy x16 bytes.
+        tbz x2, #3, dw_lt_128_even
+        ldr x3, [x0]
+        str x3, [x1]
+dw_lt_128_even:
+        # Copy [x17 - x16, x17) to [x18 - x16, x18)
+        # x16 is aligned by 16 and less than 128
+
+        # Execute (x16/16) ldp-stp pairs; each pair copies 16 bytes
+        br x15
+
+        ldp x3, x4, [x17, #-112]
+        stp x3, x4, [x18, #-112]
+        ldp x5, x6, [x17, #-96]
+        stp x5, x6, [x18, #-96]
+        ldp x7, x8, [x17, #-80]
+        stp x7, x8, [x18, #-80]
+        ldp x9, x10, [x17, #-64]
+        stp x9, x10, [x18, #-64]
+        ldp x11, x12, [x17, #-48]
+        stp x11, x12, [x18, #-48]
+        ldp x13, x14, [x17, #-32]
+        stp x13, x14, [x18, #-32]
+        ldp x15, x16, [x17, #-16]
+        stp x15, x16, [x18, #-16]
+dw_tail_table_base:
+        ret
+
+.p2align 6
+.rept 12
+        nop
+.endr
+dw_large:
+        # x18 >= 0;
+        # Copy [x0, x0 + x18 + 128) to [x1, x1 + x18 + 128)
+
+        ldp x3, x4, [x0], #64
+        ldp x5, x6, [x0, #-48]
+        ldp x7, x8, [x0, #-32]
+        ldp x9, x10, [x0, #-16]
+
+        # Before and after each iteration of the loop, registers x3-x10 contain [x0 - 64, x0),
+        # and x1 is the place to copy this data;
+        # x18 contains the number of bytes to be stored minus 128
+
+        # Exactly 16 instructions from p2align, so dw_loop starts from a cache line boundary
+        # Checking it explicitly by aligning with "hlt 1000" instructions
+.p2alignl 6, 0xd4407d00
+dw_loop:
+        prfm pldl1keep, [x0, #64]
+        # The next line actually hurt memory copy performance (for the interpreter) - see JDK-8078120
+        # prfm pstl1keep, [x1, #64]
+
+        subs x18, x18, #64
+
+        stp x3, x4, [x1, #0]
+        ldp x3, x4, [x0, #0]
+        stp x5, x6, [x1, #16]
+        ldp x5, x6, [x0, #16]
+        stp x7, x8, [x1, #32]
+        ldp x7, x8, [x0, #32]
+        stp x9, x10, [x1, #48]
+        ldp x9, x10, [x0, #48]
+
+        add x1, x1, #64
+        add x0, x0, #64
+
+        b.ge dw_loop
+
+        # 13 instructions from dw_loop, so the loop body fits into one cache line
+
+dw_loop_end:
+        adds x2, x18, #64
+
+        stp x3, x4, [x1], #64
+        stp x5, x6, [x1, #-48]
+        stp x7, x8, [x1, #-32]
+        stp x9, x10, [x1, #-16]
+
+        # x2 = x18 + 64; the 64 bytes just stored are accounted for, so x2 holds the exact number of bytes still to be stored
+
+        # If this number is not zero, also copy the remaining bytes
+        b.ne dw_lt_128
+        ret
+
+
+        # Support for void Copy::conjoint_words(void* from,
+        #                                       void* to,
+        #                                       size_t count)
+_Copy_conjoint_words:
+        subs x3, x1, x0
+        # hi condition is met <=> from < to
+        ccmp x2, x3, #0, hi
+        # hi condition is met <=> (from < to) and (to - from < count)
+        # otherwise _Copy_disjoint_words may be used, because it performs forward copying,
+        # so it also works when the ranges overlap but to <= from
+        b.ls _Copy_disjoint_words
+
+        # The overlapping case should be the rare one; it is not worth optimizing
+
+        ands x3, x2, #~8
+        # x3 is the count aligned down by 2*wordSize
+        add x0, x0, x2
+        add x1, x1, x2
+        sub x3, x3, #16
+        # Skip the loop if there are only 0 or 1 words to copy
+        b.eq cw_backward_loop_end
+
+        # x3 >= 0
+        # Copy [x0 - x3 - 16, x0) to [x1 - x3 - 16, x1) backward
+cw_backward_loop:
+        subs x3, x3, #16
+        ldp x4, x5, [x0, #-16]!
+        stp x4, x5, [x1, #-16]!
+        b.ge cw_backward_loop
+
+cw_backward_loop_end:
+        # Copy the remaining 0 or 1 words
+        tbz x2, #3, cw_finish
+        ldr x3, [x0, #-8]
+        str x3, [x1, #-8]
+
+cw_finish:
+        ret
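+
+        # The forward/backward dispatch above, as a C sketch (commentary only;
+        # "backward_copy" is an illustrative name for the loop above, and count
+        # is in bytes):
+        #   if (to > from && (size_t)((char*)to - (char*)from) < count)
+        #     backward_copy(from, to, count);          // destructive overlap
+        #   else
+        #     _Copy_disjoint_words(from, to, count);   // forward copy is safe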
+
+
+        # Support for void Copy::conjoint_jshorts_atomic(void* from,
+        #                                                void* to,
+        #                                                size_t count)
+_Copy_conjoint_jshorts_atomic:
+        add x17, x0, x2
+        add x18, x1, x2
+
+        subs x3, x1, x0
+        # after ccmp, the hi condition is met <=> (from < to) and (to - from < count)
+        ccmp x2, x3, #0, hi
+        b.hi cs_backward
+
+        subs x3, x2, #14
+        b.ge cs_forward_loop
+
+        # Copy x2 < 14 bytes from x0 to x1
+cs_forward_lt14:
+        ands x7, x2, #7
+        tbz x2, #3, cs_forward_lt8
+        ldrh w3, [x0, #0]
+        ldrh w4, [x0, #2]
+        ldrh w5, [x0, #4]
+        ldrh w6, [x0, #6]
+
+        strh w3, [x1, #0]
+        strh w4, [x1, #2]
+        strh w5, [x1, #4]
+        strh w6, [x1, #6]
+
+        # Copy x7 < 8 bytes from x17 - x7 to x18 - x7
+cs_forward_lt8:
+        b.eq cs_forward_0
+        cmp x7, #4
+        b.lt cs_forward_2
+        b.eq cs_forward_4
+
+cs_forward_6:
+        ldrh w3, [x17, #-6]
+        strh w3, [x18, #-6]
+cs_forward_4:
+        ldrh w4, [x17, #-4]
+        strh w4, [x18, #-4]
+cs_forward_2:
+        ldrh w5, [x17, #-2]
+        strh w5, [x18, #-2]
+cs_forward_0:
+        ret
+
+
+        # Copy [x0, x0 + x3 + 14) to [x1, x1 + x3 + 14)
+        # x3 >= 0
+.p2align 6
+cs_forward_loop:
+        subs x3, x3, #14
+
+        ldrh w4, [x0], #14
+        ldrh w5, [x0, #-12]
+        ldrh w6, [x0, #-10]
+        ldrh w7, [x0, #-8]
+        ldrh w8, [x0, #-6]
+        ldrh w9, [x0, #-4]
+        ldrh w10, [x0, #-2]
+
+        strh w4, [x1], #14
+        strh w5, [x1, #-12]
+        strh w6, [x1, #-10]
+        strh w7, [x1, #-8]
+        strh w8, [x1, #-6]
+        strh w9, [x1, #-4]
+        strh w10, [x1, #-2]
+
+        b.ge cs_forward_loop
+        # Exactly 16 instructions from cs_forward_loop, so the loop fits into one cache line
+
+        adds x2, x3, #14
+        # x2 bytes should be copied from x0 to x1
+        b.ne cs_forward_lt14
+        ret
+
+        # Very similar to forward copying
+cs_backward:
+        subs x3, x2, #14
+        b.ge cs_backward_loop
+
+cs_backward_lt14:
+        ands x7, x2, #7
+        tbz x2, #3, cs_backward_lt8
+
+        ldrh w3, [x17, #-8]
+        ldrh w4, [x17, #-6]
+        ldrh w5, [x17, #-4]
+        ldrh w6, [x17, #-2]
+
+        strh w3, [x18, #-8]
+        strh w4, [x18, #-6]
+        strh w5, [x18, #-4]
+        strh w6, [x18, #-2]
+
+cs_backward_lt8:
+        b.eq cs_backward_0
+        cmp x7, #4
+        b.lt cs_backward_2
+        b.eq cs_backward_4
+
+cs_backward_6:
+        ldrh w3, [x0, #4]
+        strh w3, [x1, #4]
+
+cs_backward_4:
+        ldrh w4, [x0, #2]
+        strh w4, [x1, #2]
+
+cs_backward_2:
+        ldrh w5, [x0, #0]
+        strh w5, [x1, #0]
+
+cs_backward_0:
+        ret
+
+
+.p2align 6
+cs_backward_loop:
+        subs x3, x3, #14
+
+        ldrh w4, [x17, #-14]!
+        ldrh w5, [x17, #2]
+        ldrh w6, [x17, #4]
+        ldrh w7, [x17, #6]
+        ldrh w8, [x17, #8]
+        ldrh w9, [x17, #10]
+        ldrh w10, [x17, #12]
+
+        strh w4, [x18, #-14]!
+        strh w5, [x18, #2]
+        strh w6, [x18, #4]
+        strh w7, [x18, #6]
+        strh w8, [x18, #8]
+        strh w9, [x18, #10]
+        strh w10, [x18, #12]
+
+        b.ge cs_backward_loop
+        adds x2, x3, #14
+        b.ne cs_backward_lt14
+        ret
+
+
+        # Support for void Copy::arrayof_conjoint_jshorts(void* from,
+        #                                                 void* to,
+        #                                                 size_t count)
+_Copy_arrayof_conjoint_jshorts:
+        hlt 1007
+
+
+        # Support for void Copy::conjoint_jlongs_atomic(jlong* from,
+        #                                               jlong* to,
+        #                                               size_t count)
+_Copy_conjoint_jlongs_atomic:
+_Copy_arrayof_conjoint_jlongs:
+        hlt 1009
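+
+        # Reference semantics for the stub above, which still traps via hlt
+        # (a C sketch, commentary only; count is assumed to be in bytes, as in
+        # the implemented routines, and the real stub must use single 64-bit
+        # loads and stores to preserve element-level atomicity):
+        #   void conjoint_jlongs_atomic(jlong* from, jlong* to, size_t count) {
+        #     size_t n = count / sizeof(jlong);
+        #     if (to < from)
+        #       for (size_t i = 0; i < n; i++) to[i] = from[i];   // forward
+        #     else
+        #       for (size_t i = n; i-- > 0; )  to[i] = from[i];   // backward
+        #   }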
+
+
+        # Support for void Copy::conjoint_jints_atomic(void* from,
+        #                                              void* to,
+        #                                              size_t count)
+_Copy_conjoint_jints_atomic:
+_Copy_arrayof_conjoint_jints:
+        # These and further memory prefetches may hit outside the array bounds.
+        # Experiments showed that prefetching inaccessible memory does not cause exceptions.
+        prfm pldl1keep, [x0, #0]
+        prfm pstl1keep, [x1, #0]
+        prfm pldl1keep, [x0, #32]
+        prfm pstl1keep, [x1, #32]
+
+        subs x3, x1, x0
+        # hi condition is met <=> from < to
+        ccmp x2, x3, #0, hi
+        # hi condition is met <=> (from < to) and (to - from < count)
+        b.hi ci_backward
+
+        subs x18, x2, #64
+        b.ge ci_forward_large
+
+ci_forward_lt_64:
+        # Copy [x0, x0 + x2) to [x1, x1 + x2)
+
+        adr x15, ci_forward_tail_table_base
+        and x16, x2, #~4
+
+        # Calculate the address to jump to and store it in x15:
+        # Each pair of instructions before ci_forward_tail_table_base copies 8 bytes.
+        # x16 is the count of bytes to copy, aligned down by 8.
+        # So x16/8 pairs of instructions should be executed.
+        # Each pair takes 8 bytes, so x15 = ci_forward_tail_table_base - (x16/8)*8 = x15 - x16
+        sub x15, x15, x16
+        prfm plil1keep, [x15]
+
+        add x17, x0, x2
+        add x18, x1, x2
+
+        # If x2 = x16 + 4, then copy 4 bytes and x16 bytes after that.
+        # Otherwise x2 = x16, so proceed to copy x16 bytes.
+        tbz x2, #2, ci_forward_lt_64_even
+        ldr w3, [x0]
+        str w3, [x1]
+ci_forward_lt_64_even:
+        # Copy [x17 - x16, x17) to [x18 - x16, x18)
+        # x16 is aligned by 8 and less than 64
+
+        # Execute (x16/8) ldp-stp pairs; each pair copies 8 bytes
+        br x15
+
+        ldp w3, w4, [x17, #-56]
+        stp w3, w4, [x18, #-56]
+        ldp w5, w6, [x17, #-48]
+        stp w5, w6, [x18, #-48]
+        ldp w7, w8, [x17, #-40]
+        stp w7, w8, [x18, #-40]
+        ldp w9, w10, [x17, #-32]
+        stp w9, w10, [x18, #-32]
+        ldp w11, w12, [x17, #-24]
+        stp w11, w12, [x18, #-24]
+        ldp w13, w14, [x17, #-16]
+        stp w13, w14, [x18, #-16]
+        ldp w15, w16, [x17, #-8]
+        stp w15, w16, [x18, #-8]
+ci_forward_tail_table_base:
+        ret
+
+.p2align 6
+.rept 12
+        nop
+.endr
+ci_forward_large:
+        # x18 >= 0;
+        # Copy [x0, x0 + x18 + 64) to [x1, x1 + x18 + 64)
+
+        ldp w3, w4, [x0], #32
+        ldp w5, w6, [x0, #-24]
+        ldp w7, w8, [x0, #-16]
+        ldp w9, w10, [x0, #-8]
+
+        # Before and after each iteration of the loop, registers w3-w10 contain [x0 - 32, x0),
+        # and x1 is the place to copy this data;
+        # x18 contains the number of bytes to be stored minus 64
+
+        # Exactly 16 instructions from p2align, so ci_forward_loop starts from a cache line boundary
+        # Checking it explicitly by aligning with "hlt 1000" instructions
+.p2alignl 6, 0xd4407d00
+ci_forward_loop:
+        prfm pldl1keep, [x0, #32]
+        prfm pstl1keep, [x1, #32]
+
+        subs x18, x18, #32
+
+        stp w3, w4, [x1, #0]
+        ldp w3, w4, [x0, #0]
+        stp w5, w6, [x1, #8]
+        ldp w5, w6, [x0, #8]
+        stp w7, w8, [x1, #16]
+        ldp w7, w8, [x0, #16]
+        stp w9, w10, [x1, #24]
+        ldp w9, w10, [x0, #24]
+
+        add x1, x1, #32
+        add x0, x0, #32
+
+        b.ge ci_forward_loop
+
+        # 14 instructions from ci_forward_loop, so the loop body fits into one cache line
+
+ci_forward_loop_end:
+        adds x2, x18, #32
+
+        stp w3, w4, [x1], #32
+        stp w5, w6, [x1, #-24]
+        stp w7, w8, [x1, #-16]
+        stp w9, w10, [x1, #-8]
+
+        # x2 = x18 + 32; the 32 bytes just stored are accounted for, so x2 holds the exact number of bytes still to be stored
+
+        # If this number is not zero, also copy the remaining bytes
+        b.ne ci_forward_lt_64
+        ret
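+
+        # The backward path below, as a C sketch (commentary only; count is in
+        # bytes and a multiple of jintSize, matching the comments below):
+        #   jint* s = (jint*)((char*)from + count);
+        #   jint* d = (jint*)((char*)to   + count);
+        #   while (s != (jint*)from)
+        #     *--d = *--s;                 // copy high-to-low, 32 bits at a time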
+
+ci_backward:
+
+        # The overlapping case should be the rare one; it is not worth optimizing
+
+        ands x3, x2, #~4
+        # x3 is the count aligned down by 2*jintSize
+        add x0, x0, x2
+        add x1, x1, x2
+        sub x3, x3, #8
+        # Skip the loop if there are only 0 or 1 jints to copy
+        b.eq ci_backward_loop_end
+
+        # x3 >= 0
+        # Copy [x0 - x3 - 8, x0) to [x1 - x3 - 8, x1) backward
+ci_backward_loop:
+        subs x3, x3, #8
+        ldp w4, w5, [x0, #-8]!
+        stp w4, w5, [x1, #-8]!
+        b.ge ci_backward_loop
+
+ci_backward_loop_end:
+        # Copy the remaining 0 or 1 jints
+        tbz x2, #2, ci_backward_finish
+        ldr w3, [x0, #-4]
+        str w3, [x1, #-4]
+
+ci_backward_finish:
+        ret
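+
+        # A minimal host-side sanity check, useful while filling in the
+        # remaining hlt stubs (a C sketch, commentary only; "copy_fn" stands
+        # for any routine above, with count in bytes):
+        #   unsigned char src[256], dst[256];
+        #   for (size_t n = 0; n <= 128; n += 8) {
+        #     for (size_t i = 0; i < n; i++) src[i] = (unsigned char)i;
+        #     copy_fn(src, dst, n);
+        #     assert(memcmp(src, dst, n) == 0);
+        #   }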