--- old/src/os_cpu/linux_x86/vm/linux_x86_64.s	Tue Sep 13 12:29:13 2011
+++ /dev/null	Tue Sep 13 12:28:49 2011
@@ -1,402 +0,0 @@
-#
-# Copyright (c) 2004, 2007, Oracle and/or its affiliates. All rights reserved.
-# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
-#
-# This code is free software; you can redistribute it and/or modify it
-# under the terms of the GNU General Public License version 2 only, as
-# published by the Free Software Foundation.
-#
-# This code is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-# version 2 for more details (a copy is included in the LICENSE file that
-# accompanied this code).
-#
-# You should have received a copy of the GNU General Public License version
-# 2 along with this work; if not, write to the Free Software Foundation,
-# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
-#
-# Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
-# or visit www.oracle.com if you need additional information or have any
-# questions.
-#
-
-        # NOTE WELL! The _Copy functions are called directly
-        # from server-compiler-generated code via CallLeafNoFP,
-        # which means that they *must* either not use floating
-        # point or use it in the same manner as does the server
-        # compiler.
-
-        .globl _Copy_arrayof_conjoint_bytes
-        .globl _Copy_arrayof_conjoint_jshorts
-        .globl _Copy_conjoint_jshorts_atomic
-        .globl _Copy_arrayof_conjoint_jints
-        .globl _Copy_conjoint_jints_atomic
-        .globl _Copy_arrayof_conjoint_jlongs
-        .globl _Copy_conjoint_jlongs_atomic
-
-        .text
-
-        .globl SafeFetch32, Fetch32PFI, Fetch32Resume
-        .align 16
-        .type SafeFetch32,@function
-        // Prototype: int SafeFetch32 (int * Adr, int ErrValue)
-SafeFetch32:
-        movl %esi, %eax
-Fetch32PFI:
-        movl (%rdi), %eax
-Fetch32Resume:
-        ret
-
-        .globl SafeFetchN, FetchNPFI, FetchNResume
-        .align 16
-        .type SafeFetchN,@function
-        // Prototype: intptr_t SafeFetchN (intptr_t * Adr, intptr_t ErrValue)
-SafeFetchN:
-        movq %rsi, %rax
-FetchNPFI:
-        movq (%rdi), %rax
-FetchNResume:
-        ret
-
-        .globl SpinPause
-        .align 16
-        .type SpinPause,@function
-SpinPause:
-        rep
-        nop
-        movq $1, %rax
-        ret
-
-        # Support for void Copy::arrayof_conjoint_bytes(void* from,
-        #                                               void* to,
-        #                                               size_t count)
-        # rdi - from
-        # rsi - to
-        # rdx - count, treated as ssize_t
-        #
-        .p2align 4,,15
-        .type _Copy_arrayof_conjoint_bytes,@function
-_Copy_arrayof_conjoint_bytes:
-        movq %rdx,%r8            # byte count
-        shrq $3,%rdx             # qword count
-        cmpq %rdi,%rsi
-        leaq -1(%rdi,%r8,1),%rax # from + bcount*1 - 1
-        jbe acb_CopyRight
-        cmpq %rax,%rsi
-        jbe acb_CopyLeft
-acb_CopyRight:
-        leaq -8(%rdi,%rdx,8),%rax # from + qcount*8 - 8
-        leaq -8(%rsi,%rdx,8),%rcx # to + qcount*8 - 8
-        negq %rdx
-        jmp 7f
-        .p2align 4,,15
-1:      movq 8(%rax,%rdx,8),%rsi
-        movq %rsi,8(%rcx,%rdx,8)
-        addq $1,%rdx
-        jnz 1b
-2:      testq $4,%r8             # check for trailing dword
-        jz 3f
-        movl 8(%rax),%esi        # copy trailing dword
-        movl %esi,8(%rcx)
-        addq $4,%rax
-        addq $4,%rcx             # original %rsi is trashed, so we
-                                 # can't use it as a base register
-3:      testq $2,%r8             # check for trailing word
-        jz 4f
-        movw 8(%rax),%si         # copy trailing word
-        movw %si,8(%rcx)
-        addq $2,%rcx
-4:      testq $1,%r8             # check for trailing byte
-        jz 5f
-        movb -1(%rdi,%r8,1),%al  # copy trailing byte
-        movb %al,8(%rcx)
-5:      ret
-        .p2align 4,,15
-6:      movq -24(%rax,%rdx,8),%rsi
-        movq %rsi,-24(%rcx,%rdx,8)
-        movq -16(%rax,%rdx,8),%rsi
-        movq %rsi,-16(%rcx,%rdx,8)
-        movq -8(%rax,%rdx,8),%rsi
-        movq %rsi,-8(%rcx,%rdx,8)
-        movq (%rax,%rdx,8),%rsi
-        movq %rsi,(%rcx,%rdx,8)
-7:      addq $4,%rdx
-        jle 6b
-        subq $4,%rdx
-        jl 1b
-        jmp 2b
-acb_CopyLeft:
-        testq $1,%r8             # check for trailing byte
-        jz 1f
-        movb -1(%rdi,%r8,1),%cl  # copy trailing byte
-        movb %cl,-1(%rsi,%r8,1)
-        subq $1,%r8              # adjust for possible trailing word
-1:      testq $2,%r8             # check for trailing word
-        jz 2f
-        movw -2(%rdi,%r8,1),%cx  # copy trailing word
-        movw %cx,-2(%rsi,%r8,1)
-2:      testq $4,%r8             # check for trailing dword
-        jz 5f
-        movl (%rdi,%rdx,8),%ecx  # copy trailing dword
-        movl %ecx,(%rsi,%rdx,8)
-        jmp 5f
-        .p2align 4,,15
-3:      movq -8(%rdi,%rdx,8),%rcx
-        movq %rcx,-8(%rsi,%rdx,8)
-        subq $1,%rdx
-        jnz 3b
-        ret
-        .p2align 4,,15
-4:      movq 24(%rdi,%rdx,8),%rcx
-        movq %rcx,24(%rsi,%rdx,8)
-        movq 16(%rdi,%rdx,8),%rcx
-        movq %rcx,16(%rsi,%rdx,8)
-        movq 8(%rdi,%rdx,8),%rcx
-        movq %rcx,8(%rsi,%rdx,8)
-        movq (%rdi,%rdx,8),%rcx
-        movq %rcx,(%rsi,%rdx,8)
-5:      subq $4,%rdx
-        jge 4b
-        addq $4,%rdx
-        jg 3b
-        ret
-
-        # Support for void Copy::arrayof_conjoint_jshorts(void* from,
-        #                                                 void* to,
-        #                                                 size_t count)
-        # Equivalent to
-        #   conjoint_jshorts_atomic
-        #
-        # If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
-        # let the hardware handle it. The tow or four words within dwords
-        # or qwords that span cache line boundaries will still be loaded
-        # and stored atomically.
-        #
-        # rdi - from
-        # rsi - to
-        # rdx - count, treated as ssize_t
-        #
-        .p2align 4,,15
-        .type _Copy_arrayof_conjoint_jshorts,@function
-        .type _Copy_conjoint_jshorts_atomic,@function
-_Copy_arrayof_conjoint_jshorts:
-_Copy_conjoint_jshorts_atomic:
-        movq %rdx,%r8            # word count
-        shrq $2,%rdx             # qword count
-        cmpq %rdi,%rsi
-        leaq -2(%rdi,%r8,2),%rax # from + wcount*2 - 2
-        jbe acs_CopyRight
-        cmpq %rax,%rsi
-        jbe acs_CopyLeft
-acs_CopyRight:
-        leaq -8(%rdi,%rdx,8),%rax # from + qcount*8 - 8
-        leaq -8(%rsi,%rdx,8),%rcx # to + qcount*8 - 8
-        negq %rdx
-        jmp 6f
-1:      movq 8(%rax,%rdx,8),%rsi
-        movq %rsi,8(%rcx,%rdx,8)
-        addq $1,%rdx
-        jnz 1b
-2:      testq $2,%r8             # check for trailing dword
-        jz 3f
-        movl 8(%rax),%esi        # copy trailing dword
-        movl %esi,8(%rcx)
-        addq $4,%rcx             # original %rsi is trashed, so we
-                                 # can't use it as a base register
-3:      testq $1,%r8             # check for trailing word
-        jz 4f
-        movw -2(%rdi,%r8,2),%si  # copy trailing word
-        movw %si,8(%rcx)
-4:      ret
-        .p2align 4,,15
-5:      movq -24(%rax,%rdx,8),%rsi
-        movq %rsi,-24(%rcx,%rdx,8)
-        movq -16(%rax,%rdx,8),%rsi
-        movq %rsi,-16(%rcx,%rdx,8)
-        movq -8(%rax,%rdx,8),%rsi
-        movq %rsi,-8(%rcx,%rdx,8)
-        movq (%rax,%rdx,8),%rsi
-        movq %rsi,(%rcx,%rdx,8)
-6:      addq $4,%rdx
-        jle 5b
-        subq $4,%rdx
-        jl 1b
-        jmp 2b
-acs_CopyLeft:
-        testq $1,%r8             # check for trailing word
-        jz 1f
-        movw -2(%rdi,%r8,2),%cx  # copy trailing word
-        movw %cx,-2(%rsi,%r8,2)
-1:      testq $2,%r8             # check for trailing dword
-        jz 4f
-        movl (%rdi,%rdx,8),%ecx  # copy trailing dword
-        movl %ecx,(%rsi,%rdx,8)
-        jmp 4f
-2:      movq -8(%rdi,%rdx,8),%rcx
-        movq %rcx,-8(%rsi,%rdx,8)
-        subq $1,%rdx
-        jnz 2b
-        ret
-        .p2align 4,,15
-3:      movq 24(%rdi,%rdx,8),%rcx
-        movq %rcx,24(%rsi,%rdx,8)
-        movq 16(%rdi,%rdx,8),%rcx
-        movq %rcx,16(%rsi,%rdx,8)
-        movq 8(%rdi,%rdx,8),%rcx
-        movq %rcx,8(%rsi,%rdx,8)
-        movq (%rdi,%rdx,8),%rcx
-        movq %rcx,(%rsi,%rdx,8)
-4:      subq $4,%rdx
-        jge 3b
-        addq $4,%rdx
-        jg 2b
-        ret
-
-        # Support for void Copy::arrayof_conjoint_jints(jint* from,
-        #                                               jint* to,
-        #                                               size_t count)
-        # Equivalent to
-        #   conjoint_jints_atomic
-        #
-        # If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
-        # the hardware handle it. The two dwords within qwords that span
-        # cache line boundaries will still be loaded and stored atomically.
-        #
-        # rdi - from
-        # rsi - to
-        # rdx - count, treated as ssize_t
-        #
-        .p2align 4,,15
-        .type _Copy_arrayof_conjoint_jints,@function
-        .type _Copy_conjoint_jints_atomic,@function
-_Copy_arrayof_conjoint_jints:
-_Copy_conjoint_jints_atomic:
-        movq %rdx,%r8            # dword count
-        shrq %rdx                # qword count
-        cmpq %rdi,%rsi
-        leaq -4(%rdi,%r8,4),%rax # from + dcount*4 - 4
-        jbe aci_CopyRight
-        cmpq %rax,%rsi
-        jbe aci_CopyLeft
-aci_CopyRight:
-        leaq -8(%rdi,%rdx,8),%rax # from + qcount*8 - 8
-        leaq -8(%rsi,%rdx,8),%rcx # to + qcount*8 - 8
-        negq %rdx
-        jmp 5f
-        .p2align 4,,15
-1:      movq 8(%rax,%rdx,8),%rsi
-        movq %rsi,8(%rcx,%rdx,8)
-        addq $1,%rdx
-        jnz 1b
-2:      testq $1,%r8             # check for trailing dword
-        jz 3f
-        movl 8(%rax),%esi        # copy trailing dword
-        movl %esi,8(%rcx)
-3:      ret
-        .p2align 4,,15
-4:      movq -24(%rax,%rdx,8),%rsi
-        movq %rsi,-24(%rcx,%rdx,8)
-        movq -16(%rax,%rdx,8),%rsi
-        movq %rsi,-16(%rcx,%rdx,8)
-        movq -8(%rax,%rdx,8),%rsi
-        movq %rsi,-8(%rcx,%rdx,8)
-        movq (%rax,%rdx,8),%rsi
-        movq %rsi,(%rcx,%rdx,8)
-5:      addq $4,%rdx
-        jle 4b
-        subq $4,%rdx
-        jl 1b
-        jmp 2b
-aci_CopyLeft:
-        testq $1,%r8             # check for trailing dword
-        jz 3f
-        movl -4(%rdi,%r8,4),%ecx # copy trailing dword
-        movl %ecx,-4(%rsi,%r8,4)
-        jmp 3f
-1:      movq -8(%rdi,%rdx,8),%rcx
-        movq %rcx,-8(%rsi,%rdx,8)
-        subq $1,%rdx
-        jnz 1b
-        ret
-        .p2align 4,,15
-2:      movq 24(%rdi,%rdx,8),%rcx
-        movq %rcx,24(%rsi,%rdx,8)
-        movq 16(%rdi,%rdx,8),%rcx
-        movq %rcx,16(%rsi,%rdx,8)
-        movq 8(%rdi,%rdx,8),%rcx
-        movq %rcx,8(%rsi,%rdx,8)
-        movq (%rdi,%rdx,8),%rcx
-        movq %rcx,(%rsi,%rdx,8)
-3:      subq $4,%rdx
-        jge 2b
-        addq $4,%rdx
-        jg 1b
-        ret
-
-        # Support for void Copy::arrayof_conjoint_jlongs(jlong* from,
-        #                                                jlong* to,
-        #                                                size_t count)
-        # Equivalent to
-        #   conjoint_jlongs_atomic
-        #   arrayof_conjoint_oops
-        #   conjoint_oops_atomic
-        #
-        # rdi - from
-        # rsi - to
-        # rdx - count, treated as ssize_t
-        #
-        .p2align 4,,15
-        .type _Copy_arrayof_conjoint_jlongs,@function
-        .type _Copy_conjoint_jlongs_atomic,@function
-_Copy_arrayof_conjoint_jlongs:
-_Copy_conjoint_jlongs_atomic:
-        cmpq %rdi,%rsi
-        leaq -8(%rdi,%rdx,8),%rax # from + count*8 - 8
-        jbe acl_CopyRight
-        cmpq %rax,%rsi
-        jbe acl_CopyLeft
-acl_CopyRight:
-        leaq -8(%rsi,%rdx,8),%rcx # to + count*8 - 8
-        negq %rdx
-        jmp 3f
-1:      movq 8(%rax,%rdx,8),%rsi
-        movq %rsi,8(%rcx,%rdx,8)
-        addq $1,%rdx
-        jnz 1b
-        ret
-        .p2align 4,,15
-2:      movq -24(%rax,%rdx,8),%rsi
-        movq %rsi,-24(%rcx,%rdx,8)
-        movq -16(%rax,%rdx,8),%rsi
-        movq %rsi,-16(%rcx,%rdx,8)
-        movq -8(%rax,%rdx,8),%rsi
-        movq %rsi,-8(%rcx,%rdx,8)
-        movq (%rax,%rdx,8),%rsi
-        movq %rsi,(%rcx,%rdx,8)
-3:      addq $4,%rdx
-        jle 2b
-        subq $4,%rdx
-        jl 1b
-        ret
-4:      movq -8(%rdi,%rdx,8),%rcx
-        movq %rcx,-8(%rsi,%rdx,8)
-        subq $1,%rdx
-        jnz 4b
-        ret
-        .p2align 4,,15
-5:      movq 24(%rdi,%rdx,8),%rcx
-        movq %rcx,24(%rsi,%rdx,8)
-        movq 16(%rdi,%rdx,8),%rcx
-        movq %rcx,16(%rsi,%rdx,8)
-        movq 8(%rdi,%rdx,8),%rcx
-        movq %rcx,8(%rsi,%rdx,8)
-        movq (%rdi,%rdx,8),%rcx
-        movq %rcx,(%rsi,%rdx,8)
-acl_CopyLeft:
-        subq $4,%rdx
-        jge 5b
-        addq $4,%rdx
-        jg 4b
-        ret
--- /dev/null	Tue Sep 13 12:28:49 2011
+++ new/src/os_cpu/bsd_x86/vm/bsd_x86_64.s	Tue Sep 13 12:29:13 2011
@@ -0,0 +1,422 @@
+#
+# Copyright (c) 2004, 2007, Oracle and/or its affiliates. All rights reserved.
+# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+#
+# This code is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License version 2 only, as
+# published by the Free Software Foundation.
+#
+# This code is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+# version 2 for more details (a copy is included in the LICENSE file that
+# accompanied this code).
+#
+# You should have received a copy of the GNU General Public License version
+# 2 along with this work; if not, write to the Free Software Foundation,
+# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+# Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+# or visit www.oracle.com if you need additional information or have any
+# questions.
+#
+
+#ifdef __APPLE__
+# Darwin uses _ prefixed global symbols
+#define SYMBOL(s) _ ## s
+#define ELF_TYPE(name, description)
+#else
+#define SYMBOL(s) s
+#define ELF_TYPE(name, description) .type name,description
+#endif
+
+        # NOTE WELL! The _Copy functions are called directly
+        # from server-compiler-generated code via CallLeafNoFP,
+        # which means that they *must* either not use floating
+        # point or use it in the same manner as does the server
+        # compiler.
+
+        .globl SYMBOL(_Copy_arrayof_conjoint_bytes)
+        .globl SYMBOL(_Copy_arrayof_conjoint_jshorts)
+        .globl SYMBOL(_Copy_conjoint_jshorts_atomic)
+        .globl SYMBOL(_Copy_arrayof_conjoint_jints)
+        .globl SYMBOL(_Copy_conjoint_jints_atomic)
+        .globl SYMBOL(_Copy_arrayof_conjoint_jlongs)
+        .globl SYMBOL(_Copy_conjoint_jlongs_atomic)
+
+        .text
+
+        .globl SYMBOL(SafeFetch32), SYMBOL(Fetch32PFI), SYMBOL(Fetch32Resume)
+#ifdef __APPLE__
+        .align 4
+#else
+        .align 16
+#endif
+        ELF_TYPE(SafeFetch32,@function)
+        // Prototype: int SafeFetch32 (int * Adr, int ErrValue)
+SYMBOL(SafeFetch32):
+        movl %esi, %eax
+SYMBOL(Fetch32PFI):
+        movl (%rdi), %eax
+SYMBOL(Fetch32Resume):
+        ret
+
+        .globl SYMBOL(SafeFetchN), SYMBOL(FetchNPFI), SYMBOL(FetchNResume)
+#ifdef __APPLE__
+        .align 4
+#else
+        .align 16
+#endif
+        ELF_TYPE(SafeFetchN,@function)
+        // Prototype: intptr_t SafeFetchN (intptr_t * Adr, intptr_t ErrValue)
+SYMBOL(SafeFetchN):
+        movq %rsi, %rax
+SYMBOL(FetchNPFI):
+        movq (%rdi), %rax
+SYMBOL(FetchNResume):
+        ret
+
+        .globl SYMBOL(SpinPause)
+#ifdef __APPLE__
+        .align 4
+#else
+        .align 16
+#endif
+        ELF_TYPE(SpinPause,@function)
+SYMBOL(SpinPause):
+        rep
+        nop
+        movq $1, %rax
+        ret
+
+        # Support for void Copy::arrayof_conjoint_bytes(void* from,
+        #                                               void* to,
+        #                                               size_t count)
+        # rdi - from
+        # rsi - to
+        # rdx - count, treated as ssize_t
+        #
+        .p2align 4,,15
+        ELF_TYPE(_Copy_arrayof_conjoint_bytes,@function)
+SYMBOL(_Copy_arrayof_conjoint_bytes):
+        movq %rdx,%r8            # byte count
+        shrq $3,%rdx             # qword count
+        cmpq %rdi,%rsi
+        leaq -1(%rdi,%r8,1),%rax # from + bcount*1 - 1
+        jbe acb_CopyRight
+        cmpq %rax,%rsi
+        jbe acb_CopyLeft
+acb_CopyRight:
+        leaq -8(%rdi,%rdx,8),%rax # from + qcount*8 - 8
+        leaq -8(%rsi,%rdx,8),%rcx # to + qcount*8 - 8
+        negq %rdx
+        jmp 7f
+        .p2align 4,,15
+1:      movq 8(%rax,%rdx,8),%rsi
+        movq %rsi,8(%rcx,%rdx,8)
+        addq $1,%rdx
+        jnz 1b
+2:      testq $4,%r8             # check for trailing dword
+        jz 3f
+        movl 8(%rax),%esi        # copy trailing dword
+        movl %esi,8(%rcx)
+        addq $4,%rax
+        addq $4,%rcx             # original %rsi is trashed, so we
+                                 # can't use it as a base register
+3:      testq $2,%r8             # check for trailing word
+        jz 4f
+        movw 8(%rax),%si         # copy trailing word
+        movw %si,8(%rcx)
+        addq $2,%rcx
+4:      testq $1,%r8             # check for trailing byte
+        jz 5f
+        movb -1(%rdi,%r8,1),%al  # copy trailing byte
+        movb %al,8(%rcx)
+5:      ret
+        .p2align 4,,15
+6:      movq -24(%rax,%rdx,8),%rsi
+        movq %rsi,-24(%rcx,%rdx,8)
+        movq -16(%rax,%rdx,8),%rsi
+        movq %rsi,-16(%rcx,%rdx,8)
+        movq -8(%rax,%rdx,8),%rsi
+        movq %rsi,-8(%rcx,%rdx,8)
+        movq (%rax,%rdx,8),%rsi
+        movq %rsi,(%rcx,%rdx,8)
+7:      addq $4,%rdx
+        jle 6b
+        subq $4,%rdx
+        jl 1b
+        jmp 2b
+acb_CopyLeft:
+        testq $1,%r8             # check for trailing byte
+        jz 1f
+        movb -1(%rdi,%r8,1),%cl  # copy trailing byte
+        movb %cl,-1(%rsi,%r8,1)
+        subq $1,%r8              # adjust for possible trailing word
+1:      testq $2,%r8             # check for trailing word
+        jz 2f
+        movw -2(%rdi,%r8,1),%cx  # copy trailing word
+        movw %cx,-2(%rsi,%r8,1)
+2:      testq $4,%r8             # check for trailing dword
+        jz 5f
+        movl (%rdi,%rdx,8),%ecx  # copy trailing dword
+        movl %ecx,(%rsi,%rdx,8)
+        jmp 5f
+        .p2align 4,,15
+3:      movq -8(%rdi,%rdx,8),%rcx
+        movq %rcx,-8(%rsi,%rdx,8)
+        subq $1,%rdx
+        jnz 3b
+        ret
+        .p2align 4,,15
+4:      movq 24(%rdi,%rdx,8),%rcx
+        movq %rcx,24(%rsi,%rdx,8)
+        movq 16(%rdi,%rdx,8),%rcx
+        movq %rcx,16(%rsi,%rdx,8)
+        movq 8(%rdi,%rdx,8),%rcx
+        movq %rcx,8(%rsi,%rdx,8)
+        movq (%rdi,%rdx,8),%rcx
+        movq %rcx,(%rsi,%rdx,8)
+5:      subq $4,%rdx
+        jge 4b
+        addq $4,%rdx
+        jg 3b
+        ret
+
+        # Support for void Copy::arrayof_conjoint_jshorts(void* from,
+        #                                                 void* to,
+        #                                                 size_t count)
+        # Equivalent to
+        #   conjoint_jshorts_atomic
+        #
+        # If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
+        # let the hardware handle it. The two or four words within dwords
+        # or qwords that span cache line boundaries will still be loaded
+        # and stored atomically.
+        #
+        # rdi - from
+        # rsi - to
+        # rdx - count, treated as ssize_t
+        #
+        .p2align 4,,15
+        ELF_TYPE(_Copy_arrayof_conjoint_jshorts,@function)
+        ELF_TYPE(_Copy_conjoint_jshorts_atomic,@function)
+SYMBOL(_Copy_arrayof_conjoint_jshorts):
+SYMBOL(_Copy_conjoint_jshorts_atomic):
+        movq %rdx,%r8            # word count
+        shrq $2,%rdx             # qword count
+        cmpq %rdi,%rsi
+        leaq -2(%rdi,%r8,2),%rax # from + wcount*2 - 2
+        jbe acs_CopyRight
+        cmpq %rax,%rsi
+        jbe acs_CopyLeft
+acs_CopyRight:
+        leaq -8(%rdi,%rdx,8),%rax # from + qcount*8 - 8
+        leaq -8(%rsi,%rdx,8),%rcx # to + qcount*8 - 8
+        negq %rdx
+        jmp 6f
+1:      movq 8(%rax,%rdx,8),%rsi
+        movq %rsi,8(%rcx,%rdx,8)
+        addq $1,%rdx
+        jnz 1b
+2:      testq $2,%r8             # check for trailing dword
+        jz 3f
+        movl 8(%rax),%esi        # copy trailing dword
+        movl %esi,8(%rcx)
+        addq $4,%rcx             # original %rsi is trashed, so we
+                                 # can't use it as a base register
+3:      testq $1,%r8             # check for trailing word
+        jz 4f
+        movw -2(%rdi,%r8,2),%si  # copy trailing word
+        movw %si,8(%rcx)
+4:      ret
+        .p2align 4,,15
+5:      movq -24(%rax,%rdx,8),%rsi
+        movq %rsi,-24(%rcx,%rdx,8)
+        movq -16(%rax,%rdx,8),%rsi
+        movq %rsi,-16(%rcx,%rdx,8)
+        movq -8(%rax,%rdx,8),%rsi
+        movq %rsi,-8(%rcx,%rdx,8)
+        movq (%rax,%rdx,8),%rsi
+        movq %rsi,(%rcx,%rdx,8)
+6:      addq $4,%rdx
+        jle 5b
+        subq $4,%rdx
+        jl 1b
+        jmp 2b
+acs_CopyLeft:
+        testq $1,%r8             # check for trailing word
+        jz 1f
+        movw -2(%rdi,%r8,2),%cx  # copy trailing word
+        movw %cx,-2(%rsi,%r8,2)
+1:      testq $2,%r8             # check for trailing dword
+        jz 4f
+        movl (%rdi,%rdx,8),%ecx  # copy trailing dword
+        movl %ecx,(%rsi,%rdx,8)
+        jmp 4f
+2:      movq -8(%rdi,%rdx,8),%rcx
+        movq %rcx,-8(%rsi,%rdx,8)
+        subq $1,%rdx
+        jnz 2b
+        ret
+        .p2align 4,,15
+3:      movq 24(%rdi,%rdx,8),%rcx
+        movq %rcx,24(%rsi,%rdx,8)
+        movq 16(%rdi,%rdx,8),%rcx
+        movq %rcx,16(%rsi,%rdx,8)
+        movq 8(%rdi,%rdx,8),%rcx
+        movq %rcx,8(%rsi,%rdx,8)
+        movq (%rdi,%rdx,8),%rcx
+        movq %rcx,(%rsi,%rdx,8)
+4:      subq $4,%rdx
+        jge 3b
+        addq $4,%rdx
+        jg 2b
+        ret
+
+        # Support for void Copy::arrayof_conjoint_jints(jint* from,
+        #                                               jint* to,
+        #                                               size_t count)
+        # Equivalent to
+        #   conjoint_jints_atomic
+        #
+        # If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
+        # the hardware handle it. The two dwords within qwords that span
+        # cache line boundaries will still be loaded and stored atomically.
+        #
+        # rdi - from
+        # rsi - to
+        # rdx - count, treated as ssize_t
+        #
+        .p2align 4,,15
+        ELF_TYPE(_Copy_arrayof_conjoint_jints,@function)
+        ELF_TYPE(_Copy_conjoint_jints_atomic,@function)
+SYMBOL(_Copy_arrayof_conjoint_jints):
+SYMBOL(_Copy_conjoint_jints_atomic):
+        movq %rdx,%r8            # dword count
+        shrq %rdx                # qword count
+        cmpq %rdi,%rsi
+        leaq -4(%rdi,%r8,4),%rax # from + dcount*4 - 4
+        jbe aci_CopyRight
+        cmpq %rax,%rsi
+        jbe aci_CopyLeft
+aci_CopyRight:
+        leaq -8(%rdi,%rdx,8),%rax # from + qcount*8 - 8
+        leaq -8(%rsi,%rdx,8),%rcx # to + qcount*8 - 8
+        negq %rdx
+        jmp 5f
+        .p2align 4,,15
+1:      movq 8(%rax,%rdx,8),%rsi
+        movq %rsi,8(%rcx,%rdx,8)
+        addq $1,%rdx
+        jnz 1b
+2:      testq $1,%r8             # check for trailing dword
+        jz 3f
+        movl 8(%rax),%esi        # copy trailing dword
+        movl %esi,8(%rcx)
+3:      ret
+        .p2align 4,,15
+4:      movq -24(%rax,%rdx,8),%rsi
+        movq %rsi,-24(%rcx,%rdx,8)
+        movq -16(%rax,%rdx,8),%rsi
+        movq %rsi,-16(%rcx,%rdx,8)
+        movq -8(%rax,%rdx,8),%rsi
+        movq %rsi,-8(%rcx,%rdx,8)
+        movq (%rax,%rdx,8),%rsi
+        movq %rsi,(%rcx,%rdx,8)
+5:      addq $4,%rdx
+        jle 4b
+        subq $4,%rdx
+        jl 1b
+        jmp 2b
+aci_CopyLeft:
+        testq $1,%r8             # check for trailing dword
+        jz 3f
+        movl -4(%rdi,%r8,4),%ecx # copy trailing dword
+        movl %ecx,-4(%rsi,%r8,4)
+        jmp 3f
+1:      movq -8(%rdi,%rdx,8),%rcx
+        movq %rcx,-8(%rsi,%rdx,8)
+        subq $1,%rdx
+        jnz 1b
+        ret
+        .p2align 4,,15
+2:      movq 24(%rdi,%rdx,8),%rcx
+        movq %rcx,24(%rsi,%rdx,8)
+        movq 16(%rdi,%rdx,8),%rcx
+        movq %rcx,16(%rsi,%rdx,8)
+        movq 8(%rdi,%rdx,8),%rcx
+        movq %rcx,8(%rsi,%rdx,8)
+        movq (%rdi,%rdx,8),%rcx
+        movq %rcx,(%rsi,%rdx,8)
+3:      subq $4,%rdx
+        jge 2b
+        addq $4,%rdx
+        jg 1b
+        ret
+
+        # Support for void Copy::arrayof_conjoint_jlongs(jlong* from,
+        #                                                jlong* to,
+        #                                                size_t count)
+        # Equivalent to
+        #   conjoint_jlongs_atomic
+        #   arrayof_conjoint_oops
+        #   conjoint_oops_atomic
+        #
+        # rdi - from
+        # rsi - to
+        # rdx - count, treated as ssize_t
+        #
+        .p2align 4,,15
+        ELF_TYPE(_Copy_arrayof_conjoint_jlongs,@function)
+        ELF_TYPE(_Copy_conjoint_jlongs_atomic,@function)
+SYMBOL(_Copy_arrayof_conjoint_jlongs):
+SYMBOL(_Copy_conjoint_jlongs_atomic):
+        cmpq %rdi,%rsi
+        leaq -8(%rdi,%rdx,8),%rax # from + count*8 - 8
+        jbe acl_CopyRight
+        cmpq %rax,%rsi
+        jbe acl_CopyLeft
+acl_CopyRight:
+        leaq -8(%rsi,%rdx,8),%rcx # to + count*8 - 8
+        negq %rdx
+        jmp 3f
+1:      movq 8(%rax,%rdx,8),%rsi
+        movq %rsi,8(%rcx,%rdx,8)
+        addq $1,%rdx
+        jnz 1b
+        ret
+        .p2align 4,,15
+2:      movq -24(%rax,%rdx,8),%rsi
+        movq %rsi,-24(%rcx,%rdx,8)
+        movq -16(%rax,%rdx,8),%rsi
+        movq %rsi,-16(%rcx,%rdx,8)
+        movq -8(%rax,%rdx,8),%rsi
+        movq %rsi,-8(%rcx,%rdx,8)
+        movq (%rax,%rdx,8),%rsi
+        movq %rsi,(%rcx,%rdx,8)
+3:      addq $4,%rdx
+        jle 2b
+        subq $4,%rdx
+        jl 1b
+        ret
+4:      movq -8(%rdi,%rdx,8),%rcx
+        movq %rcx,-8(%rsi,%rdx,8)
+        subq $1,%rdx
+        jnz 4b
+        ret
+        .p2align 4,,15
+5:      movq 24(%rdi,%rdx,8),%rcx
+        movq %rcx,24(%rsi,%rdx,8)
+        movq 16(%rdi,%rdx,8),%rcx
+        movq %rcx,16(%rsi,%rdx,8)
+        movq 8(%rdi,%rdx,8),%rcx
+        movq %rcx,8(%rsi,%rdx,8)
+        movq (%rdi,%rdx,8),%rcx
+        movq %rcx,(%rsi,%rdx,8)
+acl_CopyLeft:
+        subq $4,%rdx
+        jge 5b
+        addq $4,%rdx
+        jg 4b
+        ret
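
[Editor's note, not part of the changeset.] As context for the SafeFetch entry
points above, a minimal C++ sketch of the caller side. The declarations follow
the "Prototype:" comments in the file; the helper name and sentinel value are
hypothetical. On Mach-O the C++ compiler itself prepends a leading underscore
to extern "C" names, which is exactly why the assembly defines _-prefixed
symbols via SYMBOL() under __APPLE__.

    #include <stdint.h>

    // Declarations matching the prototypes documented in the .s file.
    extern "C" int      SafeFetch32(int* adr, int errValue);
    extern "C" intptr_t SafeFetchN(intptr_t* adr, intptr_t errValue);

    // Hypothetical helper: probe a word that may be unmapped without
    // crashing. If the load at Fetch32PFI faults, the platform signal
    // handler resumes execution at Fetch32Resume, so the error value
    // (preloaded into %eax) is what the caller sees.
    static bool is_readable_word(int* p) {
      const int sentinel = 0x5A5A5A5A;  // may collide with a real *p value
      return SafeFetch32(p, sentinel) != sentinel;
    }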
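
[Editor's note, not part of the changeset.] Every _Copy_* routine opens with
the same cmpq/leaq/jbe sequence to pick a copy direction for possibly
overlapping operands: ascending addresses ("CopyRight") unless the destination
starts strictly inside the source range, in which case it copies with
descending addresses ("CopyLeft"). A C++ rendering of that decision, sketched
for the byte variant (the function name is illustrative; the real routines
then move unrolled 32-byte chunks plus scalar tails, as above):

    #include <stddef.h>
    #include <stdint.h>

    static void conjoint_copy_bytes(const char* from, char* to, size_t count) {
      if (count == 0) return;
      uintptr_t f = (uintptr_t)from, t = (uintptr_t)to;
      if (t <= f || t > f + count - 1) {    // cmpq %rdi,%rsi / leaq -1(%rdi,%r8,1),%rax
        for (size_t i = 0; i < count; i++)  // "CopyRight": low addresses first
          to[i] = from[i];
      } else {                              // destination begins inside the source
        for (size_t i = count; i-- > 0; )   // "CopyLeft": high addresses first
          to[i] = from[i];
      }
    }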