/*
 * Copyright (c) 2016, Linaro Ltd. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

/*
 * Word (8-byte) block copy routines for AArch64, GNU as syntax.
 *
 *   _Copy_disjoint_words   copies ascending; safe when the destination does
 *                          not overlap the not-yet-read part of the source.
 *   _Copy_conjoint_words   picks ascending or descending order so that
 *                          overlapping ranges are copied correctly.
 *
 * In:     x0 (s)     = source address
 *         x1 (d)     = destination address
 *         x2 (count) = number of 8-byte words to copy
 * Out:    nothing; x0, x1 and x2 are modified.
 * Clobb:  x3-x10 (t0-t7) and the condition flags. All of these are volatile
 *         registers, no stack is used and no calls are made.
 *
 * Preconditions (not checked here):
 *   - Both routines preload eight words before looking at count, after
 *     optionally copying one word to make the source 16-byte aligned. So at
 *     least 8 words must remain at that point, i.e. count >= 9 in general.
 *     A smaller count over-reads the source and jumps outside the tail
 *     dispatch table.
 *     NOTE(review): presumably the C++ caller handles small counts itself;
 *     confirm against the call sites.
 *   - NOTE(review): the bit-3 alignment tests assume s and d are already
 *     8-byte aligned; confirm against the callers.
 *
 * Tail dispatch: after the bulk loop count is -8..-1, meaning 0..7 words are
 * left. Each tail handler occupies one 32-byte slot (.align 5) placed
 * immediately before the local label 0, so the handler address is
 * 0f + (count << 5). Handlers must therefore never exceed eight
 * instructions; the 7-word handler uses all eight and falls through into the
 * "ret" at label 0.
 */

        .global _Copy_conjoint_words
        .global _Copy_disjoint_words

s       .req    x0
d       .req    x1
count   .req    x2
t0      .req    x3
t1      .req    x4
t2      .req    x5
t3      .req    x6
t4      .req    x7
t5      .req    x8
t6      .req    x9
t7      .req    x10

        .align  6
_Copy_disjoint_words:
        // Ensure 2 word aligned
        tbz     s, #3, fwd_copy_aligned
        ldr     t0, [s], #8
        str     t0, [d], #8
        sub     count, count, #1

fwd_copy_aligned:
        // Preload the first eight words; s advances by 48 only.
        ldp     t0, t1, [s, #0]
        ldp     t2, t3, [s, #16]
        ldp     t4, t5, [s, #32]
        ldp     t6, t7, [s, #48]!
        // Source now biased by -16

        tbnz    d, #3, unal_fwd_copy    // dest not 16-byte aligned
        sub     d, d, #16               // and bias dest

        // 16 = the 8 words already loaded + the 8 the loop loads next
        subs    count, count, #16
        blo     fwd_copy_drain

fwd_copy_again:
        // Store the previous eight words while loading the next eight.
        prfm    pldl1keep, [s, #256]
        stp     t0, t1, [d, #16]
        ldp     t0, t1, [s, #16]
        stp     t2, t3, [d, #32]
        ldp     t2, t3, [s, #32]
        stp     t4, t5, [d, #48]
        ldp     t4, t5, [s, #48]
        stp     t6, t7, [d, #64]!
        ldp     t6, t7, [s, #64]!
        subs    count, count, #8
        bhs     fwd_copy_again

fwd_copy_drain:
        // Flush the eight words still held in t0-t7.
        stp     t0, t1, [d, #16]
        stp     t2, t3, [d, #32]
        stp     t4, t5, [d, #48]
        stp     t6, t7, [d, #64]!

        // count is now -8..-1 for 0..7 words to copy
        adr     t0, 0f
        add     t0, t0, count, lsl #5
        br      t0

        .align  5
        ret                             // -8 == 0 words
        .align  5
        ldr     t0, [s, #16]            // -7 == 1 word
        str     t0, [d, #16]
        ret
        .align  5
        ldp     t0, t1, [s, #16]        // -6 = 2 words
        stp     t0, t1, [d, #16]
        ret
        .align  5
        ldp     t0, t1, [s, #16]        // -5 = 3 words
        ldr     t2, [s, #32]
        stp     t0, t1, [d, #16]
        str     t2, [d, #32]
        ret
        .align  5
        ldp     t0, t1, [s, #16]        // -4 = 4 words
        ldp     t2, t3, [s, #32]
        stp     t0, t1, [d, #16]
        stp     t2, t3, [d, #32]
        ret
        .align  5
        ldp     t0, t1, [s, #16]        // -3 = 5 words
        ldp     t2, t3, [s, #32]
        ldr     t4, [s, #48]
        stp     t0, t1, [d, #16]
        stp     t2, t3, [d, #32]
        str     t4, [d, #48]
        ret
        .align  5
        ldp     t0, t1, [s, #16]        // -2 = 6 words
        ldp     t2, t3, [s, #32]
        ldp     t4, t5, [s, #48]
        stp     t0, t1, [d, #16]
        stp     t2, t3, [d, #32]
        stp     t4, t5, [d, #48]
        ret
        .align  5
        ldp     t0, t1, [s, #16]        // -1 = 7 words
        ldp     t2, t3, [s, #32]
        ldp     t4, t5, [s, #48]
        ldr     t6, [s, #64]
        stp     t0, t1, [d, #16]
        stp     t2, t3, [d, #32]
        stp     t4, t5, [d, #48]
        str     t6, [d, #64]
        // Is always aligned here, code for 7 words is one instruction
        // too large so it just falls through.
        .align  5
0:
        ret

unal_fwd_copy:
        // Destination has bit 3 set. The words are regrouped as
        // 1 + 2 + 2 + 2 + 1 so that every stp lands on a 16-byte boundary.
        // Bias dest so we only pre index on the last copy
        sub     d, d, #8
        subs    count, count, #16
        blo     unal_fwd_copy_drain

unal_fwd_copy_again:
        // Each register is reloaded only after its old value was stored.
        prfm    pldl1keep, [s, #256]
        str     t0, [d, #8]
        stp     t1, t2, [d, #16]
        ldp     t0, t1, [s, #16]
        stp     t3, t4, [d, #32]
        ldp     t2, t3, [s, #32]
        stp     t5, t6, [d, #48]
        ldp     t4, t5, [s, #48]
        str     t7, [d, #64]!
        ldp     t6, t7, [s, #64]!
        subs    count, count, #8
        bhs     unal_fwd_copy_again

unal_fwd_copy_drain:
        str     t0, [d, #8]
        stp     t1, t2, [d, #16]
        stp     t3, t4, [d, #32]
        stp     t5, t6, [d, #48]
        str     t7, [d, #64]!

        // count is now -8..-1 for 0..7 words to copy
        adr     t0, 0f
        add     t0, t0, count, lsl #5
        br      t0

        .align  5
        ret                             // -8 == 0 words
        .align  5
        ldr     t0, [s, #16]            // -7 == 1 word
        str     t0, [d, #8]
        ret
        .align  5
        ldp     t0, t1, [s, #16]        // -6 = 2 words
        str     t0, [d, #8]
        str     t1, [d, #16]
        ret
        .align  5
        ldp     t0, t1, [s, #16]        // -5 = 3 words
        ldr     t2, [s, #32]
        str     t0, [d, #8]
        stp     t1, t2, [d, #16]
        ret
        .align  5
        ldp     t0, t1, [s, #16]        // -4 = 4 words
        ldp     t2, t3, [s, #32]
        str     t0, [d, #8]
        stp     t1, t2, [d, #16]
        str     t3, [d, #32]
        ret
        .align  5
        ldp     t0, t1, [s, #16]        // -3 = 5 words
        ldp     t2, t3, [s, #32]
        ldr     t4, [s, #48]
        str     t0, [d, #8]
        stp     t1, t2, [d, #16]
        stp     t3, t4, [d, #32]
        ret
        .align  5
        ldp     t0, t1, [s, #16]        // -2 = 6 words
        ldp     t2, t3, [s, #32]
        ldp     t4, t5, [s, #48]
        str     t0, [d, #8]
        stp     t1, t2, [d, #16]
        stp     t3, t4, [d, #32]
        str     t5, [d, #48]
        ret
        .align  5
        ldp     t0, t1, [s, #16]        // -1 = 7 words
        ldp     t2, t3, [s, #32]
        ldp     t4, t5, [s, #48]
        ldr     t6, [s, #64]
        str     t0, [d, #8]
        stp     t1, t2, [d, #16]
        stp     t3, t4, [d, #32]
        stp     t5, t6, [d, #48]
        // Is always aligned here, code for 7 words is one instruction
        // too large so it just falls through.
        .align  5
0:
        ret

        .align  6
_Copy_conjoint_words:
        // Unsigned test: if (d - s) >= count * 8 an ascending copy never
        // overwrites source words that are still to be read (this also
        // covers d < s, where the difference wraps to a large value).
        sub     t0, d, s
        cmp     t0, count, lsl #3
        bhs     _Copy_disjoint_words

        // Otherwise copy descending: point s and d one past the last word.
        add     s, s, count, lsl #3
        add     d, d, count, lsl #3

        // Ensure 2 word aligned
        tbz     s, #3, bwd_copy_aligned
        ldr     t0, [s, #-8]!
        str     t0, [d, #-8]!
        sub     count, count, #1

bwd_copy_aligned:
        // Preload the top eight words; no bias is needed going down.
        ldp     t0, t1, [s, #-16]
        ldp     t2, t3, [s, #-32]
        ldp     t4, t5, [s, #-48]
        ldp     t6, t7, [s, #-64]!

        tbnz    d, #3, unal_bwd_copy    // dest not 16-byte aligned

        subs    count, count, #16
        blo     bwd_copy_drain

bwd_copy_again:
        // prfum: the offset is negative, so the unscaled form is needed.
        prfum   pldl1keep, [s, #-256]
        stp     t0, t1, [d, #-16]
        ldp     t0, t1, [s, #-16]
        stp     t2, t3, [d, #-32]
        ldp     t2, t3, [s, #-32]
        stp     t4, t5, [d, #-48]
        ldp     t4, t5, [s, #-48]
        stp     t6, t7, [d, #-64]!
        ldp     t6, t7, [s, #-64]!
        subs    count, count, #8
        bhs     bwd_copy_again

bwd_copy_drain:
        stp     t0, t1, [d, #-16]
        stp     t2, t3, [d, #-32]
        stp     t4, t5, [d, #-48]
        stp     t6, t7, [d, #-64]!

        // count is now -8..-1 for 0..7 words to copy
        adr     t0, 0f
        add     t0, t0, count, lsl #5
        br      t0

        .align  5
        ret                             // -8 == 0 words
        .align  5
        ldr     t0, [s, #-8]            // -7 == 1 word
        str     t0, [d, #-8]
        ret
        .align  5
        ldp     t0, t1, [s, #-16]       // -6 = 2 words
        stp     t0, t1, [d, #-16]
        ret
        .align  5
        ldp     t0, t1, [s, #-16]       // -5 = 3 words
        ldr     t2, [s, #-24]
        stp     t0, t1, [d, #-16]
        str     t2, [d, #-24]
        ret
        .align  5
        ldp     t0, t1, [s, #-16]       // -4 = 4 words
        ldp     t2, t3, [s, #-32]
        stp     t0, t1, [d, #-16]
        stp     t2, t3, [d, #-32]
        ret
        .align  5
        ldp     t0, t1, [s, #-16]       // -3 = 5 words
        ldp     t2, t3, [s, #-32]
        ldr     t4, [s, #-40]
        stp     t0, t1, [d, #-16]
        stp     t2, t3, [d, #-32]
        str     t4, [d, #-40]
        ret
        .align  5
        ldp     t0, t1, [s, #-16]       // -2 = 6 words
        ldp     t2, t3, [s, #-32]
        ldp     t4, t5, [s, #-48]
        stp     t0, t1, [d, #-16]
        stp     t2, t3, [d, #-32]
        stp     t4, t5, [d, #-48]
        ret
        .align  5
        ldp     t0, t1, [s, #-16]       // -1 = 7 words
        ldp     t2, t3, [s, #-32]
        ldp     t4, t5, [s, #-48]
        ldr     t6, [s, #-56]
        stp     t0, t1, [d, #-16]
        stp     t2, t3, [d, #-32]
        stp     t4, t5, [d, #-48]
        str     t6, [d, #-56]
        // Is always aligned here, code for 7 words is one instruction
        // too large so it just falls through.
        .align  5
0:
        ret

unal_bwd_copy:
        // Destination end has bit 3 set. The words are regrouped as
        // 1 + 2 + 2 + 2 + 1 (top down) so that every stp lands on a
        // 16-byte boundary. Each stp pairs the low word of one loaded pair
        // with the high word of the next lower pair, hence "t3, t0" etc.
        subs    count, count, #16
        blo     unal_bwd_copy_drain

unal_bwd_copy_again:
        // prfum: the offset is negative, so the unscaled form is needed
        // (spelled the same way as in bwd_copy_again).
        prfum   pldl1keep, [s, #-256]
        str     t1, [d, #-8]
        stp     t3, t0, [d, #-24]
        ldp     t0, t1, [s, #-16]
        stp     t5, t2, [d, #-40]
        ldp     t2, t3, [s, #-32]
        stp     t7, t4, [d, #-56]
        ldp     t4, t5, [s, #-48]
        str     t6, [d, #-64]!
        ldp     t6, t7, [s, #-64]!
        subs    count, count, #8
        bhs     unal_bwd_copy_again

unal_bwd_copy_drain:
        str     t1, [d, #-8]
        stp     t3, t0, [d, #-24]
        stp     t5, t2, [d, #-40]
        stp     t7, t4, [d, #-56]
        str     t6, [d, #-64]!

        // count is now -8..-1 for 0..7 words to copy
        adr     t0, 0f
        add     t0, t0, count, lsl #5
        br      t0

        .align  5
        ret                             // -8 == 0 words
        .align  5
        ldr     t0, [s, #-8]            // -7 == 1 word
        str     t0, [d, #-8]
        ret
        .align  5
        ldp     t0, t1, [s, #-16]       // -6 = 2 words
        str     t1, [d, #-8]
        str     t0, [d, #-16]
        ret
        .align  5
        ldp     t0, t1, [s, #-16]       // -5 = 3 words
        ldr     t2, [s, #-24]
        str     t1, [d, #-8]
        stp     t2, t0, [d, #-24]
        ret
        .align  5
        ldp     t0, t1, [s, #-16]       // -4 = 4 words
        ldp     t2, t3, [s, #-32]
        str     t1, [d, #-8]
        stp     t3, t0, [d, #-24]
        str     t2, [d, #-32]
        ret
        .align  5
        ldp     t0, t1, [s, #-16]       // -3 = 5 words
        ldp     t2, t3, [s, #-32]
        ldr     t4, [s, #-40]
        str     t1, [d, #-8]
        stp     t3, t0, [d, #-24]
        stp     t4, t2, [d, #-40]
        ret
        .align  5
        ldp     t0, t1, [s, #-16]       // -2 = 6 words
        ldp     t2, t3, [s, #-32]
        ldp     t4, t5, [s, #-48]
        str     t1, [d, #-8]
        stp     t3, t0, [d, #-24]
        stp     t5, t2, [d, #-40]
        str     t4, [d, #-48]
        ret
        .align  5
        ldp     t0, t1, [s, #-16]       // -1 = 7 words
        ldp     t2, t3, [s, #-32]
        ldp     t4, t5, [s, #-48]
        ldr     t6, [s, #-56]
        str     t1, [d, #-8]
        stp     t3, t0, [d, #-24]
        stp     t5, t2, [d, #-40]
        stp     t6, t4, [d, #-56]
        // Is always aligned here, code for 7 words is one instruction
        // too large so it just falls through.
        .align  5
0:
        ret