1 # 
   2 # Copyright (c) 2008, 2013, Oracle and/or its affiliates. All rights reserved.
   3 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4 #
   5 # This code is free software; you can redistribute it and/or modify it
   6 # under the terms of the GNU General Public License version 2 only, as
   7 # published by the Free Software Foundation.
   8 #
   9 # This code is distributed in the hope that it will be useful, but WITHOUT
  10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 # FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12 # version 2 for more details (a copy is included in the LICENSE file that
  13 # accompanied this code).
  14 #
  15 # You should have received a copy of the GNU General Public License version
  16 # 2 along with this work; if not, write to the Free Software Foundation,
  17 # Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18 #
  19 # Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20 # or visit www.oracle.com if you need additional information or have any
  21 # questions.
  22 # 
  23 
  24         # TODO-AARCH64
  25         
  26         # NOTE WELL!  The _Copy functions are called directly
  27         # from server-compiler-generated code via CallLeafNoFP,
  28         # which means that they *must* either not use floating
  29         # point or use it in the same manner as does the server
  30         # compiler.
  31         
  32         .globl _Copy_conjoint_bytes
  33         .type _Copy_conjoint_bytes, %function
  34         .globl _Copy_arrayof_conjoint_bytes
  35         .type _Copy_arrayof_conjoint_bytes, %function
  36         .globl _Copy_disjoint_words
  37         .type _Copy_disjoint_words, %function
  38         .globl _Copy_conjoint_words
  39         .type _Copy_conjoint_words, %function
  40         .globl _Copy_conjoint_jshorts_atomic
  41         .type _Copy_conjoint_jshorts_atomic, %function
  42         .globl _Copy_arrayof_conjoint_jshorts
  43         .type _Copy_arrayof_conjoint_jshorts, %function
  44         .globl _Copy_conjoint_jints_atomic
  45         .type _Copy_conjoint_jints_atomic, %function
  46         .globl _Copy_arrayof_conjoint_jints
  47         .type _Copy_arrayof_conjoint_jints, %function
  48         .globl _Copy_conjoint_jlongs_atomic
  49         .type _Copy_conjoint_jlongs_atomic, %function
  50         .globl _Copy_arrayof_conjoint_jlongs
  51         .type _Copy_arrayof_conjoint_jlongs, %function
  52 
  53         .text
  54         .globl  SpinPause
  55         .type SpinPause, %function
  56 SpinPause:
  57         yield
  58         ret
  59 
  60         # Support for void Copy::conjoint_bytes(void* from,
  61         #                                       void* to,
  62         #                                       size_t count)
  63 _Copy_conjoint_bytes:
  64         hlt 1002
  65 
  66         # Support for void Copy::arrayof_conjoint_bytes(void* from,
  67         #                                               void* to,
  68         #                                               size_t count)
  69 _Copy_arrayof_conjoint_bytes:
  70         hlt 1003
  71 
  72 
  73         # Support for void Copy::disjoint_words(void* from,
  74         #                                       void* to,
  75         #                                       size_t count)
  76 _Copy_disjoint_words:
  77         # These and further memory prefetches may hit out of array ranges.
  78         # Experiments showed that prefetching of inaccessible memory doesn't result in exceptions.
  79         prfm    pldl1keep,  [x0, #0]
  80         prfm    pstl1keep,  [x1, #0]
  81         prfm    pldl1keep,  [x0, #64]
  82         prfm    pstl1keep,  [x1, #64]
  83 
  84         subs    x18, x2,  #128
  85         b.ge    dw_large
  86 
  87 dw_lt_128:
  88         # Copy [x0, x0 + x2) to [x1, x1 + x2)
  89         
  90         adr     x15,  dw_tail_table_base
  91         and     x16,  x2,  #~8
  92 
  93         # Calculate address to jump and store it to x15:
  94         #   Each pair of instructions before dw_tail_table_base copies 16 bytes.
  95         #   x16 is count of bytes to copy aligned down by 16.
  96         #   So x16/16 pairs of instructions should be executed. 
  97         #   Each pair takes 8 bytes, so x15 = dw_tail_table_base - (x16/16)*8 = x15 - x16/2
  98         sub     x15,  x15, x16, lsr #1
  99         prfm    plil1keep, [x15]
 100     
 101         add     x17,  x0,  x2
 102         add     x18,  x1,  x2
 103 
 104         # If x2 = x16 + 8, then copy 8 bytes and x16 bytes after that.
 105         # Otherwise x2 = x16, so proceed to copy x16 bytes.
 106         tbz     x2, #3, dw_lt_128_even
 107         ldr     x3, [x0]
 108         str     x3, [x1]
 109 dw_lt_128_even:
 110         # Copy [x17 - x16, x17) to [x18 - x16, x18)
 111         # x16 is aligned by 16 and less than 128
 112 
 113         # Execute (x16/16) ldp-stp pairs; each pair copies 16 bytes
 114         br      x15
 115 
 116         ldp     x3,  x4,  [x17, #-112]
 117         stp     x3,  x4,  [x18, #-112]
 118         ldp     x5,  x6,  [x17, #-96]
 119         stp     x5,  x6,  [x18, #-96]
 120         ldp     x7,  x8,  [x17, #-80]
 121         stp     x7,  x8,  [x18, #-80]
 122         ldp     x9,  x10, [x17, #-64]
 123         stp     x9,  x10, [x18, #-64]
 124         ldp     x11, x12, [x17, #-48]
 125         stp     x11, x12, [x18, #-48]
 126         ldp     x13, x14, [x17, #-32]
 127         stp     x13, x14, [x18, #-32]
 128         ldp     x15, x16, [x17, #-16]
 129         stp     x15, x16, [x18, #-16]
 130 dw_tail_table_base:
 131         ret
 132 
 133 .p2align  6
 134 .rept   12
 135         nop
 136 .endr
 137 dw_large:
 138         # x18 >= 0;
 139         # Copy [x0, x0 + x18 + 128) to [x1, x1 + x18 + 128)
 140 
 141         ldp     x3,  x4,  [x0], #64
 142         ldp     x5,  x6,  [x0, #-48]
 143         ldp     x7,  x8,  [x0, #-32]
 144         ldp     x9,  x10, [x0, #-16]
 145 
 146         # Before and after each iteration of loop registers x3-x10 contain [x0 - 64, x0),
 147         # and x1 is a place to copy this data;
 148         # x18 contains number of bytes to be stored minus 128
 149 
 150         # Exactly 16 instructions from p2align, so dw_loop starts from cache line boundary
 151         # Checking it explictly by aligning with "hlt 1000" instructions 
 152 .p2alignl  6, 0xd4407d00
 153 dw_loop:
 154         prfm    pldl1keep,  [x0, #64]
 155         # Next line actually hurted memory copy performance (for interpreter) - JDK-8078120
 156         # prfm    pstl1keep,  [x1, #64]
 157 
 158         subs    x18, x18, #64
 159 
 160         stp     x3,  x4,  [x1, #0]
 161         ldp     x3,  x4,  [x0, #0]
 162         stp     x5,  x6,  [x1, #16]
 163         ldp     x5,  x6,  [x0, #16]
 164         stp     x7,  x8,  [x1, #32]
 165         ldp     x7,  x8,  [x0, #32]
 166         stp     x9,  x10, [x1, #48]
 167         ldp     x9,  x10, [x0, #48]
 168         
 169         add     x1,  x1,  #64
 170         add     x0,  x0,  #64
 171 
 172         b.ge    dw_loop
 173 
 174         # 13 instructions from dw_loop, so the loop body hits into one cache line
 175 
 176 dw_loop_end:
 177         adds    x2,  x18, #64
 178 
 179         stp     x3,  x4,  [x1], #64
 180         stp     x5,  x6,  [x1, #-48]
 181         stp     x7,  x8,  [x1, #-32]
 182         stp     x9,  x10, [x1, #-16]
 183 
 184         # Increased x18 by 64, but stored 64 bytes, so x2 contains exact number of bytes to be stored
 185 
 186         # If this number is not zero, also copy remaining bytes
 187         b.ne    dw_lt_128
 188         ret
 189 
 190 
 191         # Support for void Copy::conjoint_words(void* from,
 192         #                                       void* to,
 193         #                                       size_t count)
 194 _Copy_conjoint_words:
 195         subs    x3, x1, x0
 196         # hi condition is met <=> from < to
 197         ccmp    x2, x3, #0, hi
 198         # hi condition is met <=> (from < to) and (to - from < count)
 199         # otherwise _Copy_disjoint_words may be used, because it performs forward copying,
 200         # so it also works when ranges overlap but to <= from
 201         b.ls    _Copy_disjoint_words
 202 
 203         # Overlapping case should be the rare one, it does not worth optimizing
 204 
 205         ands    x3,  x2,  #~8
 206         # x3 is count aligned down by 2*wordSize
 207         add     x0,  x0,  x2
 208         add     x1,  x1,  x2
 209         sub     x3,  x3,  #16
 210         # Skip loop if 0 or 1 words
 211         b.eq    cw_backward_loop_end
 212 
 213         # x3 >= 0
 214         # Copy [x0 - x3 - 16, x0) to [x1 - x3 - 16, x1) backward
 215 cw_backward_loop:
 216         subs    x3,  x3,  #16
 217         ldp     x4,  x5,  [x0, #-16]!
 218         stp     x4,  x5,  [x1, #-16]!
 219         b.ge    cw_backward_loop
 220 
 221 cw_backward_loop_end:
 222         # Copy remaining 0 or 1 words
 223         tbz     x2,  #3,  cw_finish
 224         ldr     x3, [x0, #-8]
 225         str     x3, [x1, #-8]
 226 
 227 cw_finish:
 228         ret
 229 
 230 
 231         # Support for void Copy::conjoint_jshorts_atomic(void* from,
 232         #                                                void* to,
 233         #                                                size_t count)
 234 _Copy_conjoint_jshorts_atomic:
 235         add     x17, x0, x2
 236         add     x18, x1, x2
 237 
 238         subs    x3, x1, x0
 239         # hi is met <=> (from < to) and (to - from < count)
 240         ccmp    x2, x3, #0, hi
 241         b.hi    cs_backward
 242         
 243         subs    x3, x2, #14
 244         b.ge    cs_forward_loop
 245 
 246         # Copy x2 < 14 bytes from x0 to x1
 247 cs_forward_lt14:
 248         ands    x7, x2, #7
 249         tbz     x2, #3, cs_forward_lt8
 250         ldrh    w3, [x0, #0]
 251         ldrh    w4, [x0, #2]
 252         ldrh    w5, [x0, #4]
 253         ldrh    w6, [x0, #6]
 254 
 255         strh    w3, [x1, #0]
 256         strh    w4, [x1, #2]
 257         strh    w5, [x1, #4]
 258         strh    w6, [x1, #6]
 259 
 260         # Copy x7 < 8 bytes from x17 - x7 to x18 - x7
 261 cs_forward_lt8:
 262         b.eq    cs_forward_0
 263         cmp     x7, #4
 264         b.lt    cs_forward_2
 265         b.eq    cs_forward_4
 266 
 267 cs_forward_6:
 268         ldrh    w3, [x17, #-6]
 269         strh    w3, [x18, #-6]
 270 cs_forward_4:
 271         ldrh    w4, [x17, #-4]
 272         strh    w4, [x18, #-4]
 273 cs_forward_2:
 274         ldrh    w5, [x17, #-2]
 275         strh    w5, [x18, #-2]
 276 cs_forward_0:
 277         ret
 278 
 279 
 280         # Copy [x0, x0 + x3 + 14) to [x1, x1 + x3 + 14)
 281         # x3 >= 0
 282 .p2align 6
 283 cs_forward_loop:
 284         subs    x3, x3, #14
 285         
 286         ldrh    w4, [x0], #14
 287         ldrh    w5, [x0, #-12]
 288         ldrh    w6, [x0, #-10]
 289         ldrh    w7, [x0, #-8]
 290         ldrh    w8, [x0, #-6]
 291         ldrh    w9, [x0, #-4]
 292         ldrh    w10, [x0, #-2]
 293 
 294         strh    w4, [x1], #14
 295         strh    w5, [x1, #-12]
 296         strh    w6, [x1, #-10]
 297         strh    w7, [x1, #-8]
 298         strh    w8, [x1, #-6]
 299         strh    w9, [x1, #-4]
 300         strh    w10, [x1, #-2]
 301 
 302         b.ge    cs_forward_loop
 303         # Exactly 16 instruction from cs_forward_loop, so loop fits into one cache line
 304 
 305         adds    x2, x3, #14
 306         # x2 bytes should be copied from x0 to x1
 307         b.ne    cs_forward_lt14
 308         ret
 309         
 310         # Very similar to forward copying
 311 cs_backward:
 312         subs    x3, x2, #14
 313         b.ge    cs_backward_loop
 314 
 315 cs_backward_lt14:
 316         ands    x7, x2, #7
 317         tbz     x2, #3, cs_backward_lt8
 318 
 319         ldrh    w3, [x17, #-8]
 320         ldrh    w4, [x17, #-6]
 321         ldrh    w5, [x17, #-4]
 322         ldrh    w6, [x17, #-2]
 323         
 324         strh    w3, [x18, #-8]
 325         strh    w4, [x18, #-6]
 326         strh    w5, [x18, #-4]
 327         strh    w6, [x18, #-2]
 328 
 329 cs_backward_lt8:
 330         b.eq    cs_backward_0
 331         cmp     x7, #4
 332         b.lt    cs_backward_2
 333         b.eq    cs_backward_4
 334 
 335 cs_backward_6:
 336         ldrh    w3, [x0, #4]
 337         strh    w3, [x1, #4]
 338 
 339 cs_backward_4:
 340         ldrh    w4, [x0, #2]
 341         strh    w4, [x1, #2]
 342 
 343 cs_backward_2:
 344         ldrh    w5, [x0, #0]
 345         strh    w5, [x1, #0]
 346 
 347 cs_backward_0:
 348         ret
 349 
 350 
 351 .p2align 6
 352 cs_backward_loop:
 353         subs    x3, x3, #14
 354 
 355         ldrh    w4, [x17, #-14]!
 356         ldrh    w5, [x17, #2]
 357         ldrh    w6, [x17, #4]
 358         ldrh    w7, [x17, #6]
 359         ldrh    w8, [x17, #8]
 360         ldrh    w9, [x17, #10]
 361         ldrh    w10, [x17, #12]
 362 
 363         strh    w4, [x18, #-14]!
 364         strh    w5, [x18, #2]
 365         strh    w6, [x18, #4]
 366         strh    w7, [x18, #6]
 367         strh    w8, [x18, #8]
 368         strh    w9, [x18, #10]
 369         strh    w10, [x18, #12]
 370 
 371         b.ge    cs_backward_loop
 372         adds    x2, x3, #14
 373         b.ne    cs_backward_lt14
 374         ret
 375 
 376 
 377         # Support for void Copy::arrayof_conjoint_jshorts(void* from,
 378         #                                                 void* to,
 379         #                                                 size_t count)
 380 _Copy_arrayof_conjoint_jshorts:
 381         hlt 1007
 382 
 383 
 384         # Support for void Copy::conjoint_jlongs_atomic(jlong* from,
 385         #                                               jlong* to,
 386         #                                               size_t count)
 387 _Copy_conjoint_jlongs_atomic:
 388 _Copy_arrayof_conjoint_jlongs:
 389         hlt 1009
 390 
 391 
 392         # Support for void Copy::conjoint_jints_atomic(void* from,
 393         #                                              void* to,
 394         #                                              size_t count)
 395 _Copy_conjoint_jints_atomic:
 396 _Copy_arrayof_conjoint_jints:
 397         # These and further memory prefetches may hit out of array ranges.
 398         # Experiments showed that prefetching of inaccessible memory doesn't result in exceptions.
 399         prfm    pldl1keep,  [x0, #0]
 400         prfm    pstl1keep,  [x1, #0]
 401         prfm    pldl1keep,  [x0, #32]
 402         prfm    pstl1keep,  [x1, #32]
 403 
 404         subs    x3, x1, x0
 405         # hi condition is met <=> from < to
 406         ccmp    x2, x3, #0, hi
 407         # hi condition is met <=> (from < to) and (to - from < count)
 408         b.hi    ci_backward
 409 
 410         subs    x18, x2,  #64
 411         b.ge    ci_forward_large
 412 
 413 ci_forward_lt_64:
 414         # Copy [x0, x0 + x2) to [x1, x1 + x2)
 415         
 416         adr     x15,  ci_forward_tail_table_base
 417         and     x16,  x2,  #~4
 418 
 419         # Calculate address to jump and store it to x15:
 420         #   Each pair of instructions before ci_forward_tail_table_base copies 8 bytes.
 421         #   x16 is count of bytes to copy aligned down by 8.
 422         #   So x16/8 pairs of instructions should be executed. 
 423         #   Each pair takes 8 bytes, so x15 = ci_forward_tail_table_base - (x16/8)*8 = x15 - x16
 424         sub     x15,  x15, x16
 425         prfm    plil1keep, [x15]
 426     
 427         add     x17,  x0,  x2
 428         add     x18,  x1,  x2
 429 
 430         # If x2 = x16 + 4, then copy 4 bytes and x16 bytes after that.
 431         # Otherwise x2 = x16, so proceed to copy x16 bytes.
 432         tbz     x2, #2, ci_forward_lt_64_even
 433         ldr     w3, [x0]
 434         str     w3, [x1]
 435 ci_forward_lt_64_even:
 436         # Copy [x17 - x16, x17) to [x18 - x16, x18)
 437         # x16 is aligned by 8 and less than 64
 438 
 439         # Execute (x16/8) ldp-stp pairs; each pair copies 8 bytes
 440         br      x15
 441 
 442         ldp     w3,  w4,  [x17, #-56]
 443         stp     w3,  w4,  [x18, #-56]
 444         ldp     w5,  w6,  [x17, #-48]
 445         stp     w5,  w6,  [x18, #-48]
 446         ldp     w7,  w8,  [x17, #-40]
 447         stp     w7,  w8,  [x18, #-40]
 448         ldp     w9,  w10, [x17, #-32]
 449         stp     w9,  w10, [x18, #-32]
 450         ldp     w11, w12, [x17, #-24]
 451         stp     w11, w12, [x18, #-24]
 452         ldp     w13, w14, [x17, #-16]
 453         stp     w13, w14, [x18, #-16]
 454         ldp     w15, w16, [x17, #-8]
 455         stp     w15, w16, [x18, #-8]
 456 ci_forward_tail_table_base:
 457         ret
 458 
 459 .p2align  6
 460 .rept   12
 461         nop
 462 .endr
 463 ci_forward_large:
 464         # x18 >= 0;
 465         # Copy [x0, x0 + x18 + 64) to [x1, x1 + x18 + 64)
 466 
 467         ldp     w3,  w4,  [x0], #32
 468         ldp     w5,  w6,  [x0, #-24]
 469         ldp     w7,  w8,  [x0, #-16]
 470         ldp     w9,  w10, [x0, #-8]
 471 
 472         # Before and after each iteration of loop registers w3-w10 contain [x0 - 32, x0),
 473         # and x1 is a place to copy this data;
 474         # x18 contains number of bytes to be stored minus 64
 475 
 476         # Exactly 16 instructions from p2align, so ci_forward_loop starts from cache line boundary
 477         # Checking it explictly by aligning with "hlt 1000" instructions 
 478 .p2alignl  6, 0xd4407d00
 479 ci_forward_loop:
 480         prfm    pldl1keep,  [x0, #32]
 481         prfm    pstl1keep,  [x1, #32]
 482 
 483         subs    x18, x18, #32
 484 
 485         stp     w3,  w4,  [x1, #0]
 486         ldp     w3,  w4,  [x0, #0]
 487         stp     w5,  w6,  [x1, #8]
 488         ldp     w5,  w6,  [x0, #8]
 489         stp     w7,  w8,  [x1, #16]
 490         ldp     w7,  w8,  [x0, #16]
 491         stp     w9,  w10, [x1, #24]
 492         ldp     w9,  w10, [x0, #24]
 493         
 494         add     x1,  x1,  #32
 495         add     x0,  x0,  #32
 496 
 497         b.ge    ci_forward_loop
 498 
 499         # 14 instructions from ci_forward_loop, so the loop body hits into one cache line
 500 
 501 ci_forward_loop_end:
 502         adds    x2,  x18, #32
 503 
 504         stp     w3,  w4,  [x1], #32
 505         stp     w5,  w6,  [x1, #-24]
 506         stp     w7,  w8,  [x1, #-16]
 507         stp     w9,  w10, [x1, #-8]
 508 
 509         # Increased x18 by 32, but stored 32 bytes, so x2 contains exact number of bytes to be stored
 510 
 511         # If this number is not zero, also copy remaining bytes
 512         b.ne    ci_forward_lt_64
 513         ret
 514 
 515 ci_backward:
 516 
 517         # Overlapping case should be the rare one, it does not worth optimizing
 518 
 519         ands    x3,  x2,  #~4
 520         # x3 is count aligned down by 2*jintSize
 521         add     x0,  x0,  x2
 522         add     x1,  x1,  x2
 523         sub     x3,  x3,  #8
 524         # Skip loop if 0 or 1 jints
 525         b.eq    ci_backward_loop_end
 526 
 527         # x3 >= 0
 528         # Copy [x0 - x3 - 8, x0) to [x1 - x3 - 8, x1) backward
 529 ci_backward_loop:
 530         subs    x3,  x3,  #8
 531         ldp     w4,  w5,  [x0, #-8]!
 532         stp     w4,  w5,  [x1, #-8]!
 533         b.ge    ci_backward_loop
 534 
 535 ci_backward_loop_end:
 536         # Copy remaining 0 or 1 jints
 537         tbz     x2,  #2,  ci_backward_finish
 538         ldr     w3, [x0, #-4]
 539         str     w3, [x1, #-4]
 540 
 541 ci_backward_finish:
 542         ret