#
# Copyright (c) 2008, 2013, Oracle and/or its affiliates. All rights reserved.
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
#
# This code is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License version 2 only, as
# published by the Free Software Foundation.
#
# This code is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
# version 2 for more details (a copy is included in the LICENSE file that
# accompanied this code).
#
# You should have received a copy of the GNU General Public License version
# 2 along with this work; if not, write to the Free Software Foundation,
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
#
# Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
# or visit www.oracle.com if you need additional information or have any
# questions.
#

        # TODO-AARCH64

        # NOTE WELL!  The _Copy functions are called directly
        # from server-compiler-generated code via CallLeafNoFP,
        # which means that they *must* either not use floating
        # point or use it in the same manner as does the server
        # compiler.
# Symbol declarations: each _Copy_* entry point defined in this file is
# exported and typed as a function.

        .globl _Copy_conjoint_bytes
        .type _Copy_conjoint_bytes, %function
        .globl _Copy_arrayof_conjoint_bytes
        .type _Copy_arrayof_conjoint_bytes, %function
        .globl _Copy_disjoint_words
        .type _Copy_disjoint_words, %function
        .globl _Copy_conjoint_words
        .type _Copy_conjoint_words, %function
        .globl _Copy_conjoint_jshorts_atomic
        .type _Copy_conjoint_jshorts_atomic, %function
        .globl _Copy_arrayof_conjoint_jshorts
        .type _Copy_arrayof_conjoint_jshorts, %function
        .globl _Copy_conjoint_jints_atomic
        .type _Copy_conjoint_jints_atomic, %function
        .globl _Copy_arrayof_conjoint_jints
        .type _Copy_arrayof_conjoint_jints, %function
        .globl _Copy_conjoint_jlongs_atomic
        .type _Copy_conjoint_jlongs_atomic, %function
        .globl _Copy_arrayof_conjoint_jlongs
        .type _Copy_arrayof_conjoint_jlongs, %function

        .text

        # SpinPause: executes a single YIELD hint and returns.
        # No general-purpose register is written here, so x0 still holds
        # whatever the caller left in it.
        .globl SpinPause
        .type SpinPause, %function
SpinPause:
        yield
        ret

        # Support for void Copy::conjoint_bytes(void* from,
        #                                       void* to,
        #                                       size_t count)
        #
        # Placeholder: the body is a single hlt with immediate 1002;
        # no copy is performed and there is no ret.
_Copy_conjoint_bytes:
        hlt     1002

        # Support for void Copy::arrayof_conjoint_bytes(void* from,
        #                                               void* to,
        #                                               size_t count)
        #
        # Placeholder: the body is a single hlt with immediate 1003;
        # no copy is performed and there is no ret.
_Copy_arrayof_conjoint_bytes:
        hlt     1003


        # Support for void Copy::disjoint_words(void* from,
        #                                       void* to,
        #                                       size_t count)
        #
        # Register use, as read from the code below:
        #   x0      - source cursor (from); advanced by the large-copy loop
        #   x1      - destination cursor (to); advanced by the large-copy loop
        #   x2      - length; it is added directly to x0/x1, i.e. it is
        #             treated as a byte count.
        #             NOTE(review): the tail logic only looks at bit 3 and at
        #             16-byte granules, so it presumably expects a multiple
        #             of 8 - confirm against the C++ caller.
        #   x3-x16  - data temporaries (x15/x16 first hold the computed jump
        #             target and the 16-byte-aligned length)
        #   x17/x18 - end of source / end of destination for the tail copy;
        #             x18 also holds "bytes left minus 128" in the large loop
        # Data is always copied from low to high addresses. The condition
        # flags are modified.
_Copy_disjoint_words:
        # These and further memory prefetches may hit out of array ranges.
        # Experiments showed that prefetching of inaccessible memory doesn't result in exceptions.
        prfm    pldl1keep,  [x0, #0]
        prfm    pstl1keep,  [x1, #0]
        prfm    pldl1keep,  [x0, #64]
        prfm    pstl1keep,  [x1, #64]

        # Lengths of 128 bytes or more go through the pipelined loop;
        # shorter ones fall into the jump-table tail below.
        subs    x18, x2,  #128
        b.ge    dw_large

dw_lt_128:
        # Copy [x0, x0 + x2) to [x1, x1 + x2)

        adr     x15, dw_tail_table_base
        and     x16, x2, #~8

        # Calculate address to jump and store it to x15:
        #   Each pair of instructions before dw_tail_table_base copies 16 bytes.
        #   x16 is count of bytes to copy aligned down by 16.
        #   So x16/16 pairs of instructions should be executed.
        #   Each pair takes 8 bytes, so x15 = dw_tail_table_base - (x16/16)*8 = x15 - x16/2
        sub     x15, x15, x16, lsr #1
        # Instruction prefetch of the jump target.
        prfm    plil1keep, [x15]

        # x17/x18 = one past the end of source/destination; the table below
        # addresses its data with negative offsets from these.
        add     x17, x0, x2
        add     x18, x1, x2

        # If x2 = x16 + 8, then copy 8 bytes and x16 bytes after that.
        # Otherwise x2 = x16, so proceed to copy x16 bytes.
        tbz     x2, #3, dw_lt_128_even
        ldr     x3, [x0]
        str     x3, [x1]
dw_lt_128_even:
        # Copy [x17 - x16, x17) to [x18 - x16, x18)
        # x16 is aligned by 16 and less than 128

        # Execute (x16/16) ldp-stp pairs; each pair copies 16 bytes
        br      x15

        # Jump table: entered at the pair that leaves exactly x16 bytes to
        # copy, then falls through to the ret. The last pair may reuse
        # x15/x16 as data registers because the branch has already been taken.
        ldp     x3,  x4,  [x17, #-112]
        stp     x3,  x4,  [x18, #-112]
        ldp     x5,  x6,  [x17, #-96]
        stp     x5,  x6,  [x18, #-96]
        ldp     x7,  x8,  [x17, #-80]
        stp     x7,  x8,  [x18, #-80]
        ldp     x9,  x10, [x17, #-64]
        stp     x9,  x10, [x18, #-64]
        ldp     x11, x12, [x17, #-48]
        stp     x11, x12, [x18, #-48]
        ldp     x13, x14, [x17, #-32]
        stp     x13, x14, [x18, #-32]
        ldp     x15, x16, [x17, #-16]
        stp     x15, x16, [x18, #-16]
dw_tail_table_base:
        ret

        # Padding: 12 nops plus the 4 ldp instructions at dw_large make up
        # 16 instructions (64 bytes), which places dw_loop on the next
        # 64-byte boundary.
        .p2align  6
        .rept   12
        nop
        .endr
dw_large:
        # x18 >= 0;
        # Copy [x0, x0 + x18 + 128) to [x1, x1 + x18 + 128)

        # Preload the first 64 bytes; x0 is post-incremented past them.
        ldp     x3,  x4,  [x0], #64
        ldp     x5,  x6,  [x0, #-48]
        ldp     x7,  x8,  [x0, #-32]
        ldp     x9,  x10, [x0, #-16]

        # Before and after each iteration of loop registers x3-x10 contain [x0 - 64, x0),
        # and x1 is a place to copy this data;
        # x18 contains number of bytes to be stored minus 128

        # Exactly 16 instructions from p2align, so dw_loop starts from cache line boundary
        # Checking it explicitly by aligning with "hlt 1000" instructions
        .p2alignl  6, 0xd4407d00
dw_loop:
        prfm    pldl1keep,  [x0, #64]
        # Next line actually hurt memory copy performance (for interpreter) - JDK-8078120
        # prfm    pstl1keep,  [x1, #64]

        # The flags set here are consumed by the b.ge at the bottom of the
        # loop; the loads, stores and plain adds in between do not change them.
        subs    x18, x18, #64

        # Store the 64 bytes loaded previously while loading the next 64.
        stp     x3,  x4,  [x1, #0]
        ldp     x3,  x4,  [x0, #0]
        stp     x5,  x6,  [x1, #16]
        ldp     x5,  x6,  [x0, #16]
        stp     x7,  x8,  [x1, #32]
        ldp     x7,  x8,  [x0, #32]
        stp     x9,  x10, [x1, #48]
        ldp     x9,  x10, [x0, #48]

        add     x1, x1, #64
        add     x0, x0, #64

        b.ge    dw_loop

        # The loop is 13 instructions starting at dw_loop, so the loop body
        # fits into one cache line

dw_loop_end:
        # The flags set here are consumed by the b.ne below; the stores in
        # between do not change them.
        adds    x2, x18, #64

        # Flush the 64 bytes still held in x3-x10.
        stp     x3,  x4,  [x1], #64
        stp     x5,  x6,  [x1, #-48]
        stp     x7,  x8,  [x1, #-32]
        stp     x9,  x10, [x1, #-16]

        # Increased x18 by 64, but stored 64 bytes, so x2 contains exact number of bytes to be stored

        # If this number is not zero, also copy remaining bytes
        b.ne    dw_lt_128
        ret


        # Support for void Copy::conjoint_words(void* from,
        #                                       void* to,
        #                                       size_t count)
        #
        # x0 = from, x1 = to, x2 = length (added directly to the pointers,
        # i.e. treated as a byte count, as in _Copy_disjoint_words).
        # Copies backward (high to low addresses) only when from < to and
        # to - from < count (unsigned comparisons); in every other case it
        # branches to _Copy_disjoint_words.
        # The backward path writes x0, x1, x3, x4, x5 and the flags.
_Copy_conjoint_words:
        subs    x3, x1, x0
        # hi condition is met <=> from < to
        ccmp    x2, x3, #0, hi
        # hi condition is met <=> (from < to) and (to - from < count)
        # otherwise _Copy_disjoint_words may be used, because it performs forward copying,
        # so it also works when ranges overlap but to <= from
        b.ls    _Copy_disjoint_words

        # Overlapping case should be the rare one, it is not worth optimizing

        # The b.eq below tests the Z flag produced by this ands: the add and
        # sub (no "s" suffix) instructions in between leave the flags alone.
        ands    x3, x2, #~8
        # x3 is count aligned down by 2*wordSize
        add     x0, x0, x2
        add     x1, x1, x2
        sub     x3, x3, #16
        # Skip loop if 0 or 1 words
        b.eq    cw_backward_loop_end

        # x3 >= 0
        # Copy [x0 - x3 - 16, x0) to [x1 - x3 - 16, x1) backward
cw_backward_loop:
        subs    x3, x3, #16
        ldp     x4, x5, [x0, #-16]!
        stp     x4, x5, [x1, #-16]!
        b.ge    cw_backward_loop

cw_backward_loop_end:
        # Copy remaining 0 or 1 words
        tbz     x2, #3, cw_finish
        ldr     x3, [x0, #-8]
        str     x3, [x1, #-8]

cw_finish:
        ret


        # Support for void Copy::conjoint_jshorts_atomic(void* from,
        #                                                void* to,
        #                                                size_t count)
        #
        # x0 = from, x1 = to, x2 = length (added directly to the pointers,
        # i.e. treated as a byte count).
        # NOTE(review): the tail code handles remainders of 0/2/4/6 bytes
        # only, so an even byte count is presumably guaranteed by the caller.
        # Every element is moved with a halfword load/store (ldrh/strh).
        # x17/x18 are set to the end of the source/destination. Backward
        # copying is used when from < to and to - from < count (unsigned),
        # forward copying otherwise.
        # Writes x0, x1, x2, x3-x10, x17, x18 and the flags.
_Copy_conjoint_jshorts_atomic:
        add     x17, x0, x2
        add     x18, x1, x2

        subs    x3, x1, x0
        # hi is met <=> (from < to) and (to - from < count)
        ccmp    x2, x3, #0, hi
        b.hi    cs_backward

        subs    x3, x2, #14
        b.ge    cs_forward_loop

        # Copy x2 < 14 bytes from x0 to x1
cs_forward_lt14:
        # x7 = x2 mod 8. The Z flag set here is still valid at the b.eq in
        # cs_forward_lt8: tbz, ldrh and strh do not modify the flags.
        ands    x7, x2, #7
        tbz     x2, #3, cs_forward_lt8
        ldrh    w3, [x0, #0]
        ldrh    w4, [x0, #2]
        ldrh    w5, [x0, #4]
        ldrh    w6, [x0, #6]

        strh    w3, [x1, #0]
        strh    w4, [x1, #2]
        strh    w5, [x1, #4]
        strh    w6, [x1, #6]

        # Copy x7 < 8 bytes from x17 - x7 to x18 - x7
cs_forward_lt8:
        b.eq    cs_forward_0
        cmp     x7, #4
        b.lt    cs_forward_2
        b.eq    cs_forward_4

        # Fall-through chain: entering at _6/_4/_2 copies the last 3/2/1
        # halfwords, addressed relative to the end pointers.
cs_forward_6:
        ldrh    w3, [x17, #-6]
        strh    w3, [x18, #-6]
cs_forward_4:
        ldrh    w4, [x17, #-4]
        strh    w4, [x18, #-4]
cs_forward_2:
        ldrh    w5, [x17, #-2]
        strh    w5, [x18, #-2]
cs_forward_0:
        ret


        # Copy [x0, x0 + x3 + 14) to [x1, x1 + x3 + 14)
        # x3 >= 0
        .p2align 6
cs_forward_loop:
        # The flags set here are consumed by the b.ge at the bottom of the loop.
        subs    x3, x3, #14

        # Seven halfwords per iteration; x0 and x1 are post-incremented by
        # the first load/store and the rest use negative offsets.
        ldrh    w4, [x0], #14
        ldrh    w5, [x0, #-12]
        ldrh    w6, [x0, #-10]
        ldrh    w7, [x0, #-8]
        ldrh    w8, [x0, #-6]
        ldrh    w9, [x0, #-4]
        ldrh    w10, [x0, #-2]

        strh    w4, [x1], #14
        strh    w5, [x1, #-12]
        strh    w6, [x1, #-10]
        strh    w7, [x1, #-8]
        strh    w8, [x1, #-6]
        strh    w9, [x1, #-4]
        strh    w10, [x1, #-2]

        b.ge    cs_forward_loop
        # Exactly 16 instructions from cs_forward_loop, so loop fits into one cache line

        adds    x2, x3, #14
        # x2 bytes should be copied from x0 to x1
        b.ne    cs_forward_lt14
        ret

        # Very similar to forward copying
        # Mirror image: the loop works downward from the end pointers
        # x17/x18, and the final remainder is taken from the start pointers
        # x0/x1, which the backward path never modifies.
cs_backward:
        subs    x3, x2, #14
        b.ge    cs_backward_loop

cs_backward_lt14:
        # As in the forward tail, the Z flag from this ands is consumed by
        # the b.eq in cs_backward_lt8.
        ands    x7, x2, #7
        tbz     x2, #3, cs_backward_lt8

        ldrh    w3, [x17, #-8]
        ldrh    w4, [x17, #-6]
        ldrh    w5, [x17, #-4]
        ldrh    w6, [x17, #-2]

        strh    w3, [x18, #-8]
        strh    w4, [x18, #-6]
        strh    w5, [x18, #-4]
        strh    w6, [x18, #-2]

cs_backward_lt8:
        b.eq    cs_backward_0
        cmp     x7, #4
        b.lt    cs_backward_2
        b.eq    cs_backward_4

        # Fall-through chain copying the first 3/2/1 halfwords, highest
        # address first.
cs_backward_6:
        ldrh    w3, [x0, #4]
        strh    w3, [x1, #4]

cs_backward_4:
        ldrh    w4, [x0, #2]
        strh    w4, [x1, #2]

cs_backward_2:
        ldrh    w5, [x0, #0]
        strh    w5, [x1, #0]

cs_backward_0:
        ret


        .p2align 6
cs_backward_loop:
        subs    x3, x3, #14

        # Seven halfwords per iteration; x17 and x18 are pre-decremented by
        # the first load/store and the rest use positive offsets.
        ldrh    w4, [x17, #-14]!
        ldrh    w5, [x17, #2]
        ldrh    w6, [x17, #4]
        ldrh    w7, [x17, #6]
        ldrh    w8, [x17, #8]
        ldrh    w9, [x17, #10]
        ldrh    w10, [x17, #12]

        strh    w4, [x18, #-14]!
        strh    w5, [x18, #2]
        strh    w6, [x18, #4]
        strh    w7, [x18, #6]
        strh    w8, [x18, #8]
        strh    w9, [x18, #10]
        strh    w10, [x18, #12]

        b.ge    cs_backward_loop
        # x2 = bytes still to copy; they lie at [x0, x0 + x2).
        adds    x2, x3, #14
        b.ne    cs_backward_lt14
        ret


        # Support for void Copy::arrayof_conjoint_jshorts(void* from,
        #                                                 void* to,
        #                                                 size_t count)
        #
        # Placeholder: the body is a single hlt with immediate 1007;
        # no copy is performed and there is no ret.
_Copy_arrayof_conjoint_jshorts:
        hlt     1007


        # Support for void Copy::conjoint_jlongs_atomic(jlong* from,
        #                                               jlong* to,
        #                                               size_t count)
        #
        # Both labels below name the same placeholder: a single hlt with
        # immediate 1009; no copy is performed and there is no ret.
_Copy_conjoint_jlongs_atomic:
_Copy_arrayof_conjoint_jlongs:
        hlt     1009


        # Support for void Copy::conjoint_jints_atomic(void* from,
        #                                              void* to,
        #                                              size_t count)
        #
        # Both labels below share this implementation.
        # x0 = from, x1 = to, x2 = length (added directly to the pointers,
        # i.e. treated as a byte count).
        # NOTE(review): the tail logic only looks at bit 2 and at 8-byte
        # granules, so it presumably expects a multiple of 4 - confirm
        # against the C++ caller.
        # All data is moved with 32-bit ldr/str or ldp/stp of w registers.
        # Backward copying is used when from < to and to - from < count
        # (unsigned), forward copying otherwise.
        # Writes x0, x1, x2, x3-x18 and the flags.
_Copy_conjoint_jints_atomic:
_Copy_arrayof_conjoint_jints:
        # These and further memory prefetches may hit out of array ranges.
        # Experiments showed that prefetching of inaccessible memory doesn't result in exceptions.
        prfm    pldl1keep,  [x0, #0]
        prfm    pstl1keep,  [x1, #0]
        prfm    pldl1keep,  [x0, #32]
        prfm    pstl1keep,  [x1, #32]

        subs    x3, x1, x0
        # hi condition is met <=> from < to
        ccmp    x2, x3, #0, hi
        # hi condition is met <=> (from < to) and (to - from < count)
        b.hi    ci_backward

        # Lengths of 64 bytes or more go through the pipelined loop;
        # shorter ones fall into the jump-table tail below.
        subs    x18, x2,  #64
        b.ge    ci_forward_large

ci_forward_lt_64:
        # Copy [x0, x0 + x2) to [x1, x1 + x2)

        adr     x15, ci_forward_tail_table_base
        and     x16, x2, #~4

        # Calculate address to jump and store it to x15:
        #   Each pair of instructions before ci_forward_tail_table_base copies 8 bytes.
        #   x16 is count of bytes to copy aligned down by 8.
        #   So x16/8 pairs of instructions should be executed.
        #   Each pair takes 8 bytes, so x15 = ci_forward_tail_table_base - (x16/8)*8 = x15 - x16
        sub     x15, x15, x16
        # Instruction prefetch of the jump target.
        prfm    plil1keep, [x15]

        # x17/x18 = one past the end of source/destination; the table below
        # addresses its data with negative offsets from these.
        add     x17, x0, x2
        add     x18, x1, x2

        # If x2 = x16 + 4, then copy 4 bytes and x16 bytes after that.
        # Otherwise x2 = x16, so proceed to copy x16 bytes.
        tbz     x2, #2, ci_forward_lt_64_even
        ldr     w3, [x0]
        str     w3, [x1]
ci_forward_lt_64_even:
        # Copy [x17 - x16, x17) to [x18 - x16, x18)
        # x16 is aligned by 8 and less than 64

        # Execute (x16/8) ldp-stp pairs; each pair copies 8 bytes
        br      x15

        # Jump table: entered at the pair that leaves exactly x16 bytes to
        # copy, then falls through to the ret. The last pair may reuse
        # w15/w16 as data registers because the branch has already been taken.
        ldp     w3,  w4,  [x17, #-56]
        stp     w3,  w4,  [x18, #-56]
        ldp     w5,  w6,  [x17, #-48]
        stp     w5,  w6,  [x18, #-48]
        ldp     w7,  w8,  [x17, #-40]
        stp     w7,  w8,  [x18, #-40]
        ldp     w9,  w10, [x17, #-32]
        stp     w9,  w10, [x18, #-32]
        ldp     w11, w12, [x17, #-24]
        stp     w11, w12, [x18, #-24]
        ldp     w13, w14, [x17, #-16]
        stp     w13, w14, [x18, #-16]
        ldp     w15, w16, [x17, #-8]
        stp     w15, w16, [x18, #-8]
ci_forward_tail_table_base:
        ret

        # Padding: 12 nops plus the 4 ldp instructions at ci_forward_large
        # make up 16 instructions (64 bytes), which places ci_forward_loop on
        # the next 64-byte boundary.
        .p2align  6
        .rept   12
        nop
        .endr
ci_forward_large:
        # x18 >= 0;
        # Copy [x0, x0 + x18 + 64) to [x1, x1 + x18 + 64)

        # Preload the first 32 bytes; x0 is post-incremented past them.
        ldp     w3,  w4,  [x0], #32
        ldp     w5,  w6,  [x0, #-24]
        ldp     w7,  w8,  [x0, #-16]
        ldp     w9,  w10, [x0, #-8]

        # Before and after each iteration of loop registers w3-w10 contain [x0 - 32, x0),
        # and x1 is a place to copy this data;
        # x18 contains number of bytes to be stored minus 64

        # Exactly 16 instructions from p2align, so ci_forward_loop starts from cache line boundary
        # Checking it explicitly by aligning with "hlt 1000" instructions
        .p2alignl  6, 0xd4407d00
ci_forward_loop:
        prfm    pldl1keep,  [x0, #32]
        prfm    pstl1keep,  [x1, #32]

        # The flags set here are consumed by the b.ge at the bottom of the
        # loop; the loads, stores and plain adds in between do not change them.
        subs    x18, x18, #32

        # Store the 32 bytes loaded previously while loading the next 32.
        stp     w3,  w4,  [x1, #0]
        ldp     w3,  w4,  [x0, #0]
        stp     w5,  w6,  [x1, #8]
        ldp     w5,  w6,  [x0, #8]
        stp     w7,  w8,  [x1, #16]
        ldp     w7,  w8,  [x0, #16]
        stp     w9,  w10, [x1, #24]
        ldp     w9,  w10, [x0, #24]

        add     x1, x1, #32
        add     x0, x0, #32

        b.ge    ci_forward_loop

        # The loop is 14 instructions starting at ci_forward_loop, so the
        # loop body fits into one cache line

ci_forward_loop_end:
        # The flags set here are consumed by the b.ne below; the stores in
        # between do not change them.
        adds    x2, x18, #32

        # Flush the 32 bytes still held in w3-w10.
        stp     w3,  w4,  [x1], #32
        stp     w5,  w6,  [x1, #-24]
        stp     w7,  w8,  [x1, #-16]
        stp     w9,  w10, [x1, #-8]

        # Increased x18 by 32, but stored 32 bytes, so x2 contains exact number of bytes to be stored

        # If this number is not zero, also copy remaining bytes
        b.ne    ci_forward_lt_64
        ret

ci_backward:

        # Overlapping case should be the rare one, it is not worth optimizing

        # The b.eq below tests the Z flag produced by this ands: the add and
        # sub (no "s" suffix) instructions in between leave the flags alone.
        ands    x3, x2, #~4
        # x3 is count aligned down by 2*jintSize
        add     x0, x0, x2
        add     x1, x1, x2
        sub     x3, x3, #8
        # Skip loop if 0 or 1 jints
        b.eq    ci_backward_loop_end

        # x3 >= 0
        # Copy [x0 - x3 - 8, x0) to [x1 - x3 - 8, x1) backward
ci_backward_loop:
        subs    x3, x3, #8
        ldp     w4, w5, [x0, #-8]!
        stp     w4, w5, [x1, #-8]!
        b.ge    ci_backward_loop

ci_backward_loop_end:
        # Copy remaining 0 or 1 jints
        tbz     x2, #2, ci_backward_finish
        ldr     w3, [x0, #-4]
        str     w3, [x1, #-4]

ci_backward_finish:
        ret