1 /* 2 * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 
23 * 24 */ 25 26 #include <sys/types.h> 27 28 #include "precompiled.hpp" 29 #include "jvm.h" 30 #include "asm/assembler.hpp" 31 #include "asm/assembler.inline.hpp" 32 #include "gc/shared/barrierSet.hpp" 33 #include "gc/shared/cardTable.hpp" 34 #include "gc/shared/barrierSetAssembler.hpp" 35 #include "gc/shared/cardTableBarrierSet.hpp" 36 #include "interpreter/interpreter.hpp" 37 #include "compiler/disassembler.hpp" 38 #include "memory/resourceArea.hpp" 39 #include "nativeInst_aarch64.hpp" 40 #include "oops/accessDecorators.hpp" 41 #include "oops/compressedOops.inline.hpp" 42 #include "oops/klass.inline.hpp" 43 #include "runtime/biasedLocking.hpp" 44 #include "runtime/icache.hpp" 45 #include "runtime/interfaceSupport.inline.hpp" 46 #include "runtime/jniHandles.inline.hpp" 47 #include "runtime/sharedRuntime.hpp" 48 #include "runtime/thread.hpp" 49 #ifdef COMPILER1 50 #include "c1/c1_LIRAssembler.hpp" 51 #endif 52 #ifdef COMPILER2 53 #include "oops/oop.hpp" 54 #include "opto/compile.hpp" 55 #include "opto/intrinsicnode.hpp" 56 #include "opto/node.hpp" 57 #endif 58 59 #ifdef PRODUCT 60 #define BLOCK_COMMENT(str) /* nothing */ 61 #define STOP(error) stop(error) 62 #else 63 #define BLOCK_COMMENT(str) block_comment(str) 64 #define STOP(error) block_comment(error); stop(error) 65 #endif 66 67 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 68 69 // Patch any kind of instruction; there may be several instructions. 70 // Return the total length (in bytes) of the instructions. 
// Patch the instruction(s) starting at 'branch' so they refer to 'target'.
// Dispatches on the A64 encoding of the first instruction to find the
// immediate field(s) to rewrite; may patch 1-3 consecutive instructions.
// Returns the total number of bytes patched.
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  long offset = (target - branch) >> 2;  // word (4-byte) offset for branch immediates
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target-branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);  // 1 => adrp (page-granular)
    if (shift) {
      u_int64_t dest = (u_int64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;   // offset within the 4K target page
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
      //
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
          Instruction_aarch64::extract(insn, 4, 0) ==
          Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        // The ldr/str immediate is scaled by the access size.
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32: the movk carries target bits 47:32; the adrp page
        // offset is then computed against a dest with the movk bits masked out.
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    // adr/adrp split the 21-bit immediate: low 2 bits in 30:29, high 19 in 23:5.
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant: movz + movk + movk, 16 bits per instruction.
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do -- polling page load (ldr xzr), never relocated
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}

// Patch an oop constant materialized by a movz/movk sequence at insn_addr.
// Narrow (compressed, 32-bit) oops use 2 instructions; wide (48-bit) oops
// use 3.  Returns the number of bytes patched.
int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits). We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = CompressedOops::encode((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}

// Patch a narrow klass constant into a movz/movk pair at insn_addr.
// Returns the number of bytes patched (always 2 instructions).
int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

// Decode the target address encoded by the instruction(s) at insn_addr.
// This is the inverse of pd_patch_instruction_size: it recognizes the
// same instruction forms and reconstructs the address they refer to.
address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing: reassemble the split 21-bit immediate
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;  // adrp vs adr
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
          Instruction_aarch64::extract(insn, 4, 0) ==
          Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
            Instruction_aarch64::extract(insn, 4, 0) ==
            Instruction_aarch64::extract(insn2, 4, 0)) {
          // movk provides target bits 47:32 (see pd_patch_instruction_size).
          target_page = (target_page & 0xffffffff) |
                        ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    u_int32_t *insns = (u_int32_t *)insn_addr;
    // Move wide constant: movz, movk, movk. See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // Polling page load (ldr xzr) carries no target.
    return 0;
  } else {
    ShouldNotReachHere();
  }
  return address(((uint64_t)insn_addr + (offset << 2)));
}

// Emit a safepoint check; branches to slow_path when a safepoint is pending.
// Uses either the thread-local polling word or the global safepoint state,
// depending on the SafepointMechanism in use.  Clobbers rscratch1.
void MacroAssembler::safepoint_poll(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    unsigned long offset;
    adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
    ldrw(rscratch1, Address(rscratch1, offset));
    assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
    cbnz(rscratch1, slow_path);
  }
}

// Just like safepoint_poll, but use an acquiring load for thread-
// local polling.
//
// We need an acquire here to ensure that any subsequent load of the
// global SafepointSynchronize::_state flag is ordered after this load
// of the local Thread::_polling page.  We don't want this poll to
// return false (i.e. not safepointing) and a later poll of the global
// SafepointSynchronize::_state spuriously to return true.
//
// This is to avoid a race when we're in a native->Java transition
// racing the code which wakes up from a safepoint.
//
void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
    ldar(rscratch1, rscratch1);  // load-acquire orders later global-state loads
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    safepoint_poll(slow_path);
  }
}

// Clear the current thread's last-Java-frame anchor (sp, pc and,
// optionally, fp) so stack walkers see no pending Java frame.
void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp, & resp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
345 void MacroAssembler::set_last_Java_frame(Register last_java_sp, 346 Register last_java_fp, 347 Register last_java_pc, 348 Register scratch) { 349 350 if (last_java_pc->is_valid()) { 351 str(last_java_pc, Address(rthread, 352 JavaThread::frame_anchor_offset() 353 + JavaFrameAnchor::last_Java_pc_offset())); 354 } 355 356 // determine last_java_sp register 357 if (last_java_sp == sp) { 358 mov(scratch, sp); 359 last_java_sp = scratch; 360 } else if (!last_java_sp->is_valid()) { 361 last_java_sp = esp; 362 } 363 364 str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset())); 365 366 // last_java_fp is optional 367 if (last_java_fp->is_valid()) { 368 str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset())); 369 } 370 } 371 372 void MacroAssembler::set_last_Java_frame(Register last_java_sp, 373 Register last_java_fp, 374 address last_java_pc, 375 Register scratch) { 376 assert(last_java_pc != NULL, "must provide a valid PC"); 377 378 adr(scratch, last_java_pc); 379 str(scratch, Address(rthread, 380 JavaThread::frame_anchor_offset() 381 + JavaFrameAnchor::last_Java_pc_offset())); 382 383 set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch); 384 } 385 386 void MacroAssembler::set_last_Java_frame(Register last_java_sp, 387 Register last_java_fp, 388 Label &L, 389 Register scratch) { 390 if (L.is_bound()) { 391 set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch); 392 } else { 393 InstructionMark im(this); 394 L.add_patch_at(code(), locator()); 395 set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, scratch); 396 } 397 } 398 399 void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) { 400 assert(ReservedCodeCacheSize < 4*G, "branch out of range"); 401 assert(CodeCache::find_blob(entry.target()) != NULL, 402 "destination of far call not found in code cache"); 403 if (far_branches()) { 404 unsigned long offset; 405 // We can use ADRP here because we know that the total size of 
406 // the code cache cannot exceed 2Gb. 407 adrp(tmp, entry, offset); 408 add(tmp, tmp, offset); 409 if (cbuf) cbuf->set_insts_mark(); 410 blr(tmp); 411 } else { 412 if (cbuf) cbuf->set_insts_mark(); 413 bl(entry); 414 } 415 } 416 417 void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) { 418 assert(ReservedCodeCacheSize < 4*G, "branch out of range"); 419 assert(CodeCache::find_blob(entry.target()) != NULL, 420 "destination of far call not found in code cache"); 421 if (far_branches()) { 422 unsigned long offset; 423 // We can use ADRP here because we know that the total size of 424 // the code cache cannot exceed 2Gb. 425 adrp(tmp, entry, offset); 426 add(tmp, tmp, offset); 427 if (cbuf) cbuf->set_insts_mark(); 428 br(tmp); 429 } else { 430 if (cbuf) cbuf->set_insts_mark(); 431 b(entry); 432 } 433 } 434 435 void MacroAssembler::reserved_stack_check() { 436 // testing if reserved zone needs to be enabled 437 Label no_reserved_zone_enabling; 438 439 ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset())); 440 cmp(sp, rscratch1); 441 br(Assembler::LO, no_reserved_zone_enabling); 442 443 enter(); // LR and FP are live. 444 lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone)); 445 mov(c_rarg0, rthread); 446 blr(rscratch1); 447 leave(); 448 449 // We have already removed our own frame. 450 // throw_delayed_StackOverflowError will think that it's been 451 // called by our caller. 
452 lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry())); 453 br(rscratch1); 454 should_not_reach_here(); 455 456 bind(no_reserved_zone_enabling); 457 } 458 459 int MacroAssembler::biased_locking_enter(Register lock_reg, 460 Register obj_reg, 461 Register swap_reg, 462 Register tmp_reg, 463 bool swap_reg_contains_mark, 464 Label& done, 465 Label* slow_case, 466 BiasedLockingCounters* counters) { 467 assert(UseBiasedLocking, "why call this otherwise?"); 468 assert_different_registers(lock_reg, obj_reg, swap_reg); 469 470 if (PrintBiasedLockingStatistics && counters == NULL) 471 counters = BiasedLocking::counters(); 472 473 assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg); 474 assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout"); 475 Address mark_addr (obj_reg, oopDesc::mark_offset_in_bytes()); 476 Address klass_addr (obj_reg, oopDesc::klass_offset_in_bytes()); 477 Address saved_mark_addr(lock_reg, 0); 478 479 // Biased locking 480 // See whether the lock is currently biased toward our thread and 481 // whether the epoch is still valid 482 // Note that the runtime guarantees sufficient alignment of JavaThread 483 // pointers to allow age to be placed into low bits 484 // First check to see whether biasing is even enabled for this object 485 Label cas_label; 486 int null_check_offset = -1; 487 if (!swap_reg_contains_mark) { 488 null_check_offset = offset(); 489 ldr(swap_reg, mark_addr); 490 } 491 andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place); 492 cmp(tmp_reg, (u1)markOopDesc::biased_lock_pattern); 493 br(Assembler::NE, cas_label); 494 // The bias pattern is present in the object's header. Need to check 495 // whether the bias owner and the epoch are both still current. 
496 load_prototype_header(tmp_reg, obj_reg); 497 orr(tmp_reg, tmp_reg, rthread); 498 eor(tmp_reg, swap_reg, tmp_reg); 499 andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place)); 500 if (counters != NULL) { 501 Label around; 502 cbnz(tmp_reg, around); 503 atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2); 504 b(done); 505 bind(around); 506 } else { 507 cbz(tmp_reg, done); 508 } 509 510 Label try_revoke_bias; 511 Label try_rebias; 512 513 // At this point we know that the header has the bias pattern and 514 // that we are not the bias owner in the current epoch. We need to 515 // figure out more details about the state of the header in order to 516 // know what operations can be legally performed on the object's 517 // header. 518 519 // If the low three bits in the xor result aren't clear, that means 520 // the prototype header is no longer biased and we have to revoke 521 // the bias on this object. 522 andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place); 523 cbnz(rscratch1, try_revoke_bias); 524 525 // Biasing is still enabled for this data type. See whether the 526 // epoch of the current bias is still valid, meaning that the epoch 527 // bits of the mark word are equal to the epoch bits of the 528 // prototype header. (Note that the prototype header's epoch bits 529 // only change at a safepoint.) If not, attempt to rebias the object 530 // toward the current thread. Note that we must be absolutely sure 531 // that the current epoch is invalid in order to do this because 532 // otherwise the manipulations it performs on the mark word are 533 // illegal. 534 andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place); 535 cbnz(rscratch1, try_rebias); 536 537 // The epoch of the current bias is still valid but we know nothing 538 // about the owner; it might be set or it might be clear. Try to 539 // acquire the bias of the object using an atomic operation. 
If this 540 // fails we will go in to the runtime to revoke the object's bias. 541 // Note that we first construct the presumed unbiased header so we 542 // don't accidentally blow away another thread's valid bias. 543 { 544 Label here; 545 mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place); 546 andr(swap_reg, swap_reg, rscratch1); 547 orr(tmp_reg, swap_reg, rthread); 548 cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case); 549 // If the biasing toward our thread failed, this means that 550 // another thread succeeded in biasing it toward itself and we 551 // need to revoke that bias. The revocation will occur in the 552 // interpreter runtime in the slow case. 553 bind(here); 554 if (counters != NULL) { 555 atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()), 556 tmp_reg, rscratch1, rscratch2); 557 } 558 } 559 b(done); 560 561 bind(try_rebias); 562 // At this point we know the epoch has expired, meaning that the 563 // current "bias owner", if any, is actually invalid. Under these 564 // circumstances _only_, we are allowed to use the current header's 565 // value as the comparison value when doing the cas to acquire the 566 // bias in the current epoch. In other words, we allow transfer of 567 // the bias from one thread to another directly in this situation. 568 // 569 // FIXME: due to a lack of registers we currently blow away the age 570 // bits in this situation. Should attempt to preserve them. 571 { 572 Label here; 573 load_prototype_header(tmp_reg, obj_reg); 574 orr(tmp_reg, rthread, tmp_reg); 575 cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case); 576 // If the biasing toward our thread failed, then another thread 577 // succeeded in biasing it toward itself and we need to revoke that 578 // bias. The revocation will occur in the runtime in the slow case. 
579 bind(here); 580 if (counters != NULL) { 581 atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()), 582 tmp_reg, rscratch1, rscratch2); 583 } 584 } 585 b(done); 586 587 bind(try_revoke_bias); 588 // The prototype mark in the klass doesn't have the bias bit set any 589 // more, indicating that objects of this data type are not supposed 590 // to be biased any more. We are going to try to reset the mark of 591 // this object to the prototype value and fall through to the 592 // CAS-based locking scheme. Note that if our CAS fails, it means 593 // that another thread raced us for the privilege of revoking the 594 // bias of this particular object, so it's okay to continue in the 595 // normal locking code. 596 // 597 // FIXME: due to a lack of registers we currently blow away the age 598 // bits in this situation. Should attempt to preserve them. 599 { 600 Label here, nope; 601 load_prototype_header(tmp_reg, obj_reg); 602 cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope); 603 bind(here); 604 605 // Fall through to the normal CAS-based lock, because no matter what 606 // the result of the above CAS, some thread must have succeeded in 607 // removing the bias bit from the object's header. 608 if (counters != NULL) { 609 atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg, 610 rscratch1, rscratch2); 611 } 612 bind(nope); 613 } 614 615 bind(cas_label); 616 617 return null_check_offset; 618 } 619 620 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) { 621 assert(UseBiasedLocking, "why call this otherwise?"); 622 623 // Check for biased locking unlock case, which is a no-op 624 // Note: we do not have to check the thread ID for two reasons. 625 // First, the interpreter checks for IllegalMonitorStateException at 626 // a higher level. 
Second, if the bias was revoked while we held the 627 // lock, the object could not be rebiased toward another thread, so 628 // the bias bit would be clear. 629 ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 630 andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 631 cmp(temp_reg, (u1)markOopDesc::biased_lock_pattern); 632 br(Assembler::EQ, done); 633 } 634 635 static void pass_arg0(MacroAssembler* masm, Register arg) { 636 if (c_rarg0 != arg ) { 637 masm->mov(c_rarg0, arg); 638 } 639 } 640 641 static void pass_arg1(MacroAssembler* masm, Register arg) { 642 if (c_rarg1 != arg ) { 643 masm->mov(c_rarg1, arg); 644 } 645 } 646 647 static void pass_arg2(MacroAssembler* masm, Register arg) { 648 if (c_rarg2 != arg ) { 649 masm->mov(c_rarg2, arg); 650 } 651 } 652 653 static void pass_arg3(MacroAssembler* masm, Register arg) { 654 if (c_rarg3 != arg ) { 655 masm->mov(c_rarg3, arg); 656 } 657 } 658 659 void MacroAssembler::call_VM_base(Register oop_result, 660 Register java_thread, 661 Register last_java_sp, 662 address entry_point, 663 int number_of_arguments, 664 bool check_exceptions) { 665 // determine java_thread register 666 if (!java_thread->is_valid()) { 667 java_thread = rthread; 668 } 669 670 // determine last_java_sp register 671 if (!last_java_sp->is_valid()) { 672 last_java_sp = esp; 673 } 674 675 // debugging support 676 assert(number_of_arguments >= 0 , "cannot have negative number of arguments"); 677 assert(java_thread == rthread, "unexpected register"); 678 #ifdef ASSERT 679 // TraceBytecodes does not use r12 but saves it over the call, so don't verify 680 // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?"); 681 #endif // ASSERT 682 683 assert(java_thread != oop_result , "cannot use the same register for java_thread & oop_result"); 684 assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp"); 685 686 // 
push java thread (becomes first argument of C function) 687 688 mov(c_rarg0, java_thread); 689 690 // set last Java frame before call 691 assert(last_java_sp != rfp, "can't use rfp"); 692 693 Label l; 694 set_last_Java_frame(last_java_sp, rfp, l, rscratch1); 695 696 // do the call, remove parameters 697 MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l); 698 699 // reset last Java frame 700 // Only interpreter should have to clear fp 701 reset_last_Java_frame(true); 702 703 // C++ interp handles this in the interpreter 704 check_and_handle_popframe(java_thread); 705 check_and_handle_earlyret(java_thread); 706 707 if (check_exceptions) { 708 // check for pending exceptions (java_thread is set upon return) 709 ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset()))); 710 Label ok; 711 cbz(rscratch1, ok); 712 lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry())); 713 br(rscratch1); 714 bind(ok); 715 } 716 717 // get oop result if there is one and reset the value in the thread 718 if (oop_result->is_valid()) { 719 get_vm_result(oop_result, java_thread); 720 } 721 } 722 723 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) { 724 call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions); 725 } 726 727 // Maybe emit a call via a trampoline. If the code cache is small 728 // trampolines won't be emitted. 729 730 address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) { 731 assert(JavaThread::current()->is_Compiler_thread(), "just checking"); 732 assert(entry.rspec().type() == relocInfo::runtime_call_type 733 || entry.rspec().type() == relocInfo::opt_virtual_call_type 734 || entry.rspec().type() == relocInfo::static_call_type 735 || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type"); 736 737 // We need a trampoline if branches are far. 
738 if (far_branches()) { 739 bool in_scratch_emit_size = false; 740 #ifdef COMPILER2 741 // We don't want to emit a trampoline if C2 is generating dummy 742 // code during its branch shortening phase. 743 CompileTask* task = ciEnv::current()->task(); 744 in_scratch_emit_size = 745 (task != NULL && is_c2_compile(task->comp_level()) && 746 Compile::current()->in_scratch_emit_size()); 747 #endif 748 if (!in_scratch_emit_size) { 749 address stub = emit_trampoline_stub(offset(), entry.target()); 750 if (stub == NULL) { 751 return NULL; // CodeCache is full 752 } 753 } 754 } 755 756 if (cbuf) cbuf->set_insts_mark(); 757 relocate(entry.rspec()); 758 if (!far_branches()) { 759 bl(entry.target()); 760 } else { 761 bl(pc()); 762 } 763 // just need to return a non-null address 764 return pc(); 765 } 766 767 768 // Emit a trampoline stub for a call to a target which is too far away. 769 // 770 // code sequences: 771 // 772 // call-site: 773 // branch-and-link to <destination> or <trampoline stub> 774 // 775 // Related trampoline stub for this call site in the stub section: 776 // load the call target from the constant pool 777 // branch (LR still points to the call site above) 778 779 address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset, 780 address dest) { 781 // Max stub size: alignment nop, TrampolineStub. 782 address stub = start_a_stub(NativeInstruction::instruction_size 783 + NativeCallTrampolineStub::instruction_size); 784 if (stub == NULL) { 785 return NULL; // CodeBuffer::expand failed 786 } 787 788 // Create a trampoline stub relocation which relates this trampoline stub 789 // with the call instruction at insts_call_instruction_offset in the 790 // instructions code-section. 
791 align(wordSize); 792 relocate(trampoline_stub_Relocation::spec(code()->insts()->start() 793 + insts_call_instruction_offset)); 794 const int stub_start_offset = offset(); 795 796 // Now, create the trampoline stub's code: 797 // - load the call 798 // - call 799 Label target; 800 ldr(rscratch1, target); 801 br(rscratch1); 802 bind(target); 803 assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset, 804 "should be"); 805 emit_int64((int64_t)dest); 806 807 const address stub_start_addr = addr_at(stub_start_offset); 808 809 assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline"); 810 811 end_a_stub(); 812 return stub_start_addr; 813 } 814 815 void MacroAssembler::c2bool(Register x) { 816 // implements x == 0 ? 0 : 1 817 // note: must only look at least-significant byte of x 818 // since C-style booleans are stored in one byte 819 // only! (was bug) 820 tst(x, 0xff); 821 cset(x, Assembler::NE); 822 } 823 824 address MacroAssembler::ic_call(address entry, jint method_index) { 825 RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index); 826 // address const_ptr = long_constant((jlong)Universe::non_oop_word()); 827 // unsigned long offset; 828 // ldr_constant(rscratch2, const_ptr); 829 movptr(rscratch2, (uintptr_t)Universe::non_oop_word()); 830 return trampoline_call(Address(entry, rh)); 831 } 832 833 // Implementation of call_VM versions 834 835 void MacroAssembler::call_VM(Register oop_result, 836 address entry_point, 837 bool check_exceptions) { 838 call_VM_helper(oop_result, entry_point, 0, check_exceptions); 839 } 840 841 void MacroAssembler::call_VM(Register oop_result, 842 address entry_point, 843 Register arg_1, 844 bool check_exceptions) { 845 pass_arg1(this, arg_1); 846 call_VM_helper(oop_result, entry_point, 1, check_exceptions); 847 } 848 849 void MacroAssembler::call_VM(Register oop_result, 850 address entry_point, 851 Register arg_1, 852 Register arg_2, 853 bool check_exceptions) { 
  // Args are passed in reverse order so an earlier pass_arg cannot clobber
  // a value still waiting in a lower-numbered c_rarg register.
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


// Fetch the oop result left by the VM call and clear the slot in the
// thread so stale values cannot be observed later.
void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

// Same as get_vm_result but for the metadata (non-oop) result slot;
// no oop verification applies here.
void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

// Pad the instruction stream with nops until the current offset is a
// multiple of modulus.
void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


// If the delayed value has already been computed, return it as a constant;
// otherwise emit an indirect load from its address (resolved at run time).
RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}


void MacroAssembler:: notify(int type) {
  if (type == bytecode_start) {
    // set_last_Java_frame(esp, rfp, (address)NULL);
    Assembler:: notify(type);
    // reset_last_Java_frame(true);
  }
  else
    Assembler:: notify(type);
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step = itableOffsetEntry::size() * wordSize;
  int vte_size = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  // lsl(3) scales the vtable length by wordSize (8) to get a byte offset.
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  // Peel the first iteration of the scan loop: the common case is a hit on
  // the very first itable entry, which then needs no backward branch.
  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel) break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  if (return_method) {
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    // Constant index: fold everything into one (possibly scratch-assisted)
    // addressing mode.
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result,
        form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
  }
}

// Convenience variant: branch to L_success if sub_klass is a subtype of
// super_klass, otherwise fall through (via the bound L_failure).
void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                        RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  // A super_check_offset of -1 (the default) means we must load it from
  // the super_klass below.
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  // Each of the three out-labels may be NULL, meaning "fall through";
  // at most one may be NULL at a time.
  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);
  cmp(super_klass, rscratch1); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    // Was the offset the secondary-super-cache slot?  If so we need the
    // slow path; any other offset means definite failure.
    subs(zr, super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// scans count pointer sized words at [addr] for occurence of value,
// generic
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

// scans count 4 byte words at [addr] for occurence of value,
// generic
void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
                                 Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  // NOTE(review): post-increment is wordSize (8) while the compare is a
  // 32-bit cmpw - presumably entries are word-spaced; confirm with callers.
  ldrw(scratch, post(addr, wordSize));
  cmpw(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  BLOCK_COMMENT("check_klass_subtype_slow_path");

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
  assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)

  RegSet pushed_registers;
  if (!IS_A_TEMP(r2))    pushed_registers += r2;
  if (!IS_A_TEMP(r5))    pushed_registers += r5;

  if (super_klass != r0 || UseCompressedOops) {
    if (!IS_A_TEMP(r0))   pushed_registers += r0;
  }

  push(pushed_registers, sp);

  // Get super_klass value into r0 (even if it was in r5 or r2).
  if (super_klass != r0) {
    mov(r0, super_klass);
  }

#ifndef PRODUCT
  // Bump the partial-subtype counter (diagnostics only, non-product builds).
  mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
  Address pst_counter_addr(rscratch2);
  ldr(rscratch1, pst_counter_addr);
  add(rscratch1, rscratch1, 1);
  str(rscratch1, pst_counter_addr);
#endif //PRODUCT

  // We will consult the secondary-super array.
  ldr(r5, secondary_supers_addr);
  // Load the array length.
  ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  add(r5, r5, Array<Klass*>::base_offset_in_bytes());

  cmp(sp, zr); // Clear Z flag; SP is never zero
  // Scan R2 words at [R5] for an occurrence of R0.
  // Set NZ/Z based on last compare.
  repne_scan(r5, r0, r2, rscratch1);

  // Unspill the temp. registers:
  pop(pushed_registers, sp);

  br(Assembler::NE, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  str(super_klass, super_cache_addr);

  if (L_success != &L_fallthrough) {
    b(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}


void MacroAssembler::verify_oop(Register reg, const char* s) {
  if (!VerifyOops || VerifyAdapterSharing) {
    // Below address of the code string confuses VerifyAdapterSharing
    // because it may differ between otherwise equivalent adapters.
    return;
  }

  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s", reg->name(), s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");

  // Preserve r0, rscratch1, rscratch2 and lr around the subroutine call.
  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  mov(r0, reg);
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop");
}

void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops || VerifyAdapterSharing) {
    // Below address of the code string confuses VerifyAdapterSharing
    // because it may differ between otherwise equivalent adapters.
    return;
  }

  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s", s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop_addr {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  // addr may contain sp so we will have to adjust it based on the
  // pushes that we just did.
  if (addr.uses(sp)) {
    lea(r0, addr);
    ldr(r0, Address(r0, 4 * wordSize));
  } else {
    ldr(r0, addr);
  }
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop_addr");
}

// Compute the address of an interpreter expression-stack slot, either as a
// constant-offset Address or via rscratch1 for a register slot index.
Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
                                         int extra_slot_offset) {
  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
  int stackElementSize = Interpreter::stackElementSize;
  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
#ifdef ASSERT
  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
  assert(offset1 - offset == stackElementSize, "correct arithmetic");
#endif
  if (arg_slot.is_constant()) {
    return Address(esp, arg_slot.as_constant() * stackElementSize
                   + offset);
  } else {
    add(rscratch1, esp, arg_slot.as_register(),
        ext::uxtx, exact_log2(stackElementSize));
    return Address(rscratch1, offset);
  }
}

void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments,
                                       Label *retaddr) {
  call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr);
}

void MacroAssembler::call_VM_leaf_base1(address entry_point,
                                        int number_of_gp_arguments,
                                        int number_of_fp_arguments,
                                        ret_type type,
                                        Label *retaddr) {
  Label E, L;

  // Preserve rscratch1 and rmethod across the leaf call.
  stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));

  // We add 1 to number_of_arguments because the thread in arg0 is
  // not counted
  mov(rscratch1, entry_point);
  blrt(rscratch1, number_of_gp_arguments + 1, number_of_fp_arguments, type);
  if (retaddr)
    bind(*retaddr);

  ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
  maybe_isb();
}

void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  call_VM_leaf_base(entry_point, number_of_arguments);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
                                  Register arg_1, Register arg_2) {
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  pass_arg2(this, arg_2);
  call_VM_leaf_base(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point) {
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  // Pass args last-to-first so earlier moves cannot smash later sources.
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
  assert(arg_0 != c_rarg3, "smashed arg");
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 4);
}

void MacroAssembler::null_check(Register reg, int offset) {
  if (needs_explicit_null_check(offset)) {
    // provoke OS NULL exception if reg = NULL by
    // accessing M[reg] w/o changing any registers
    // NOTE: this is plenty to provoke a segv
    ldr(zr, Address(reg));
  } else {
    // nothing to do, (later) access of M[reg + offset]
    // will provoke OS NULL exception if reg = NULL
  }
}

// Branch to is_value if klass has the JVM_ACC_VALUE access flag set.
void MacroAssembler::test_klass_is_value(Register klass, Register temp_reg, Label& is_value) {
  ldrw(temp_reg, Address(klass, Klass::access_flags_offset()));
  andr(temp_reg, temp_reg, JVM_ACC_VALUE);
  cbnz(temp_reg, is_value);
}

// Branch to is_flattenable if the cp-cache flags mark the field flattenable.
void MacroAssembler::test_field_is_flattenable(Register flags, Register temp_reg, Label& is_flattenable) {
  (void) temp_reg; // keep signature uniform with x86
  tbnz(flags, ConstantPoolCacheEntry::is_flattenable_field_shift, is_flattenable);
}

// Inverse of the above: branch when the flattenable bit is clear.
void MacroAssembler::test_field_is_not_flattenable(Register flags, Register temp_reg, Label& not_flattenable) {
  (void) temp_reg; // keep signature uniform with x86
  tbz(flags, ConstantPoolCacheEntry::is_flattenable_field_shift, not_flattenable);
}

// Branch to is_flattened if the cp-cache flags mark the field as flattened.
void MacroAssembler::test_field_is_flattened(Register flags, Register temp_reg, Label& is_flattened) {
  (void) temp_reg; // keep signature uniform with x86
  tbnz(flags, ConstantPoolCacheEntry::is_flattened_field_shift, is_flattened);
}

// Branch to is_flattened if klass's layout helper tags it as a flat
// (value-type element) array.
void MacroAssembler::test_flat_array_klass(Register klass, Register temp_reg, Label& is_flattened) {
  ldrw(temp_reg, Address(klass, Klass::layout_helper_offset()));
  asrw(temp_reg, temp_reg, Klass::_lh_array_tag_shift);
  cmpw(temp_reg, Klass::_lh_array_tag_vt_value);
  br(Assembler::EQ, is_flattened);
}

// Same check, starting from an oop: load its klass first.
void MacroAssembler::test_flat_array_oop(Register oop, Register temp_reg, Label& is_flattened) {
  load_klass(temp_reg, oop);
  test_flat_array_klass(temp_reg, temp_reg, is_flattened);
}

// MacroAssembler protected routines needed to implement
// public methods

void MacroAssembler::mov(Register r, Address dest) {
  // Record the Address's relocation, then materialize its target as a
  // patchable 48-bit immediate sequence (see movptr below).
  code_section()->relocate(pc(), dest.rspec());
  u_int64_t imm64 = (u_int64_t)dest.target();
  movptr(r, imm64);
}

// Move a constant pointer into r.  In AArch64 mode the virtual
// address space is 48 bits in size, so we only need three
// instructions to create a patchable instruction sequence that can
// reach anywhere.
void MacroAssembler::movptr(Register r, uintptr_t imm64) {
#ifndef PRODUCT
  {
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
    block_comment(buffer);
  }
#endif
  assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
  // Always emit exactly movz + movk + movk so the sequence is patchable.
  movz(r, imm64 & 0xffff);
  imm64 >>= 16;
  movk(r, imm64 & 0xffff, 16);
  imm64 >>= 16;
  movk(r, imm64 & 0xffff, 32);
}

// Macro to mov replicated immediate to vector register.
//  Vd will get the following values for different arrangements in T
//   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
//   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
//   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
//   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
//   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
//   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
//   T1D/T2D: invalid
void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
  assert(T != T1D && T != T2D, "invalid arrangement");
  if (T == T8B || T == T16B) {
    assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
    movi(Vd, T, imm32 & 0xff, 0);
    return;
  }
  u_int32_t nimm32 = ~imm32;
  if (T == T4H || T == T8H) {
    assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
    imm32 &= 0xffff;
    nimm32 &= 0xffff;
  }
  // Count the non-zero bytes of imm32 and of its complement, then choose
  // whichever form (movi/orri vs. mvni/bici) needs fewer instructions.
  u_int32_t x = imm32;
  int movi_cnt = 0;
  int movn_cnt = 0;
  while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
  x = nimm32;
  while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
  if (movn_cnt < movi_cnt) imm32 = nimm32;
  unsigned lsl = 0;
  while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
  if (movn_cnt < movi_cnt)
    mvni(Vd, T, imm32 & 0xff, lsl);
  else
    movi(Vd, T, imm32 & 0xff, lsl);
  imm32 >>= 8; lsl += 8;
  while (imm32) {
    while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
    if (movn_cnt < movi_cnt)
      bici(Vd, T, imm32 & 0xff, lsl);
    else
      orri(Vd, T, imm32 & 0xff, lsl);
    lsl += 8; imm32 >>= 8;
  }
}

// Materialize a 64-bit immediate using the shortest movz/movn/movk
// sequence (or a single ORR when it is a valid logical immediate).
void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
{
#ifndef PRODUCT
  {
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
    block_comment(buffer);
  }
#endif
  if (operand_valid_for_logical_immediate(false, imm64)) {
    orr(dst, zr, imm64);
  } else {
    // we can use a combination of MOVZ or MOVN with
    // MOVK to build up the constant
    u_int64_t imm_h[4];
    int zero_count = 0;
    int neg_count = 0;
    int i;
    // Split into four 16-bit halfwords and count the all-zero and
    // all-one halves to pick the cheaper of MOVZ- or MOVN-based forms.
    for (i = 0; i < 4; i++) {
      imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
      if (imm_h[i] == 0) {
        zero_count++;
      } else if (imm_h[i] == 0xffffL) {
        neg_count++;
      }
    }
    if (zero_count == 4) {
      // one MOVZ will do
      movz(dst, 0);
    } else if (neg_count == 4) {
      // one MOVN will do
      movn(dst, 0);
    } else if (zero_count == 3) {
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0L) {
          movz(dst, (u_int32_t)imm_h[i], (i << 4));
          break;
        }
      }
    } else if (neg_count == 3) {
      // one MOVN will do
      for (int i = 0; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
          break;
        }
      }
    } else if (zero_count == 2) {
      // one MOVZ and one MOVK will do
      for (i = 0; i < 3; i++) {
        if (imm_h[i] != 0L) {
          movz(dst, (u_int32_t)imm_h[i], (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0L) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else if (neg_count == 2) {
      // one MOVN and one MOVK will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else if (zero_count == 1) {
      // one MOVZ and two MOVKs will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0L) {
          movz(dst, (u_int32_t)imm_h[i], (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0x0L) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else if (neg_count == 1) {
      // one MOVN and two MOVKs will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else {
      // use a MOVZ and 3 MOVKs (makes it easier to debug)
      movz(dst, (u_int32_t)imm_h[0], 0);
      for (i = 1; i < 4; i++) {
        movk(dst, (u_int32_t)imm_h[i], (i << 4));
      }
    }
  }
}

// 32-bit analogue of mov_immediate64: ORRW for logical immediates,
// otherwise at most two of MOVZW/MOVNW/MOVKW.
void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
{
#ifndef PRODUCT
  {
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
    block_comment(buffer);
  }
#endif
  if (operand_valid_for_logical_immediate(true, imm32)) {
    orrw(dst, zr, imm32);
  } else {
    // we can use MOVZ, MOVN or two calls to MOVK to build up the
    // constant
    u_int32_t imm_h[2];
    imm_h[0] = imm32 & 0xffff;
    imm_h[1] = ((imm32 >> 16) & 0xffff);
    if (imm_h[0] == 0) {
      movzw(dst, imm_h[1], 16);
    } else if (imm_h[0] == 0xffff) {
      movnw(dst, imm_h[1] ^ 0xffff, 16);
    } else if (imm_h[1] == 0) {
      movzw(dst, imm_h[0], 0);
    } else if (imm_h[1] == 0xffff) {
      movnw(dst, imm_h[0] ^ 0xffff, 0);
    } else {
      // use a MOVZ and MOVK (makes it easier to debug)
      movzw(dst, imm_h[0], 0);
      movkw(dst, imm_h[1], 16);
    }
  }
}

// Form an address from base + offset in Rd.  Rd may or may
// not actually be used: you must use the Address that is returned.
// It is up to you to ensure that the shift provided matches the size
// of your data.
Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
  if (Address::offset_ok_for_immed(byte_offset, shift))
    // It fits; no need for any heroics
    return Address(base, byte_offset);

  // Don't do anything clever with negative or misaligned offsets
  unsigned mask = (1 << shift) - 1;
  if (byte_offset < 0 || byte_offset & mask) {
    mov(Rd, byte_offset);
    add(Rd, base, Rd);
    return Address(Rd);
  }

  // See if we can do this with two 12-bit offsets
  {
    unsigned long word_offset = byte_offset >> shift;
    unsigned long masked_offset = word_offset & 0xfff000;
    if (Address::offset_ok_for_immed(word_offset - masked_offset)
        && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
      add(Rd, base, masked_offset << shift);
      word_offset -= masked_offset;
      return Address(Rd, word_offset << shift);
    }
  }

  // Do it the hard way
  mov(Rd, byte_offset);
  add(Rd, base, Rd);
  return Address(Rd);
}

// Atomically increment the 32-bit counter at counter_addr: a single LDADD
// when LSE atomics are available, else an LDXR/STXR retry loop.
void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
  if (UseLSE) {
    mov(tmp, 1);
    ldadd(Assembler::word, tmp, zr, counter_addr);
    return;
  }
  Label retry_load;
  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
    prfm(Address(counter_addr), PSTL1STRM);
  bind(retry_load);
  // flush and load exclusive from the memory location
  ldxrw(tmp, counter_addr);
  addw(tmp, tmp, 1);
  // if we store+flush with no intervening write tmp2 will be zero
  stxrw(tmp2, tmp, counter_addr);
  cbnzw(tmp2, retry_load);
}


int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
                                    bool want_remainder, Register scratch)
{
  // Full implementation of Java idiv and irem.  The function
  // returns the (pc) offset of the div instruction - may be needed
  // for implicit exceptions.
  //
  // constraint : ra/rb =/= scratch
  //         normal case
  //
  // input : ra: dividend
  //         rb: divisor
  //
  // result: either
  //         quotient  (= ra idiv rb)
  //         remainder (= ra irem rb)

  assert(ra != scratch && rb != scratch, "reg cannot be scratch");

  int idivl_offset = offset();
  if (! want_remainder) {
    sdivw(result, ra, rb);
  } else {
    sdivw(scratch, ra, rb);
    // remainder = ra - (ra / rb) * rb
    Assembler::msubw(result, scratch, rb, ra);
  }

  return idivl_offset;
}

int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
                                    bool want_remainder, Register scratch)
{
  // Full implementation of Java ldiv and lrem.  The function
  // returns the (pc) offset of the div instruction - may be needed
  // for implicit exceptions.
  //
  // constraint : ra/rb =/= scratch
  //         normal case
  //
  // input : ra: dividend
  //         rb: divisor
  //
  // result: either
  //         quotient  (= ra idiv rb)
  //         remainder (= ra irem rb)

  assert(ra != scratch && rb != scratch, "reg cannot be scratch");

  int idivq_offset = offset();
  if (!
want_remainder) { 1852 sdiv(result, ra, rb); 1853 } else { 1854 sdiv(scratch, ra, rb); 1855 Assembler::msub(result, scratch, rb, ra); 1856 } 1857 1858 return idivq_offset; 1859 } 1860 1861 void MacroAssembler::membar(Membar_mask_bits order_constraint) { 1862 address prev = pc() - NativeMembar::instruction_size; 1863 address last = code()->last_insn(); 1864 if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) { 1865 NativeMembar *bar = NativeMembar_at(prev); 1866 // We are merging two memory barrier instructions. On AArch64 we 1867 // can do this simply by ORing them together. 1868 bar->set_kind(bar->get_kind() | order_constraint); 1869 BLOCK_COMMENT("merged membar"); 1870 } else { 1871 code()->set_last_insn(pc()); 1872 dmb(Assembler::barrier(order_constraint)); 1873 } 1874 } 1875 1876 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) { 1877 if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) { 1878 merge_ldst(rt, adr, size_in_bytes, is_store); 1879 code()->clear_last_insn(); 1880 return true; 1881 } else { 1882 assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported."); 1883 const unsigned mask = size_in_bytes - 1; 1884 if (adr.getMode() == Address::base_plus_offset && 1885 (adr.offset() & mask) == 0) { // only supports base_plus_offset. 1886 code()->set_last_insn(pc()); 1887 } 1888 return false; 1889 } 1890 } 1891 1892 void MacroAssembler::ldr(Register Rx, const Address &adr) { 1893 // We always try to merge two adjacent loads into one ldp. 1894 if (!try_merge_ldst(Rx, adr, 8, false)) { 1895 Assembler::ldr(Rx, adr); 1896 } 1897 } 1898 1899 void MacroAssembler::ldrw(Register Rw, const Address &adr) { 1900 // We always try to merge two adjacent loads into one ldp. 
1901 if (!try_merge_ldst(Rw, adr, 4, false)) { 1902 Assembler::ldrw(Rw, adr); 1903 } 1904 } 1905 1906 void MacroAssembler::str(Register Rx, const Address &adr) { 1907 // We always try to merge two adjacent stores into one stp. 1908 if (!try_merge_ldst(Rx, adr, 8, true)) { 1909 Assembler::str(Rx, adr); 1910 } 1911 } 1912 1913 void MacroAssembler::strw(Register Rw, const Address &adr) { 1914 // We always try to merge two adjacent stores into one stp. 1915 if (!try_merge_ldst(Rw, adr, 4, true)) { 1916 Assembler::strw(Rw, adr); 1917 } 1918 } 1919 1920 // MacroAssembler routines found actually to be needed 1921 1922 void MacroAssembler::push(Register src) 1923 { 1924 str(src, Address(pre(esp, -1 * wordSize))); 1925 } 1926 1927 void MacroAssembler::pop(Register dst) 1928 { 1929 ldr(dst, Address(post(esp, 1 * wordSize))); 1930 } 1931 1932 // Note: load_unsigned_short used to be called load_unsigned_word. 1933 int MacroAssembler::load_unsigned_short(Register dst, Address src) { 1934 int off = offset(); 1935 ldrh(dst, src); 1936 return off; 1937 } 1938 1939 int MacroAssembler::load_unsigned_byte(Register dst, Address src) { 1940 int off = offset(); 1941 ldrb(dst, src); 1942 return off; 1943 } 1944 1945 int MacroAssembler::load_signed_short(Register dst, Address src) { 1946 int off = offset(); 1947 ldrsh(dst, src); 1948 return off; 1949 } 1950 1951 int MacroAssembler::load_signed_byte(Register dst, Address src) { 1952 int off = offset(); 1953 ldrsb(dst, src); 1954 return off; 1955 } 1956 1957 int MacroAssembler::load_signed_short32(Register dst, Address src) { 1958 int off = offset(); 1959 ldrshw(dst, src); 1960 return off; 1961 } 1962 1963 int MacroAssembler::load_signed_byte32(Register dst, Address src) { 1964 int off = offset(); 1965 ldrsbw(dst, src); 1966 return off; 1967 } 1968 1969 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) { 1970 switch (size_in_bytes) { 1971 case 8: ldr(dst, src); break; 1972 
case 4: ldrw(dst, src); break; 1973 case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break; 1974 case 1: is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break; 1975 default: ShouldNotReachHere(); 1976 } 1977 } 1978 1979 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) { 1980 switch (size_in_bytes) { 1981 case 8: str(src, dst); break; 1982 case 4: strw(src, dst); break; 1983 case 2: strh(src, dst); break; 1984 case 1: strb(src, dst); break; 1985 default: ShouldNotReachHere(); 1986 } 1987 } 1988 1989 void MacroAssembler::decrementw(Register reg, int value) 1990 { 1991 if (value < 0) { incrementw(reg, -value); return; } 1992 if (value == 0) { return; } 1993 if (value < (1 << 12)) { subw(reg, reg, value); return; } 1994 /* else */ { 1995 guarantee(reg != rscratch2, "invalid dst for register decrement"); 1996 movw(rscratch2, (unsigned)value); 1997 subw(reg, reg, rscratch2); 1998 } 1999 } 2000 2001 void MacroAssembler::decrement(Register reg, int value) 2002 { 2003 if (value < 0) { increment(reg, -value); return; } 2004 if (value == 0) { return; } 2005 if (value < (1 << 12)) { sub(reg, reg, value); return; } 2006 /* else */ { 2007 assert(reg != rscratch2, "invalid dst for register decrement"); 2008 mov(rscratch2, (unsigned long)value); 2009 sub(reg, reg, rscratch2); 2010 } 2011 } 2012 2013 void MacroAssembler::decrementw(Address dst, int value) 2014 { 2015 assert(!dst.uses(rscratch1), "invalid dst for address decrement"); 2016 if (dst.getMode() == Address::literal) { 2017 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 2018 lea(rscratch2, dst); 2019 dst = Address(rscratch2); 2020 } 2021 ldrw(rscratch1, dst); 2022 decrementw(rscratch1, value); 2023 strw(rscratch1, dst); 2024 } 2025 2026 void MacroAssembler::decrement(Address dst, int value) 2027 { 2028 assert(!dst.uses(rscratch1), "invalid address for decrement"); 2029 if 
(dst.getMode() == Address::literal) { 2030 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 2031 lea(rscratch2, dst); 2032 dst = Address(rscratch2); 2033 } 2034 ldr(rscratch1, dst); 2035 decrement(rscratch1, value); 2036 str(rscratch1, dst); 2037 } 2038 2039 void MacroAssembler::incrementw(Register reg, int value) 2040 { 2041 if (value < 0) { decrementw(reg, -value); return; } 2042 if (value == 0) { return; } 2043 if (value < (1 << 12)) { addw(reg, reg, value); return; } 2044 /* else */ { 2045 assert(reg != rscratch2, "invalid dst for register increment"); 2046 movw(rscratch2, (unsigned)value); 2047 addw(reg, reg, rscratch2); 2048 } 2049 } 2050 2051 void MacroAssembler::increment(Register reg, int value) 2052 { 2053 if (value < 0) { decrement(reg, -value); return; } 2054 if (value == 0) { return; } 2055 if (value < (1 << 12)) { add(reg, reg, value); return; } 2056 /* else */ { 2057 assert(reg != rscratch2, "invalid dst for register increment"); 2058 movw(rscratch2, (unsigned)value); 2059 add(reg, reg, rscratch2); 2060 } 2061 } 2062 2063 void MacroAssembler::incrementw(Address dst, int value) 2064 { 2065 assert(!dst.uses(rscratch1), "invalid dst for address increment"); 2066 if (dst.getMode() == Address::literal) { 2067 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 2068 lea(rscratch2, dst); 2069 dst = Address(rscratch2); 2070 } 2071 ldrw(rscratch1, dst); 2072 incrementw(rscratch1, value); 2073 strw(rscratch1, dst); 2074 } 2075 2076 void MacroAssembler::increment(Address dst, int value) 2077 { 2078 assert(!dst.uses(rscratch1), "invalid dst for address increment"); 2079 if (dst.getMode() == Address::literal) { 2080 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 2081 lea(rscratch2, dst); 2082 dst = Address(rscratch2); 2083 } 2084 ldr(rscratch1, dst); 2085 increment(rscratch1, value); 2086 str(rscratch1, dst); 2087 } 2088 2089 2090 void MacroAssembler::pusha() { 2091 
push(0x7fffffff, sp); 2092 } 2093 2094 void MacroAssembler::popa() { 2095 pop(0x7fffffff, sp); 2096 } 2097 2098 // Push lots of registers in the bit set supplied. Don't push sp. 2099 // Return the number of words pushed 2100 int MacroAssembler::push(unsigned int bitset, Register stack) { 2101 int words_pushed = 0; 2102 2103 // Scan bitset to accumulate register pairs 2104 unsigned char regs[32]; 2105 int count = 0; 2106 for (int reg = 0; reg <= 30; reg++) { 2107 if (1 & bitset) 2108 regs[count++] = reg; 2109 bitset >>= 1; 2110 } 2111 regs[count++] = zr->encoding_nocheck(); 2112 count &= ~1; // Only push an even nuber of regs 2113 2114 if (count) { 2115 stp(as_Register(regs[0]), as_Register(regs[1]), 2116 Address(pre(stack, -count * wordSize))); 2117 words_pushed += 2; 2118 } 2119 for (int i = 2; i < count; i += 2) { 2120 stp(as_Register(regs[i]), as_Register(regs[i+1]), 2121 Address(stack, i * wordSize)); 2122 words_pushed += 2; 2123 } 2124 2125 assert(words_pushed == count, "oops, pushed != count"); 2126 2127 return count; 2128 } 2129 2130 int MacroAssembler::pop(unsigned int bitset, Register stack) { 2131 int words_pushed = 0; 2132 2133 // Scan bitset to accumulate register pairs 2134 unsigned char regs[32]; 2135 int count = 0; 2136 for (int reg = 0; reg <= 30; reg++) { 2137 if (1 & bitset) 2138 regs[count++] = reg; 2139 bitset >>= 1; 2140 } 2141 regs[count++] = zr->encoding_nocheck(); 2142 count &= ~1; 2143 2144 for (int i = 2; i < count; i += 2) { 2145 ldp(as_Register(regs[i]), as_Register(regs[i+1]), 2146 Address(stack, i * wordSize)); 2147 words_pushed += 2; 2148 } 2149 if (count) { 2150 ldp(as_Register(regs[0]), as_Register(regs[1]), 2151 Address(post(stack, count * wordSize))); 2152 words_pushed += 2; 2153 } 2154 2155 assert(words_pushed == count, "oops, pushed != count"); 2156 2157 return count; 2158 } 2159 #ifdef ASSERT 2160 void MacroAssembler::verify_heapbase(const char* msg) { 2161 #if 0 2162 assert (UseCompressedOops || UseCompressedClassPointers, 
"should be compressed"); 2163 assert (Universe::heap() != NULL, "java heap should be initialized"); 2164 if (CheckCompressedOops) { 2165 Label ok; 2166 push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1 2167 cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr())); 2168 br(Assembler::EQ, ok); 2169 stop(msg); 2170 bind(ok); 2171 pop(1 << rscratch1->encoding(), sp); 2172 } 2173 #endif 2174 } 2175 #endif 2176 2177 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) { 2178 Label done, not_weak; 2179 cbz(value, done); // Use NULL as-is. 2180 2181 STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u); 2182 tbz(r0, 0, not_weak); // Test for jweak tag. 2183 2184 // Resolve jweak. 2185 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value, 2186 Address(value, -JNIHandles::weak_tag_value), tmp, thread); 2187 verify_oop(value); 2188 b(done); 2189 2190 bind(not_weak); 2191 // Resolve (untagged) jobject. 2192 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread); 2193 verify_oop(value); 2194 bind(done); 2195 } 2196 2197 void MacroAssembler::stop(const char* msg) { 2198 address ip = pc(); 2199 pusha(); 2200 mov(c_rarg0, (address)msg); 2201 mov(c_rarg1, (address)ip); 2202 mov(c_rarg2, sp); 2203 mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 2204 // call(c_rarg3); 2205 blrt(c_rarg3, 3, 0, 1); 2206 hlt(0); 2207 } 2208 2209 void MacroAssembler::warn(const char* msg) { 2210 pusha(); 2211 mov(c_rarg0, (address)msg); 2212 mov(lr, CAST_FROM_FN_PTR(address, warning)); 2213 blrt(lr, 1, 0, MacroAssembler::ret_type_void); 2214 popa(); 2215 } 2216 2217 void MacroAssembler::unimplemented(const char* what) { 2218 const char* buf = NULL; 2219 { 2220 ResourceMark rm; 2221 stringStream ss; 2222 ss.print("unimplemented: %s", what); 2223 buf = code_string(ss.as_string()); 2224 } 2225 stop(buf); 2226 } 2227 2228 // If a constant does not fit in an immediate field, generate some 2229 // 
number of MOV instructions and then perform the operation. 2230 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm, 2231 add_sub_imm_insn insn1, 2232 add_sub_reg_insn insn2) { 2233 assert(Rd != zr, "Rd = zr and not setting flags?"); 2234 if (operand_valid_for_add_sub_immediate((int)imm)) { 2235 (this->*insn1)(Rd, Rn, imm); 2236 } else { 2237 if (uabs(imm) < (1 << 24)) { 2238 (this->*insn1)(Rd, Rn, imm & -(1 << 12)); 2239 (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1)); 2240 } else { 2241 assert_different_registers(Rd, Rn); 2242 mov(Rd, (uint64_t)imm); 2243 (this->*insn2)(Rd, Rn, Rd, LSL, 0); 2244 } 2245 } 2246 } 2247 2248 // Seperate vsn which sets the flags. Optimisations are more restricted 2249 // because we must set the flags correctly. 2250 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm, 2251 add_sub_imm_insn insn1, 2252 add_sub_reg_insn insn2) { 2253 if (operand_valid_for_add_sub_immediate((int)imm)) { 2254 (this->*insn1)(Rd, Rn, imm); 2255 } else { 2256 assert_different_registers(Rd, Rn); 2257 assert(Rd != zr, "overflow in immediate operand"); 2258 mov(Rd, (uint64_t)imm); 2259 (this->*insn2)(Rd, Rn, Rd, LSL, 0); 2260 } 2261 } 2262 2263 2264 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) { 2265 if (increment.is_register()) { 2266 add(Rd, Rn, increment.as_register()); 2267 } else { 2268 add(Rd, Rn, increment.as_constant()); 2269 } 2270 } 2271 2272 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) { 2273 if (increment.is_register()) { 2274 addw(Rd, Rn, increment.as_register()); 2275 } else { 2276 addw(Rd, Rn, increment.as_constant()); 2277 } 2278 } 2279 2280 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) { 2281 if (decrement.is_register()) { 2282 sub(Rd, Rn, decrement.as_register()); 2283 } else { 2284 sub(Rd, Rn, decrement.as_constant()); 2285 } 2286 } 2287 2288 void MacroAssembler::subw(Register 
Rd, Register Rn, RegisterOrConstant decrement) { 2289 if (decrement.is_register()) { 2290 subw(Rd, Rn, decrement.as_register()); 2291 } else { 2292 subw(Rd, Rn, decrement.as_constant()); 2293 } 2294 } 2295 2296 void MacroAssembler::reinit_heapbase() 2297 { 2298 if (UseCompressedOops) { 2299 if (Universe::is_fully_initialized()) { 2300 mov(rheapbase, Universe::narrow_ptrs_base()); 2301 } else { 2302 lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr())); 2303 ldr(rheapbase, Address(rheapbase)); 2304 } 2305 } 2306 } 2307 2308 // this simulates the behaviour of the x86 cmpxchg instruction using a 2309 // load linked/store conditional pair. we use the acquire/release 2310 // versions of these instructions so that we flush pending writes as 2311 // per Java semantics. 2312 2313 // n.b the x86 version assumes the old value to be compared against is 2314 // in rax and updates rax with the value located in memory if the 2315 // cmpxchg fails. we supply a register for the old value explicitly 2316 2317 // the aarch64 load linked/store conditional instructions do not 2318 // accept an offset. so, unlike x86, we must provide a plain register 2319 // to identify the memory word to be compared/exchanged rather than a 2320 // register+offset Address. 
// Compare-and-exchange the 64-bit word at [addr] against oldv,
// installing newv on success.  Branches to `succeed` on success; on
// failure the current memory value is left in oldv and control either
// falls through or branches to *fail (when fail is non-NULL).
// tmp is clobbered.
void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
                                Label &succeed, Label *fail) {
  // oldv holds comparison value
  // newv holds value to write in exchange
  // addr identifies memory word to compare against/update
  if (UseLSE) {
    // Single CASAL instruction; oldv is updated with the memory value.
    mov(tmp, oldv);
    casal(Assembler::xword, oldv, newv, addr);
    cmp(tmp, oldv);
    br(Assembler::EQ, succeed);
    membar(AnyAny);
  } else {
    Label retry_load, nope;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    // flush and load exclusive from the memory location
    // and fail if it is not what we expect
    ldaxr(tmp, addr);
    cmp(tmp, oldv);
    br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
    stlxr(tmp, newv, addr);
    cbzw(tmp, succeed);
    // retry so we only ever return after a load fails to compare
    // ensures we don't return a stale value after a failed write.
    b(retry_load);
    // if the memory word differs we return it in oldv and signal a fail
    bind(nope);
    membar(AnyAny);
    mov(oldv, tmp);
  }
  if (fail)
    b(*fail);
}

// CAS on an object's mark word (header at offset 0).
void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
                                        Label &succeed, Label *fail) {
  assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
  cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
}

// 32-bit variant of cmpxchgptr; same register/label protocol.
void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
                              Label &succeed, Label *fail) {
  // oldv holds comparison value
  // newv holds value to write in exchange
  // addr identifies memory word to compare against/update
  // tmp returns 0/1 for success/failure
  if (UseLSE) {
    mov(tmp, oldv);
    casal(Assembler::word, oldv, newv, addr);
    cmp(tmp, oldv);
    br(Assembler::EQ, succeed);
    membar(AnyAny);
  } else {
    Label retry_load, nope;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    // flush and load exclusive from the memory location
    // and fail if it is not what we expect
    ldaxrw(tmp, addr);
    cmp(tmp, oldv);
    br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
    stlxrw(tmp, newv, addr);
    cbzw(tmp, succeed);
    // retry so we only ever return after a load fails to compare
    // ensures we don't return a stale value after a failed write.
    b(retry_load);
    // if the memory word differs we return it in oldv and signal a fail
    bind(nope);
    membar(AnyAny);
    mov(oldv, tmp);
  }
  if (fail)
    b(*fail);
}

// A generic CAS; success or failure is in the EQ flag.  A weak CAS
// doesn't retry and may fail spuriously.  If the oldval is wanted,
// Pass a register for the result, otherwise pass noreg.
// Clobbers rscratch1
void MacroAssembler::cmpxchg(Register addr, Register expected,
                             Register new_val,
                             enum operand_size size,
                             bool acquire, bool release,
                             bool weak,
                             Register result) {
  if (result == noreg)  result = rscratch1;
  BLOCK_COMMENT("cmpxchg {");
  if (UseLSE) {
    // Single LSE CAS; result receives the old memory value.
    mov(result, expected);
    lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
    compare_eq(result, expected, size);
  } else {
    Label retry_load, done;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    load_exclusive(result, addr, size, acquire);
    compare_eq(result, expected, size);
    br(Assembler::NE, done);
    store_exclusive(rscratch1, new_val, addr, size, release);
    if (weak) {
      cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
    } else {
      cbnzw(rscratch1, retry_load);
    }
    bind(done);
  }
  BLOCK_COMMENT("} cmpxchg");
}

// A generic comparison. Only compares for equality, clobbers rscratch1.
void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) {
  if (size == xword) {
    cmp(rm, rn);
  } else if (size == word) {
    cmpw(rm, rn);
  } else if (size == halfword) {
    // No sub-word compare instruction: XOR and test the low bits.
    eorw(rscratch1, rm, rn);
    ands(zr, rscratch1, 0xffff);
  } else if (size == byte) {
    eorw(rscratch1, rm, rn);
    ands(zr, rscratch1, 0xff);
  } else {
    ShouldNotReachHere();
  }
}

// True iff `a` can safely serve as the LL/SC result register, i.e. it
// aliases neither the operand `b` nor the address register `c`.
static bool different(Register a, RegisterOrConstant b, Register c) {
  if (b.is_constant())
    return a != c;
  else
    return a != b.as_register() && a != c && b.as_register() != c;
}

// Atomic fetch-and-add.  NAME names the generated method; LDXR/STXR
// are the exclusive pair, OP the forward op, IOP its inverse (used to
// reconstruct the previous value when prev aliased an input), AOP the
// LSE instruction, sz the operand size.  prev (may be noreg) receives
// the pre-update value.  Clobbers rscratch1/rscratch2.
#define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
  if (UseLSE) {                                                         \
    prev = prev->is_valid() ? prev : zr;                                \
    if (incr.is_register()) {                                           \
      AOP(sz, incr.as_register(), prev, addr);                          \
    } else {                                                            \
      mov(rscratch2, incr.as_constant());                               \
      AOP(sz, rscratch2, prev, addr);                                   \
    }                                                                   \
    return;                                                             \
  }                                                                     \
  Register result = rscratch2;                                          \
  if (prev->is_valid())                                                 \
    result = different(prev, incr, addr) ? prev : rscratch2;            \
                                                                        \
  Label retry_load;                                                     \
  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
    prfm(Address(addr), PSTL1STRM);                                     \
  bind(retry_load);                                                     \
  LDXR(result, addr);                                                   \
  OP(rscratch1, result, incr);                                          \
  STXR(rscratch2, rscratch1, addr);                                     \
  cbnzw(rscratch2, retry_load);                                         \
  if (prev->is_valid() && prev != result) {                             \
    IOP(prev, rscratch1, incr);                                         \
  }                                                                     \
}

ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)

#undef ATOMIC_OP

// Atomic exchange: prev (may be noreg) receives the previous memory
// value, newv is stored.  Clobbers rscratch1/rscratch2.
#define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
  if (UseLSE) {                                                         \
    prev = prev->is_valid() ? prev : zr;                                \
    AOP(sz, newv, prev, addr);                                          \
    return;                                                             \
  }                                                                     \
  Register result = rscratch2;                                          \
  if (prev->is_valid())                                                 \
    result = different(prev, newv, addr) ? prev : rscratch2;            \
                                                                        \
  Label retry_load;                                                     \
  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
    prfm(Address(addr), PSTL1STRM);                                     \
  bind(retry_load);                                                     \
  LDXR(result, addr);                                                   \
  STXR(rscratch1, newv, addr);                                          \
  cbnzw(rscratch1, retry_load);                                         \
  if (prev->is_valid() && prev != result)                               \
    mov(prev, result);                                                  \
}

ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)

#undef ATOMIC_XCHG

#ifndef PRODUCT
extern "C" void findpc(intptr_t x);
#endif

// Runtime entry reached from stop(): optionally shows a message box
// and dumps the register file captured by pusha() in stop().
void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
{
  // In order to get locks to work, we need to fake an in_VM state
  if (ShowMessageBoxOnError) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
#ifndef PRODUCT
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
#endif
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      ttyLocker ttyl;
      tty->print_cr(" pc = 0x%016lx", pc);
#ifndef PRODUCT
      tty->cr();
      findpc(pc);
      tty->cr();
#endif
      tty->print_cr(" r0 = 0x%016lx", regs[0]);
      tty->print_cr(" r1 = 0x%016lx", regs[1]);
      tty->print_cr(" r2 = 0x%016lx", regs[2]);
      tty->print_cr(" r3 = 0x%016lx", regs[3]);
      tty->print_cr(" r4 = 0x%016lx", regs[4]);
      tty->print_cr(" r5 = 0x%016lx", regs[5]);
      tty->print_cr(" r6 = 0x%016lx", regs[6]);
      tty->print_cr(" r7 = 0x%016lx", regs[7]);
      tty->print_cr(" r8 = 0x%016lx", regs[8]);
      tty->print_cr(" r9 = 0x%016lx", regs[9]);
      tty->print_cr("r10 = 0x%016lx", regs[10]);
      tty->print_cr("r11 = 0x%016lx", regs[11]);
      tty->print_cr("r12 = 0x%016lx", regs[12]);
      tty->print_cr("r13 = 0x%016lx", regs[13]);
      tty->print_cr("r14 = 0x%016lx", regs[14]);
      tty->print_cr("r15 = 0x%016lx", regs[15]);
      tty->print_cr("r16 = 0x%016lx", regs[16]);
      tty->print_cr("r17 = 0x%016lx", regs[17]);
      tty->print_cr("r18 = 0x%016lx", regs[18]);
      tty->print_cr("r19 = 0x%016lx", regs[19]);
      tty->print_cr("r20 = 0x%016lx", regs[20]);
      tty->print_cr("r21 = 0x%016lx", regs[21]);
      tty->print_cr("r22 = 0x%016lx", regs[22]);
      tty->print_cr("r23 = 0x%016lx", regs[23]);
      tty->print_cr("r24 = 0x%016lx", regs[24]);
      tty->print_cr("r25 = 0x%016lx", regs[25]);
      tty->print_cr("r26 = 0x%016lx", regs[26]);
      tty->print_cr("r27 = 0x%016lx", regs[27]);
      tty->print_cr("r28 = 0x%016lx", regs[28]);
      tty->print_cr("r30 = 0x%016lx", regs[30]);
      tty->print_cr("r31 = 0x%016lx", regs[31]);
      BREAKPOINT;
    }
    ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
  } else {
    ttyLocker ttyl;
    ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
                    msg);
    assert(false, "DEBUG MESSAGE: %s", msg);
  }
}

#ifdef BUILTIN_SIM
// routine to generate an x86 prolog for a stub function which
// bootstraps into the generated ARM code which directly follows the
// stub
//
// the argument encodes the number of general and fp registers
// passed by the caller and the calling convention (currently just
// the number of general registers and assumes C argument passing)

extern "C" {
  int aarch64_stub_prolog_size();
  void aarch64_stub_prolog();
  void aarch64_prolog();
}

void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type,
                                   address *prolog_ptr)
{
  int calltype = (((ret_type & 0x3) << 8) |
                  ((fp_arg_count & 0xf) << 4) |
                  (gp_arg_count & 0xf));

  // the addresses for the x86 to ARM entry code we need to use
  address start = pc();
  // printf("start = %lx\n", start);
  int byteCount = aarch64_stub_prolog_size();
  // printf("byteCount = %x\n", byteCount);
  int instructionCount = (byteCount + 3)/ 4;
  // printf("instructionCount = %x\n", instructionCount);
  // reserve space with nops, then overwrite with the x86 prolog bytes
  for (int i = 0; i < instructionCount; i++) {
    nop();
  }

  memcpy(start, (void*)aarch64_stub_prolog, byteCount);

  // write the address of the setup routine and the call format at the
  // end of into the copied code
  u_int64_t *patch_end = (u_int64_t *)(start + byteCount);
  if (prolog_ptr)
    patch_end[-2] = (u_int64_t)prolog_ptr;
  patch_end[-1] = calltype;
}
#endif

// Save all registers a C call may clobber: r0-r18 (minus the scratch
// registers) and the call-clobbered SIMD registers v0-v7, v16-v31.
void MacroAssembler::push_call_clobbered_registers() {
  int step = 4 * wordSize;
  push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
  sub(sp, sp, step);
  mov(rscratch1, -step);
  // Push v0-v7, v16-v31.
  for (int i = 31; i>= 4; i -= 4) {
    if (i <= v7->encoding() || i >= v16->encoding())
      st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
          as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
  }
  st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
      as_FloatRegister(3), T1D, Address(sp));
}

// Inverse of push_call_clobbered_registers().
void MacroAssembler::pop_call_clobbered_registers() {
  for (int i = 0; i < 32; i += 4) {
    if (i <= v7->encoding() || i >= v16->encoding())
      ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
          as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
  }

  pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
}

// Save the full CPU state (integer regs except lr/sp, plus v0-v31 as
// 64- or 128-bit values depending on save_vectors).
void MacroAssembler::push_CPU_state(bool save_vectors) {
  int step = (save_vectors ? 8 : 4) * wordSize;
  push(0x3fffffff, sp);         // integer registers except lr & sp
  mov(rscratch1, -step);
  sub(sp, sp, step);
  for (int i = 28; i >= 4; i -= 4) {
    st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
        as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
  }
  st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
}

// Inverse of push_CPU_state(); restore_vectors must match the value
// passed when saving.
void MacroAssembler::pop_CPU_state(bool restore_vectors) {
  int step = (restore_vectors ? 8 : 4) * wordSize;
  for (int i = 0; i <= 28; i += 4)
    ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
        as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
  pop(0x3fffffff, sp);          // integer registers except lr & sp
}

/**
 * Helpers for multiply_to_len().
 */
// (dest_hi:dest_lo) += src1 + src2, with the final high word written
// to final_dest_hi (carry propagated through adc).
void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
                                     Register src1, Register src2) {
  adds(dest_lo, dest_lo, src1);
  adc(dest_hi, dest_hi, zr);
  adds(dest_lo, dest_lo, src2);
  adc(final_dest_hi, dest_hi, zr);
}

// Generate an address from (r + r1 extend offset).  "size" is the
// size of the operand.  The result may be in rscratch2.
Address MacroAssembler::offsetted_address(Register r, Register r1,
                                          Address::extend ext, int offset, int size) {
  if (offset || (ext.shift() % size != 0)) {
    lea(rscratch2, Address(r, r1, ext));
    return Address(rscratch2, offset);
  } else {
    return Address(r, r1, ext);
  }
}

// Build an sp-relative Address for a spill slot, materializing part of
// the offset into tmp when it is out of immediate range.
Address MacroAssembler::spill_address(int size, int offset, Register tmp)
{
  assert(offset >= 0, "spill to negative address?");
  // Offset reachable ?
  //   Not aligned - 9 bits signed offset
  //   Aligned - 12 bits unsigned offset shifted
  Register base = sp;
  if ((offset & (size-1)) && offset >= (1<<8)) {
    add(tmp, base, offset & ((1<<12)-1));
    base = tmp;
    offset &= -1<<12;
  }

  if (offset >= (1<<12) * size) {
    add(tmp, base, offset & (((1<<12)-1)<<12));
    base = tmp;
    offset &= ~(((1<<12)-1)<<12);
  }

  return Address(base, offset);
}

// Checks whether offset is aligned.
// Returns true if it is, else false.
bool MacroAssembler::merge_alignment_check(Register base,
                                           size_t size,
                                           long cur_offset,
                                           long prev_offset) const {
  if (AvoidUnalignedAccesses) {
    if (base == sp) {
      // Checks whether low offset is aligned to pair of registers.
      long pair_mask = size * 2 - 1;
      long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
      return (offset & pair_mask) == 0;
    } else { // If base is not sp, we can't guarantee the access is aligned.
      return false;
    }
  } else {
    long mask = size - 1;
    // Load/store pair instruction only supports element size aligned offset.
    return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
  }
}

// Checks whether current and previous loads/stores can be merged.
// Returns true if it can be merged, else false.
2752 bool MacroAssembler::ldst_can_merge(Register rt, 2753 const Address &adr, 2754 size_t cur_size_in_bytes, 2755 bool is_store) const { 2756 address prev = pc() - NativeInstruction::instruction_size; 2757 address last = code()->last_insn(); 2758 2759 if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) { 2760 return false; 2761 } 2762 2763 if (adr.getMode() != Address::base_plus_offset || prev != last) { 2764 return false; 2765 } 2766 2767 NativeLdSt* prev_ldst = NativeLdSt_at(prev); 2768 size_t prev_size_in_bytes = prev_ldst->size_in_bytes(); 2769 2770 assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging."); 2771 assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging."); 2772 2773 if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) { 2774 return false; 2775 } 2776 2777 long max_offset = 63 * prev_size_in_bytes; 2778 long min_offset = -64 * prev_size_in_bytes; 2779 2780 assert(prev_ldst->is_not_pre_post_index(), "pre-index or post-index is not supported to be merged."); 2781 2782 // Only same base can be merged. 2783 if (adr.base() != prev_ldst->base()) { 2784 return false; 2785 } 2786 2787 long cur_offset = adr.offset(); 2788 long prev_offset = prev_ldst->offset(); 2789 size_t diff = abs(cur_offset - prev_offset); 2790 if (diff != prev_size_in_bytes) { 2791 return false; 2792 } 2793 2794 // Following cases can not be merged: 2795 // ldr x2, [x2, #8] 2796 // ldr x3, [x2, #16] 2797 // or: 2798 // ldr x2, [x3, #8] 2799 // ldr x2, [x3, #16] 2800 // If t1 and t2 is the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL. 2801 if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) { 2802 return false; 2803 } 2804 2805 long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset; 2806 // Offset range must be in ldp/stp instruction's range. 
2807 if (low_offset > max_offset || low_offset < min_offset) { 2808 return false; 2809 } 2810 2811 if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) { 2812 return true; 2813 } 2814 2815 return false; 2816 } 2817 2818 // Merge current load/store with previous load/store into ldp/stp. 2819 void MacroAssembler::merge_ldst(Register rt, 2820 const Address &adr, 2821 size_t cur_size_in_bytes, 2822 bool is_store) { 2823 2824 assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store) == true, "cur and prev must be able to be merged."); 2825 2826 Register rt_low, rt_high; 2827 address prev = pc() - NativeInstruction::instruction_size; 2828 NativeLdSt* prev_ldst = NativeLdSt_at(prev); 2829 2830 long offset; 2831 2832 if (adr.offset() < prev_ldst->offset()) { 2833 offset = adr.offset(); 2834 rt_low = rt; 2835 rt_high = prev_ldst->target(); 2836 } else { 2837 offset = prev_ldst->offset(); 2838 rt_low = prev_ldst->target(); 2839 rt_high = rt; 2840 } 2841 2842 Address adr_p = Address(prev_ldst->base(), offset); 2843 // Overwrite previous generated binary. 2844 code_section()->set_end(prev); 2845 2846 const int sz = prev_ldst->size_in_bytes(); 2847 assert(sz == 8 || sz == 4, "only supports 64/32bit merging."); 2848 if (!is_store) { 2849 BLOCK_COMMENT("merged ldr pair"); 2850 if (sz == 8) { 2851 ldp(rt_low, rt_high, adr_p); 2852 } else { 2853 ldpw(rt_low, rt_high, adr_p); 2854 } 2855 } else { 2856 BLOCK_COMMENT("merged str pair"); 2857 if (sz == 8) { 2858 stp(rt_low, rt_high, adr_p); 2859 } else { 2860 stpw(rt_low, rt_high, adr_p); 2861 } 2862 } 2863 } 2864 2865 /** 2866 * Multiply 64 bit by 64 bit first loop. 
2867 */ 2868 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart, 2869 Register y, Register y_idx, Register z, 2870 Register carry, Register product, 2871 Register idx, Register kdx) { 2872 // 2873 // jlong carry, x[], y[], z[]; 2874 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) { 2875 // huge_128 product = y[idx] * x[xstart] + carry; 2876 // z[kdx] = (jlong)product; 2877 // carry = (jlong)(product >>> 64); 2878 // } 2879 // z[xstart] = carry; 2880 // 2881 2882 Label L_first_loop, L_first_loop_exit; 2883 Label L_one_x, L_one_y, L_multiply; 2884 2885 subsw(xstart, xstart, 1); 2886 br(Assembler::MI, L_one_x); 2887 2888 lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt))); 2889 ldr(x_xstart, Address(rscratch1)); 2890 ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian 2891 2892 bind(L_first_loop); 2893 subsw(idx, idx, 1); 2894 br(Assembler::MI, L_first_loop_exit); 2895 subsw(idx, idx, 1); 2896 br(Assembler::MI, L_one_y); 2897 lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt))); 2898 ldr(y_idx, Address(rscratch1)); 2899 ror(y_idx, y_idx, 32); // convert big-endian to little-endian 2900 bind(L_multiply); 2901 2902 // AArch64 has a multiply-accumulate instruction that we can't use 2903 // here because it has no way to process carries, so we have to use 2904 // separate add and adc instructions. Bah. 
2905 umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product 2906 mul(product, x_xstart, y_idx); 2907 adds(product, product, carry); 2908 adc(carry, rscratch1, zr); // x_xstart * y_idx + carry -> carry:product 2909 2910 subw(kdx, kdx, 2); 2911 ror(product, product, 32); // back to big-endian 2912 str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong)); 2913 2914 b(L_first_loop); 2915 2916 bind(L_one_y); 2917 ldrw(y_idx, Address(y, 0)); 2918 b(L_multiply); 2919 2920 bind(L_one_x); 2921 ldrw(x_xstart, Address(x, 0)); 2922 b(L_first_loop); 2923 2924 bind(L_first_loop_exit); 2925 } 2926 2927 /** 2928 * Multiply 128 bit by 128. Unrolled inner loop. 2929 * 2930 */ 2931 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z, 2932 Register carry, Register carry2, 2933 Register idx, Register jdx, 2934 Register yz_idx1, Register yz_idx2, 2935 Register tmp, Register tmp3, Register tmp4, 2936 Register tmp6, Register product_hi) { 2937 2938 // jlong carry, x[], y[], z[]; 2939 // int kdx = ystart+1; 2940 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 2941 // huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry; 2942 // jlong carry2 = (jlong)(tmp3 >>> 64); 2943 // huge_128 tmp4 = (y[idx] * product_hi) + z[kdx+idx] + carry2; 2944 // carry = (jlong)(tmp4 >>> 64); 2945 // z[kdx+idx+1] = (jlong)tmp3; 2946 // z[kdx+idx] = (jlong)tmp4; 2947 // } 2948 // idx += 2; 2949 // if (idx > 0) { 2950 // yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry; 2951 // z[kdx+idx] = (jlong)yz_idx1; 2952 // carry = (jlong)(yz_idx1 >>> 64); 2953 // } 2954 // 2955 2956 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 2957 2958 lsrw(jdx, idx, 2); 2959 2960 bind(L_third_loop); 2961 2962 subsw(jdx, jdx, 1); 2963 br(Assembler::MI, L_third_loop_exit); 2964 subw(idx, idx, 4); 2965 2966 lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt))); 2967 2968 ldp(yz_idx2, yz_idx1, Address(rscratch1, 0)); 2969 
2970 lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt))); 2971 2972 ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian 2973 ror(yz_idx2, yz_idx2, 32); 2974 2975 ldp(rscratch2, rscratch1, Address(tmp6, 0)); 2976 2977 mul(tmp3, product_hi, yz_idx1); // yz_idx1 * product_hi -> tmp4:tmp3 2978 umulh(tmp4, product_hi, yz_idx1); 2979 2980 ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian 2981 ror(rscratch2, rscratch2, 32); 2982 2983 mul(tmp, product_hi, yz_idx2); // yz_idx2 * product_hi -> carry2:tmp 2984 umulh(carry2, product_hi, yz_idx2); 2985 2986 // propagate sum of both multiplications into carry:tmp4:tmp3 2987 adds(tmp3, tmp3, carry); 2988 adc(tmp4, tmp4, zr); 2989 adds(tmp3, tmp3, rscratch1); 2990 adcs(tmp4, tmp4, tmp); 2991 adc(carry, carry2, zr); 2992 adds(tmp4, tmp4, rscratch2); 2993 adc(carry, carry, zr); 2994 2995 ror(tmp3, tmp3, 32); // convert little-endian to big-endian 2996 ror(tmp4, tmp4, 32); 2997 stp(tmp4, tmp3, Address(tmp6, 0)); 2998 2999 b(L_third_loop); 3000 bind (L_third_loop_exit); 3001 3002 andw (idx, idx, 0x3); 3003 cbz(idx, L_post_third_loop_done); 3004 3005 Label L_check_1; 3006 subsw(idx, idx, 2); 3007 br(Assembler::MI, L_check_1); 3008 3009 lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt))); 3010 ldr(yz_idx1, Address(rscratch1, 0)); 3011 ror(yz_idx1, yz_idx1, 32); 3012 mul(tmp3, product_hi, yz_idx1); // yz_idx1 * product_hi -> tmp4:tmp3 3013 umulh(tmp4, product_hi, yz_idx1); 3014 lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt))); 3015 ldr(yz_idx2, Address(rscratch1, 0)); 3016 ror(yz_idx2, yz_idx2, 32); 3017 3018 add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2); 3019 3020 ror(tmp3, tmp3, 32); 3021 str(tmp3, Address(rscratch1, 0)); 3022 3023 bind (L_check_1); 3024 3025 andw (idx, idx, 0x1); 3026 subsw(idx, idx, 1); 3027 br(Assembler::MI, L_post_third_loop_done); 3028 ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt))); 3029 mul(tmp3, tmp4, product_hi); // tmp4 * 
product_hi -> carry2:tmp3 3030 umulh(carry2, tmp4, product_hi); 3031 ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt))); 3032 3033 add2_with_carry(carry2, tmp3, tmp4, carry); 3034 3035 strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt))); 3036 extr(carry, carry2, tmp3, 32); 3037 3038 bind(L_post_third_loop_done); 3039 } 3040 3041 /** 3042 * Code for BigInteger::multiplyToLen() instrinsic. 3043 * 3044 * r0: x 3045 * r1: xlen 3046 * r2: y 3047 * r3: ylen 3048 * r4: z 3049 * r5: zlen 3050 * r10: tmp1 3051 * r11: tmp2 3052 * r12: tmp3 3053 * r13: tmp4 3054 * r14: tmp5 3055 * r15: tmp6 3056 * r16: tmp7 3057 * 3058 */ 3059 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, 3060 Register z, Register zlen, 3061 Register tmp1, Register tmp2, Register tmp3, Register tmp4, 3062 Register tmp5, Register tmp6, Register product_hi) { 3063 3064 assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6); 3065 3066 const Register idx = tmp1; 3067 const Register kdx = tmp2; 3068 const Register xstart = tmp3; 3069 3070 const Register y_idx = tmp4; 3071 const Register carry = tmp5; 3072 const Register product = xlen; 3073 const Register x_xstart = zlen; // reuse register 3074 3075 // First Loop. 
3076 // 3077 // final static long LONG_MASK = 0xffffffffL; 3078 // int xstart = xlen - 1; 3079 // int ystart = ylen - 1; 3080 // long carry = 0; 3081 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) { 3082 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry; 3083 // z[kdx] = (int)product; 3084 // carry = product >>> 32; 3085 // } 3086 // z[xstart] = (int)carry; 3087 // 3088 3089 movw(idx, ylen); // idx = ylen; 3090 movw(kdx, zlen); // kdx = xlen+ylen; 3091 mov(carry, zr); // carry = 0; 3092 3093 Label L_done; 3094 3095 movw(xstart, xlen); 3096 subsw(xstart, xstart, 1); 3097 br(Assembler::MI, L_done); 3098 3099 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx); 3100 3101 Label L_second_loop; 3102 cbzw(kdx, L_second_loop); 3103 3104 Label L_carry; 3105 subw(kdx, kdx, 1); 3106 cbzw(kdx, L_carry); 3107 3108 strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt))); 3109 lsr(carry, carry, 32); 3110 subw(kdx, kdx, 1); 3111 3112 bind(L_carry); 3113 strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt))); 3114 3115 // Second and third (nested) loops. 
3116 // 3117 // for (int i = xstart-1; i >= 0; i--) { // Second loop 3118 // carry = 0; 3119 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop 3120 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) + 3121 // (z[k] & LONG_MASK) + carry; 3122 // z[k] = (int)product; 3123 // carry = product >>> 32; 3124 // } 3125 // z[i] = (int)carry; 3126 // } 3127 // 3128 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi 3129 3130 const Register jdx = tmp1; 3131 3132 bind(L_second_loop); 3133 mov(carry, zr); // carry = 0; 3134 movw(jdx, ylen); // j = ystart+1 3135 3136 subsw(xstart, xstart, 1); // i = xstart-1; 3137 br(Assembler::MI, L_done); 3138 3139 str(z, Address(pre(sp, -4 * wordSize))); 3140 3141 Label L_last_x; 3142 lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j 3143 subsw(xstart, xstart, 1); // i = xstart-1; 3144 br(Assembler::MI, L_last_x); 3145 3146 lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt))); 3147 ldr(product_hi, Address(rscratch1)); 3148 ror(product_hi, product_hi, 32); // convert big-endian to little-endian 3149 3150 Label L_third_loop_prologue; 3151 bind(L_third_loop_prologue); 3152 3153 str(ylen, Address(sp, wordSize)); 3154 stp(x, xstart, Address(sp, 2 * wordSize)); 3155 multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product, 3156 tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi); 3157 ldp(z, ylen, Address(post(sp, 2 * wordSize))); 3158 ldp(x, xlen, Address(post(sp, 2 * wordSize))); // copy old xstart -> xlen 3159 3160 addw(tmp3, xlen, 1); 3161 strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt))); 3162 subsw(tmp3, tmp3, 1); 3163 br(Assembler::MI, L_done); 3164 3165 lsr(carry, carry, 32); 3166 strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt))); 3167 b(L_second_loop); 3168 3169 // Next infrequent code is moved outside loops. 
3170 bind(L_last_x); 3171 ldrw(product_hi, Address(x, 0)); 3172 b(L_third_loop_prologue); 3173 3174 bind(L_done); 3175 } 3176 3177 // Code for BigInteger::mulAdd instrinsic 3178 // out = r0 3179 // in = r1 3180 // offset = r2 (already out.length-offset) 3181 // len = r3 3182 // k = r4 3183 // 3184 // pseudo code from java implementation: 3185 // carry = 0; 3186 // offset = out.length-offset - 1; 3187 // for (int j=len-1; j >= 0; j--) { 3188 // product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry; 3189 // out[offset--] = (int)product; 3190 // carry = product >>> 32; 3191 // } 3192 // return (int)carry; 3193 void MacroAssembler::mul_add(Register out, Register in, Register offset, 3194 Register len, Register k) { 3195 Label LOOP, END; 3196 // pre-loop 3197 cmp(len, zr); // cmp, not cbz/cbnz: to use condition twice => less branches 3198 csel(out, zr, out, Assembler::EQ); 3199 br(Assembler::EQ, END); 3200 add(in, in, len, LSL, 2); // in[j+1] address 3201 add(offset, out, offset, LSL, 2); // out[offset + 1] address 3202 mov(out, zr); // used to keep carry now 3203 BIND(LOOP); 3204 ldrw(rscratch1, Address(pre(in, -4))); 3205 madd(rscratch1, rscratch1, k, out); 3206 ldrw(rscratch2, Address(pre(offset, -4))); 3207 add(rscratch1, rscratch1, rscratch2); 3208 strw(rscratch1, Address(offset)); 3209 lsr(out, rscratch1, 32); 3210 subs(len, len, 1); 3211 br(Assembler::NE, LOOP); 3212 BIND(END); 3213 } 3214 3215 /** 3216 * Emits code to update CRC-32 with a byte value according to constants in table 3217 * 3218 * @param [in,out]crc Register containing the crc. 3219 * @param [in]val Register containing the byte to fold into the CRC. 3220 * @param [in]table Register containing the table of crc constants. 
 *
 * uint32_t crc;
 * val = crc_table[(val ^ crc) & 0xFF];
 * crc = val ^ (crc >> 8);
 *
 */
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  eor(val, val, crc);
  andr(val, val, 0xff);
  ldrw(val, Address(table, val, Address::lsl(2)));
  eor(crc, val, crc, Assembler::LSR, 8);
}

/**
 * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]v         Register containing the 32-bit to fold into the CRC.
 * @param [in]table0    Register containing table 0 of crc constants.
 * @param [in]table1    Register containing table 1 of crc constants.
 * @param [in]table2    Register containing table 2 of crc constants.
 * @param [in]table3    Register containing table 3 of crc constants.
 *
 * uint32_t crc;
 *   v = crc ^ v
 *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
 *
 */
void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
        Register table0, Register table1, Register table2, Register table3,
        bool upper) {
  // With upper == true, fold in the high word of v (v >> 32); with
  // upper == false, fold in the low word (LSL #0 is a no-op shift).
  eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
  uxtb(tmp, v);
  ldrw(crc, Address(table3, tmp, Address::lsl(2)));
  ubfx(tmp, v, 8, 8);
  ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
  ubfx(tmp, v, 16, 8);
  ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
  ubfx(tmp, v, 24, 8);
  ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
}

// CRC-32 using the hardware crc32x/crc32w/crc32b instructions.
// Processes 64-byte chunks in a software-pipelined main loop (loads for
// the next iteration are interleaved with crc updates for the current
// one), then 32-, 4- and 1-byte tails.  The crc is bit-inverted on
// entry and exit, per the zlib CRC-32 convention.
void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
        Register len, Register tmp0, Register tmp1, Register tmp2,
        Register tmp3) {
  Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
  assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);

  mvnw(crc, crc);

  subs(len, len, 128);
  br(Assembler::GE, CRC_by64_pre);
  BIND(CRC_less64);
  adds(len, len, 128-32);
  br(Assembler::GE, CRC_by32_loop);
  BIND(CRC_less32);
  adds(len, len, 32-4);
  br(Assembler::GE, CRC_by4_loop);
  adds(len, len, 4);
  br(Assembler::GT, CRC_by1_loop);
  b(L_exit);

  BIND(CRC_by32_loop);
  ldp(tmp0, tmp1, Address(post(buf, 16)));
  subs(len, len, 32);
  crc32x(crc, crc, tmp0);
  ldr(tmp2, Address(post(buf, 8)));
  crc32x(crc, crc, tmp1);
  ldr(tmp3, Address(post(buf, 8)));
  crc32x(crc, crc, tmp2);
  crc32x(crc, crc, tmp3);
  br(Assembler::GE, CRC_by32_loop);
  cmn(len, 32);
  br(Assembler::NE, CRC_less32);
  b(L_exit);

  BIND(CRC_by4_loop);
  ldrw(tmp0, Address(post(buf, 4)));
  subs(len, len, 4);
  crc32w(crc, crc, tmp0);
  br(Assembler::GE, CRC_by4_loop);
  adds(len, len, 4);
  br(Assembler::LE, L_exit);
  BIND(CRC_by1_loop);
  ldrb(tmp0, Address(post(buf, 1)));
  subs(len, len, 1);
  crc32b(crc, crc, tmp0);
  br(Assembler::GT, CRC_by1_loop);
  b(L_exit);

  BIND(CRC_by64_pre);
  // Prime the pipeline: consume the first 64-byte block while loading
  // ahead, so the main loop always has loads in flight.
  sub(buf, buf, 8);
  ldp(tmp0, tmp1, Address(buf, 8));
  crc32x(crc, crc, tmp0);
  ldr(tmp2, Address(buf, 24));
  crc32x(crc, crc, tmp1);
  ldr(tmp3, Address(buf, 32));
  crc32x(crc, crc, tmp2);
  ldr(tmp0, Address(buf, 40));
  crc32x(crc, crc, tmp3);
  ldr(tmp1, Address(buf, 48));
  crc32x(crc, crc, tmp0);
  ldr(tmp2, Address(buf, 56));
  crc32x(crc, crc, tmp1);
  ldr(tmp3, Address(pre(buf, 64)));

  b(CRC_by64_loop);

  align(CodeEntryAlignment);
  BIND(CRC_by64_loop);
  subs(len, len, 64);
  crc32x(crc, crc, tmp2);
  ldr(tmp0, Address(buf, 8));
  crc32x(crc, crc, tmp3);
  ldr(tmp1, Address(buf, 16));
  crc32x(crc, crc, tmp0);
  ldr(tmp2, Address(buf, 24));
  crc32x(crc, crc, tmp1);
  ldr(tmp3, Address(buf, 32));
  crc32x(crc, crc, tmp2);
  ldr(tmp0, Address(buf, 40));
  crc32x(crc, crc, tmp3);
  ldr(tmp1, Address(buf, 48));
  crc32x(crc, crc, tmp0);
  ldr(tmp2, Address(buf, 56));
  crc32x(crc, crc, tmp1);
  ldr(tmp3, Address(pre(buf, 64)));
  br(Assembler::GE, CRC_by64_loop);

  // post-loop
  crc32x(crc, crc, tmp2);
  crc32x(crc, crc, tmp3);

  sub(len, len, 64);
  add(buf, buf, 8);
  cmn(len, 128);
  br(Assembler::NE, CRC_less64);
  BIND(L_exit);
  mvnw(crc, crc);
}

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register that will contain address of CRC table
 * @param tmp   scratch register
 */
void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
        Register table0, Register table1, Register table2, Register table3,
        Register tmp, Register tmp2, Register tmp3) {
  Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
  unsigned long offset;

  // Prefer the hardware CRC32 instructions when available.
  if (UseCRC32) {
    kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
    return;
  }

  mvnw(crc, crc);

  adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
  if (offset) add(table0, table0, offset);
  add(table1, table0, 1*256*sizeof(juint));
  add(table2, table0, 2*256*sizeof(juint));
  add(table3, table0, 3*256*sizeof(juint));

  if (UseNeon) {
    // Carry-less-multiply (PMULL) folding of two 16-byte vectors per
    // iteration, using extra constants stored after the four tables.
    cmp(len, (u1)64);
    br(Assembler::LT, L_by16);
    eor(v16, T16B, v16, v16);

    Label L_fold;

    add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants

    ld1(v0, v1, T2D, post(buf, 32));
    ld1r(v4, T2D, post(tmp, 8));
    ld1r(v5, T2D, post(tmp, 8));
    ld1r(v6, T2D, post(tmp, 8));
    ld1r(v7, T2D, post(tmp, 8));
    mov(v16, T4S, 0, crc);

    eor(v0, T16B, v0, v16);
    sub(len, len, 64);

    BIND(L_fold);
    // Fold v0 against the constants into v28.
    pmull(v22, T8H, v0, v5, T8B);
    pmull(v20, T8H, v0, v7, T8B);
    pmull(v23, T8H, v0, v4, T8B);
    pmull(v21, T8H, v0, v6, T8B);

    pmull2(v18, T8H, v0, v5, T16B);
    pmull2(v16, T8H, v0, v7, T16B);
    pmull2(v19, T8H, v0, v4, T16B);
    pmull2(v17, T8H, v0, v6, T16B);

    uzp1(v24, T8H, v20, v22);
    uzp2(v25, T8H, v20, v22);
    eor(v20, T16B, v24, v25);

    uzp1(v26, T8H, v16, v18);
    uzp2(v27, T8H, v16, v18);
    eor(v16, T16B, v26, v27);

    ushll2(v22, T4S, v20, T8H, 8);
    ushll(v20, T4S, v20, T4H, 8);

    ushll2(v18, T4S, v16, T8H, 8);
    ushll(v16, T4S, v16, T4H, 8);

    eor(v22, T16B, v23, v22);
    eor(v18, T16B, v19, v18);
    eor(v20, T16B, v21, v20);
    eor(v16, T16B, v17, v16);

    uzp1(v17, T2D, v16, v20);
    uzp2(v21, T2D, v16, v20);
    eor(v17, T16B, v17, v21);

    ushll2(v20, T2D, v17, T4S, 16);
    ushll(v16, T2D, v17, T2S, 16);

    eor(v20, T16B, v20, v22);
    eor(v16, T16B, v16, v18);

    uzp1(v17, T2D, v20, v16);
    uzp2(v21, T2D, v20, v16);
    eor(v28, T16B, v17, v21);

    // Fold v1 the same way (next data is loaded in the middle).
    pmull(v22, T8H, v1, v5, T8B);
    pmull(v20, T8H, v1, v7, T8B);
    pmull(v23, T8H, v1, v4, T8B);
    pmull(v21, T8H, v1, v6, T8B);

    pmull2(v18, T8H, v1, v5, T16B);
    pmull2(v16, T8H, v1, v7, T16B);
    pmull2(v19, T8H, v1, v4, T16B);
    pmull2(v17, T8H, v1, v6, T16B);

    ld1(v0, v1, T2D, post(buf, 32));

    uzp1(v24, T8H, v20, v22);
    uzp2(v25, T8H, v20, v22);
    eor(v20, T16B, v24, v25);

    uzp1(v26, T8H, v16, v18);
    uzp2(v27, T8H, v16, v18);
    eor(v16, T16B, v26, v27);

    ushll2(v22, T4S, v20, T8H, 8);
    ushll(v20, T4S, v20, T4H, 8);

    ushll2(v18, T4S, v16, T8H, 8);
    ushll(v16, T4S, v16, T4H, 8);

    eor(v22, T16B, v23, v22);
    eor(v18, T16B, v19, v18);
    eor(v20, T16B, v21, v20);
    eor(v16, T16B, v17, v16);

    uzp1(v17, T2D, v16, v20);
    uzp2(v21, T2D, v16, v20);
    eor(v16, T16B, v17, v21);

    ushll2(v20, T2D, v16, T4S, 16);
    ushll(v16, T2D, v16, T2S, 16);

    eor(v20, T16B, v22, v20);
    eor(v16, T16B, v16, v18);

    uzp1(v17, T2D, v20, v16);
    uzp2(v21, T2D, v20, v16);
    eor(v20, T16B, v17, v21);

    shl(v16, T2D, v28, 1);
    shl(v17, T2D, v20, 1);

    // XOR folded state into the freshly loaded data.
    eor(v0, T16B, v0, v16);
    eor(v1, T16B, v1, v17);

    subs(len, len, 32);
    br(Assembler::GE, L_fold);

    // Reduce the remaining 32-byte vector state through the tables.
    mov(crc, 0);
    mov(tmp, v0, T1D, 0);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
    mov(tmp, v0, T1D, 1);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
    mov(tmp, v1, T1D, 0);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
    mov(tmp, v1, T1D, 1);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);

    add(len, len, 32);
  }

  BIND(L_by16);
  subs(len, len, 16);
  br(Assembler::GE, L_by16_loop);
  adds(len, len, 16-4);
  br(Assembler::GE, L_by4_loop);
  adds(len, len, 4);
  br(Assembler::GT, L_by1_loop);
  b(L_exit);

  BIND(L_by4_loop);
  ldrw(tmp, Address(post(buf, 4)));
  update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
  subs(len, len, 4);
  br(Assembler::GE, L_by4_loop);
  adds(len, len, 4);
  br(Assembler::LE, L_exit);
  BIND(L_by1_loop);
  subs(len, len, 1);
  ldrb(tmp, Address(post(buf, 1)));
  update_byte_crc32(crc, tmp, table0);
  br(Assembler::GT, L_by1_loop);
  b(L_exit);

  align(CodeEntryAlignment);
  BIND(L_by16_loop);
  subs(len, len, 16);
  ldp(tmp, tmp3, Address(post(buf, 16)));
  update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
  update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
  update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
  update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
  br(Assembler::GE, L_by16_loop);
  adds(len, len, 16-4);
  br(Assembler::GE, L_by4_loop);
  adds(len, len, 4);
  br(Assembler::GT, L_by1_loop);
  BIND(L_exit);
  mvnw(crc, crc);
}

// CRC-32C using the hardware crc32c* instructions.  Same structure as
// kernel_crc32_using_crc32 above, but with the Castagnoli polynomial
// and no bit inversion on entry/exit.
void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
        Register len, Register tmp0, Register tmp1, Register tmp2,
        Register tmp3) {
  Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
  assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);

  subs(len, len, 128);
  br(Assembler::GE, CRC_by64_pre);
  BIND(CRC_less64);
  adds(len, len, 128-32);
  br(Assembler::GE, CRC_by32_loop);
  BIND(CRC_less32);
  adds(len, len, 32-4);
  br(Assembler::GE, CRC_by4_loop);
  adds(len, len, 4);
  br(Assembler::GT, CRC_by1_loop);
  b(L_exit);

  BIND(CRC_by32_loop);
  ldp(tmp0, tmp1, Address(post(buf, 16)));
  subs(len, len, 32);
  crc32cx(crc, crc, tmp0);
  ldr(tmp2, Address(post(buf, 8)));
  crc32cx(crc, crc, tmp1);
  ldr(tmp3, Address(post(buf, 8)));
  crc32cx(crc, crc, tmp2);
  crc32cx(crc, crc, tmp3);
  br(Assembler::GE, CRC_by32_loop);
  cmn(len, 32);
  br(Assembler::NE, CRC_less32);
  b(L_exit);

  BIND(CRC_by4_loop);
  ldrw(tmp0, Address(post(buf, 4)));
  subs(len, len, 4);
  crc32cw(crc, crc, tmp0);
  br(Assembler::GE, CRC_by4_loop);
  adds(len, len, 4);
  br(Assembler::LE, L_exit);
  BIND(CRC_by1_loop);
  ldrb(tmp0, Address(post(buf, 1)));
  subs(len, len, 1);
  crc32cb(crc, crc, tmp0);
  br(Assembler::GT, CRC_by1_loop);
  b(L_exit);

  BIND(CRC_by64_pre);
  // Prime the software pipeline for the 64-byte main loop.
  sub(buf, buf, 8);
  ldp(tmp0, tmp1, Address(buf, 8));
  crc32cx(crc, crc, tmp0);
  ldr(tmp2, Address(buf, 24));
  crc32cx(crc, crc, tmp1);
  ldr(tmp3, Address(buf, 32));
  crc32cx(crc, crc, tmp2);
  ldr(tmp0, Address(buf, 40));
  crc32cx(crc, crc, tmp3);
  ldr(tmp1, Address(buf, 48));
  crc32cx(crc, crc, tmp0);
  ldr(tmp2, Address(buf, 56));
  crc32cx(crc, crc, tmp1);
  ldr(tmp3, Address(pre(buf, 64)));

  b(CRC_by64_loop);

  align(CodeEntryAlignment);
  BIND(CRC_by64_loop);
  subs(len, len, 64);
  crc32cx(crc, crc, tmp2);
  ldr(tmp0, Address(buf, 8));
  crc32cx(crc, crc, tmp3);
  ldr(tmp1, Address(buf, 16));
  crc32cx(crc, crc, tmp0);
  ldr(tmp2, Address(buf, 24));
  crc32cx(crc, crc, tmp1);
  ldr(tmp3, Address(buf, 32));
  crc32cx(crc, crc, tmp2);
  ldr(tmp0, Address(buf, 40));
  crc32cx(crc, crc, tmp3);
  ldr(tmp1, Address(buf, 48));
  crc32cx(crc, crc, tmp0);
  ldr(tmp2, Address(buf, 56));
  crc32cx(crc, crc, tmp1);
  ldr(tmp3, Address(pre(buf, 64)));
  br(Assembler::GE, CRC_by64_loop);

  // post-loop
  crc32cx(crc, crc, tmp2);
  crc32cx(crc, crc, tmp3);

  sub(len, len, 64);
  add(buf, buf, 8);
  cmn(len, 128);
  br(Assembler::NE, CRC_less64);
  BIND(L_exit);
}

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register that will contain address of CRC table
 * @param tmp   scratch register
 */
void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
        Register table0, Register table1, Register table2, Register table3,
        Register tmp, Register tmp2, Register tmp3) {
  // CRC32C is always available when this is called, so no table-driven
  // fallback is emitted here.
  kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
}


// Emits a test of *flag_addr and a forward branch over the guarded code
// when it is zero; the destructor binds the branch target.
// NOTE(review): the 'value' parameter is not read here — this always
// skips when the flag byte is zero, regardless of 'value'.  Confirm
// against callers before relying on it.
SkipIfEqual::SkipIfEqual(
    MacroAssembler* masm, const bool* flag_addr, bool value) {
  _masm = masm;
  unsigned long offset;
  _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
  _masm->ldrb(rscratch1, Address(rscratch1, offset));
  _masm->cbzw(rscratch1, _label);
}

SkipIfEqual::~SkipIfEqual() {
  _masm->bind(_label);
}

// Emit *dst += src for a pointer-sized memory operand (load, add,
// store via the scratch registers).
void MacroAssembler::addptr(const Address &dst, int32_t src) {
  Address adr;
  switch(dst.getMode()) {
  case Address::base_plus_offset:
    // This is the expected mode, although we allow all the other
    // forms below.
3694 adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord); 3695 break; 3696 default: 3697 lea(rscratch2, dst); 3698 adr = Address(rscratch2); 3699 break; 3700 } 3701 ldr(rscratch1, adr); 3702 add(rscratch1, rscratch1, src); 3703 str(rscratch1, adr); 3704 } 3705 3706 void MacroAssembler::cmpptr(Register src1, Address src2) { 3707 unsigned long offset; 3708 adrp(rscratch1, src2, offset); 3709 ldr(rscratch1, Address(rscratch1, offset)); 3710 cmp(src1, rscratch1); 3711 } 3712 3713 void MacroAssembler::cmpoop(Register obj1, Register obj2) { 3714 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3715 bs->obj_equals(this, obj1, obj2); 3716 } 3717 3718 void MacroAssembler::load_klass(Register dst, Register src) { 3719 if (UseCompressedClassPointers) { 3720 ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes())); 3721 decode_klass_not_null(dst); 3722 } else { 3723 ldr(dst, Address(src, oopDesc::klass_offset_in_bytes())); 3724 } 3725 } 3726 3727 // ((OopHandle)result).resolve(); 3728 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) { 3729 // OopHandle::resolve is an indirection. 
3730 access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg); 3731 } 3732 3733 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) { 3734 const int mirror_offset = in_bytes(Klass::java_mirror_offset()); 3735 ldr(dst, Address(rmethod, Method::const_offset())); 3736 ldr(dst, Address(dst, ConstMethod::constants_offset())); 3737 ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes())); 3738 ldr(dst, Address(dst, mirror_offset)); 3739 resolve_oop_handle(dst, tmp); 3740 } 3741 3742 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) { 3743 if (UseCompressedClassPointers) { 3744 ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes())); 3745 if (Universe::narrow_klass_base() == NULL) { 3746 cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift()); 3747 return; 3748 } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0 3749 && Universe::narrow_klass_shift() == 0) { 3750 // Only the bottom 32 bits matter 3751 cmpw(trial_klass, tmp); 3752 return; 3753 } 3754 decode_klass_not_null(tmp); 3755 } else { 3756 ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes())); 3757 } 3758 cmp(trial_klass, tmp); 3759 } 3760 3761 void MacroAssembler::load_prototype_header(Register dst, Register src) { 3762 load_klass(dst, src); 3763 ldr(dst, Address(dst, Klass::prototype_header_offset())); 3764 } 3765 3766 void MacroAssembler::store_klass(Register dst, Register src) { 3767 // FIXME: Should this be a store release? concurrent gcs assumes 3768 // klass length is valid if klass field is not null. 
  if (UseCompressedClassPointers) {
    // Compress in place, then store the 32-bit narrow klass.
    encode_klass_not_null(src);
    strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
  } else {
    str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
  }
}

// Store src into the 32-bit klass gap that exists in the object header
// when compressed class pointers are enabled; no-op otherwise.
void MacroAssembler::store_klass_gap(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    // Store to klass gap in destination
    strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
  }
}

// Algorithm must match CompressedOops::encode.
// Compress the oop in s into d; a NULL oop encodes to zero.
void MacroAssembler::encode_heap_oop(Register d, Register s) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
#endif
  verify_oop(s, "broken oop in encode_heap_oop");
  if (Universe::narrow_oop_base() == NULL) {
    // Zero-based compressed oops: shift (or just copy) the pointer.
    if (Universe::narrow_oop_shift() != 0) {
      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
      lsr(d, s, LogMinObjAlignmentInBytes);
    } else {
      mov(d, s);
    }
  } else {
    // Subtract the heap base.  A NULL oop is below the heap base, so
    // the subtraction borrows (condition LO) and csel replaces the
    // result with zero; real oops (HS) keep s - rheapbase.
    subs(d, s, rheapbase);
    csel(d, d, zr, Assembler::HS);
    lsr(d, d, LogMinObjAlignmentInBytes);

    /* Old algorithm: is this any worse?
       Label nonnull;
       cbnz(r, nonnull);
       sub(r, r, rheapbase);
       bind(nonnull);
       lsr(r, r, LogMinObjAlignmentInBytes);
    */
  }
}

// Compress the oop in r in place; r must not be NULL (checked in debug
// builds when CheckCompressedOops is set).
void MacroAssembler::encode_heap_oop_not_null(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    cbnz(r, ok);
    stop("null oop passed to encode_heap_oop_not_null");
    bind(ok);
  }
#endif
  verify_oop(r, "broken oop in encode_heap_oop_not_null");
  if (Universe::narrow_oop_base() != NULL) {
    sub(r, r, rheapbase);
  }
  if (Universe::narrow_oop_shift() != 0) {
    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    lsr(r, r, LogMinObjAlignmentInBytes);
  }
}

// Compress the non-NULL oop in src into dst; src is left untouched
// unless dst == src.
void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    cbnz(src, ok);
    stop("null oop passed to encode_heap_oop_not_null2");
    bind(ok);
  }
#endif
  verify_oop(src, "broken oop in encode_heap_oop_not_null2");

  // 'data' tracks the register currently holding the partial result.
  Register data = src;
  if (Universe::narrow_oop_base() != NULL) {
    sub(dst, src, rheapbase);
    data = dst;
  }
  if (Universe::narrow_oop_shift() != 0) {
    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    lsr(dst, data, LogMinObjAlignmentInBytes);
    data = dst;
  }
  // Neither base subtraction nor shift was applied: plain copy.
  if (data == src)
    mov(dst, src);
}

// Decompress the narrow oop in s into d; zero decodes to NULL.
void MacroAssembler::decode_heap_oop(Register d, Register s) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
#endif
  if (Universe::narrow_oop_base() == NULL) {
    // Zero-based: shifting is all that is needed (skip when it would
    // be a no-op move to the same register).
    if (Universe::narrow_oop_shift() != 0 || d != s) {
      lsl(d, s, Universe::narrow_oop_shift());
    }
  } else {
    Label done;
    if (d != s)
      mov(d, s);
    // NULL (zero) must stay NULL: skip the base-add for a zero input.
    cbz(s, done);
    add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
    bind(done);
  }
  verify_oop(d, "broken oop in decode_heap_oop");
}

// Decompress the narrow oop in r in place; r must not be zero/NULL.
void MacroAssembler::decode_heap_oop_not_null(Register r) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (Universe::narrow_oop_shift() != 0) {
    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    if (Universe::narrow_oop_base() != NULL) {
      add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
    } else {
      // Zero base: the add with zr is just a shift.
      add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
    }
  } else {
    assert (Universe::narrow_oop_base() == NULL, "sanity");
  }
}

// Decompress the narrow oop in src into dst; src must not be zero/NULL.
void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (Universe::narrow_oop_shift() != 0) {
    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    if (Universe::narrow_oop_base() != NULL) {
      add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
    } else {
      // Zero base: the add with zr is just a shift.
      add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
    }
  } else {
    assert (Universe::narrow_oop_base() == NULL, "sanity");
    if (dst != src) {
      mov(dst, src);
    }
  }
}

// Compress the klass pointer in src into dst.  Must be the exact
// inverse of decode_klass_not_null below.
void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
  if (Universe::narrow_klass_base() == NULL) {
    // Zero-based encoding: shift (or plain copy) only.
    if (Universe::narrow_klass_shift() != 0) {
      assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
      lsr(dst, src, LogKlassAlignmentInBytes);
    } else {
      if (dst != src) mov(dst, src);
    }
    return;
  }

  if (use_XOR_for_compressed_class_base) {
    // XOR with the base replaces the subtraction here — presumably
    // guaranteed valid by how use_XOR_for_compressed_class_base is
    // computed (see its definition).
    if (Universe::narrow_klass_shift() != 0) {
      eor(dst, src, (uint64_t)Universe::narrow_klass_base());
      lsr(dst, dst, LogKlassAlignmentInBytes);
    } else {
      eor(dst, src, (uint64_t)Universe::narrow_klass_base());
    }
    return;
  }

  if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
      && Universe::narrow_klass_shift() == 0) {
    // Base has only high bits set and there is no shift: the narrow
    // klass is just the low 32 bits of the pointer.
    movw(dst, src);
    return;
  }

#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
#endif

  // General case: materialize the base and subtract.  When dst == src
  // there is no spare register, so borrow rheapbase and restore it
  // afterwards with reinit_heapbase().
  Register rbase = dst;
  if (dst == src) rbase = rheapbase;
  mov(rbase, (uint64_t)Universe::narrow_klass_base());
  sub(dst, src, rbase);
  if (Universe::narrow_klass_shift() != 0) {
    assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
    lsr(dst, dst, LogKlassAlignmentInBytes);
  }
  if (dst == src) reinit_heapbase();
}

// In-place variant.
void MacroAssembler::encode_klass_not_null(Register r) {
  encode_klass_not_null(r, r);
}

void
MacroAssembler::decode_klass_not_null(Register dst, Register src) {
  Register rbase = dst;
  assert (UseCompressedClassPointers, "should only be used for compressed headers");

  if (Universe::narrow_klass_base() == NULL) {
    // Zero-based decoding: shift (or plain copy) only.
    if (Universe::narrow_klass_shift() != 0) {
      assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
      lsl(dst, src, LogKlassAlignmentInBytes);
    } else {
      if (dst != src) mov(dst, src);
    }
    return;
  }

  if (use_XOR_for_compressed_class_base) {
    // Exact inverse of the XOR encoding in encode_klass_not_null.
    if (Universe::narrow_klass_shift() != 0) {
      lsl(dst, src, LogKlassAlignmentInBytes);
      eor(dst, dst, (uint64_t)Universe::narrow_klass_base());
    } else {
      eor(dst, src, (uint64_t)Universe::narrow_klass_base());
    }
    return;
  }

  if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
      && Universe::narrow_klass_shift() == 0) {
    // Base occupies only the high 32 bits and there is no shift:
    // rebuild the pointer by inserting the base's upper half via movk.
    if (dst != src)
      movw(dst, src);
    movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32);
    return;
  }

  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  // General case: materialize the base and add.  When dst == src there
  // is no spare register, so borrow rheapbase and restore it afterwards.
  if (dst == src) rbase = rheapbase;
  mov(rbase, (uint64_t)Universe::narrow_klass_base());
  if (Universe::narrow_klass_shift() != 0) {
    assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
    add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
  } else {
    add(dst, rbase, src);
  }
  if (dst == src) reinit_heapbase();
}

// In-place variant.
void MacroAssembler::decode_klass_not_null(Register r) {
  decode_klass_not_null(r, r);
}

// Materialize the narrow oop for 'obj' in dst as a movz/movk pair.
// 0xDEAD/0xBEEF are placeholder bits; the oop relocation recorded here
// lets the real narrow-oop value be patched in.
void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
#ifdef ASSERT
  {
    ThreadInVMfromUnknown tiv;
    assert (UseCompressedOops, "should only be used for compressed oops");
    assert (Universe::heap() != NULL, "java heap should be initialized");
    assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
    assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
  }
#endif
  int oop_index = oop_recorder()->find_index(obj);
  InstructionMark im(this);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  code_section()->relocate(inst_mark(), rspec);
  movz(dst, 0xDEAD, 16);
  movk(dst, 0xBEEF);
}

// Materialize the narrow klass for 'k' in dst as a movz/movk pair,
// recording a metadata relocation for it.
void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int index = oop_recorder()->find_index(k);
  assert(!
  Universe::heap()->is_in_reserved(k), "should not be an oop");

  InstructionMark im(this);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  code_section()->relocate(inst_mark(), rspec);
  narrowKlass nk = Klass::encode_klass(k);
  // Emit the 32-bit narrow klass: movz writes the upper 16 bits
  // (shift 16), movk fills in the lower 16.
  movz(dst, (nk >> 16), 16);
  movk(dst, nk & 0xffff);
}

// GC-aware load: after decorator fixup, dispatch to the active barrier
// set's load_at, or to the raw BarrierSetAssembler implementation when
// AS_RAW is set.
void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
                                    Register dst, Address src,
                                    Register tmp1, Register thread_tmp) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  decorators = AccessInternal::decorator_fixup(decorators);
  bool as_raw = (decorators & AS_RAW) != 0;
  if (as_raw) {
    bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  } else {
    bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  }
}

// GC-aware store: counterpart of access_load_at for stores.
void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
                                     Address dst, Register src,
                                     Register tmp1, Register thread_tmp) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  decorators = AccessInternal::decorator_fixup(decorators);
  bool as_raw = (decorators & AS_RAW) != 0;
  if (as_raw) {
    bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  } else {
    bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  }
}

// Let the barrier set resolve 'obj' (in place) for the given access mode.
void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
  // Use stronger ACCESS_WRITE|ACCESS_READ by default.
  if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
    decorators |= ACCESS_READ | ACCESS_WRITE;
  }
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  return bs->resolve(this, decorators, obj);
}

// Load a heap oop from src into dst through the GC barrier machinery.
void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
                                   Register thread_tmp, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
}

// As load_heap_oop, but the caller guarantees the loaded oop is not NULL.
void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
                                            Register thread_tmp, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
}

// Store a heap oop to dst through the GC barrier machinery.
void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
                                    Register thread_tmp, DecoratorSet decorators) {
  access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
}

// Used for storing NULLs.
void MacroAssembler::store_heap_oop_null(Address dst) {
  access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
}

// Return an Address for 'obj' carrying a metadata relocation; note the
// metadata index is freshly allocated (not looked up) in the recorder.
Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->allocate_metadata_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return Address((address)obj, rspec);
}

// Move an oop into a register.  immediate is true if we want
// immediate instructions, i.e. we are not going to patch this
// instruction while the code is being executed by another thread.  In
// that case we can use move immediates rather than the constant pool.
void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
  int oop_index;
  if (obj == NULL) {
    oop_index = oop_recorder()->allocate_oop_index(obj);
  } else {
#ifdef ASSERT
    {
      ThreadInVMfromUnknown tiv;
      assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
    }
#endif
    oop_index = oop_recorder()->find_index(obj);
  }
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  if (! immediate) {
    // Patchable form: load the oop from a constant-pool slot instead
    // of encoding it as a move-immediate sequence.
    address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
    ldr_constant(dst, Address(dummy, rspec));
  } else
    mov(dst, Address((address)obj, rspec));
}

// Move a metadata address into a register.
void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  int oop_index;
  if (obj == NULL) {
    // NULL metadata still gets an index so the relocation is recorded.
    oop_index = oop_recorder()->allocate_metadata_index(obj);
  } else {
    oop_index = oop_recorder()->find_index(obj);
  }
  RelocationHolder rspec = metadata_Relocation::spec(oop_index);
  mov(dst, Address((address)obj, rspec));
}

// Return an Address wrapping 'obj' with its oop relocation attached;
// debug builds assert that obj resolves to a real heap oop.
Address MacroAssembler::constant_oop_address(jobject obj) {
#ifdef ASSERT
  {
    ThreadInVMfromUnknown tiv;
    assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
    assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
  }
#endif
  int oop_index = oop_recorder()->find_index(obj);
  return Address((address)obj, oop_Relocation::spec(oop_index));
}

// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
// TLAB allocation is delegated to the active GC's barrier set assembler.
void MacroAssembler::tlab_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Register t2,
                                   Label& slow_case) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
}

// Defines obj, preserves var_size_in_bytes.
// Eden allocation is delegated to the active GC's barrier set assembler.
void MacroAssembler::eden_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Label& slow_case) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
}

// Zero words; len is in bytes
// Destroys all registers except addr
// len must be a nonzero multiple of wordSize
void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
  assert_different_registers(addr, len, t1, rscratch1, rscratch2);

#ifdef ASSERT
  { Label L;
    tst(len, BytesPerWord - 1);
    br(Assembler::EQ, L);
    stop("len is not a multiple of BytesPerWord");
    bind(L);
  }
#endif

#ifndef PRODUCT
  block_comment("zero memory");
#endif

  Label loop;
  Label entry;

  // Algorithm (Duff's-device style unrolled zeroing):
  //
  //    scratch1 = cnt & 7;
  //    cnt -= scratch1;
  //    p += scratch1;
  //    switch (scratch1) {
  //      do {
  //        cnt -= 8;
  //          p[-8] = 0;
  //        case 7:
  //          p[-7] = 0;
  //        case 6:
  //          p[-6] = 0;
  //          // ...
  //        case 1:
  //          p[-1] = 0;
  //        case 0:
  //          p += 8;
  //      } while (cnt);
  //    }

  const int unroll = 8; // Number of str(zr) instructions we'll unroll

  lsr(len, len, LogBytesPerWord);    // len: bytes -> words
  andr(rscratch1, len, unroll - 1);  // rscratch1 = cnt % unroll
  sub(len, len, rscratch1);          // cnt -= (cnt % unroll); now a multiple of unroll
  // t1 always points to the end of the region we're about to zero
  add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
  // Computed branch into the unrolled loop so that only the last
  // (cnt % unroll) stores execute on the first pass; each str is 4
  // bytes, hence the LSL 2.
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
  br(rscratch2);
  bind(loop);
  sub(len, len, unroll);
  for (int i = -unroll; i < 0; i++)
    Assembler::str(zr, Address(t1, i * wordSize));
  bind(entry);
  add(t1, t1, unroll * wordSize);
  cbnz(len, loop);
}

// Debug-build check of the current thread's TLAB invariants:
// start <= top <= end.  Emits nothing unless UseTLAB && VerifyOops.
void MacroAssembler::verify_tlab() {
#ifdef ASSERT
  if (UseTLAB && VerifyOops) {
    Label next, ok;

    stp(rscratch2, rscratch1, Address(pre(sp, -16)));   // save scratch regs

    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
    cmp(rscratch2, rscratch1);
    br(Assembler::HS, next);
    STOP("assert(top >= start)");
    should_not_reach_here();

    bind(next);
    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
    cmp(rscratch2, rscratch1);
    br(Assembler::HS, ok);
    STOP("assert(top <= end)");
    should_not_reach_here();

    bind(ok);
    ldp(rscratch2, rscratch1, Address(post(sp, 16)));   // restore scratch regs
  }
#endif
}

// Writes to stack successive pages until offset reached to check for
// stack overflow + shadow pages.  This clobbers tmp.
void MacroAssembler::bang_stack_size(Register size, Register tmp) {
  assert_different_registers(tmp, size, rscratch1);
  mov(tmp, sp);
  // Bang stack for total size given plus shadow page size.
  // Bang one page at a time because large size can bang beyond yellow and
  // red zones.
  Label loop;
  mov(rscratch1, os::vm_page_size());
  bind(loop);
  lea(tmp, Address(tmp, -os::vm_page_size()));
  subsw(size, size, rscratch1);
  str(size, Address(tmp));
  br(Assembler::GT, loop);

  // Bang down shadow pages too.
  // At this point, (tmp-0) is the last address touched, so don't
  // touch it again.  (It was touched as (tmp-pagesize) but then tmp
  // was post-decremented.)  Skip this address by starting at i=1, and
  // touch a few more pages below.  N.B.  It is important to touch all
  // the way down to and including i=StackShadowPages.
  for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
    // this could be any sized move but this can be a debugging crumb
    // so the bigger the better.
    lea(tmp, Address(tmp, -os::vm_page_size()));
    str(size, Address(tmp));
  }
}


// Move the address of the polling page into dest.
void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    // Thread-local polling: the page address is stored in the thread.
    ldr(dest, Address(rthread, Thread::polling_page_offset()));
  } else {
    unsigned long off;
    adrp(dest, Address(page, rtype), off);
    assert(off == 0, "polling page must be page aligned");
  }
}

// Move the address of the polling page into r, then read the polling
// page.
address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
  get_polling_page(r, page, rtype);
  return read_polling_page(r, rtype);
}

// Read the polling page.
// The address of the polling page must
// already be in r.  Returns the address of the emitted load so the
// caller can associate it with a safepoint.
address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
  InstructionMark im(this);
  code_section()->relocate(inst_mark(), rtype);
  // The loaded value is discarded (destination zr); only the memory
  // access itself matters.
  ldrw(zr, Address(r, 0));
  return inst_mark();
}

// Emit a page-relative address computation: reg1 <- page of dest.target(),
// with the low 12 bits of the target returned in byte_offset for use in
// a following load/store.
void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
  relocInfo::relocType rtype = dest.rspec().reloc()->type();
  unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
  unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
  unsigned long dest_page = (unsigned long)dest.target() >> 12;
  long offset_low = dest_page - low_page;
  long offset_high = dest_page - high_page;

  assert(is_valid_AArch64_address(dest.target()), "bad address");
  assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");

  InstructionMark im(this);
  code_section()->relocate(inst_mark(), dest.rspec());
  // 8143067: Ensure that the adrp can reach the dest from anywhere within
  // the code cache so that if it is relocated we know it will still reach
  if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
    _adrp(reg1, dest.target());
  } else {
    // Target not reachable by a single adrp from every code-cache
    // location: adrp materializes the low 32 bits, movk the high 32.
    unsigned long target = (unsigned long)dest.target();
    unsigned long adrp_target
      = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);

    _adrp(reg1, (address)adrp_target);
    movk(reg1, target >> 32, 32);
  }
  byte_offset = (unsigned long)dest.target() & 0xfff;
}

// Load the card table's byte_map_base into reg.
void MacroAssembler::load_byte_map_base(Register reg) {
  CardTable::CardValue* byte_map_base =
    ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();

  if (is_valid_AArch64_address((address)byte_map_base)) {
    // Strictly speaking the byte_map_base isn't an address at all,
    // and it might even be negative.
    unsigned long offset;
    adrp(reg, ExternalAddress((address)byte_map_base), offset);
    // We expect offset to be zero with most collectors.
    if (offset != 0) {
      add(reg, reg, offset);
    }
  } else {
    mov(reg, (uint64_t)byte_map_base);
  }
}

// Allocate a stack frame of 'framesize' bytes and save rfp/lr at the
// top of the new frame.
void MacroAssembler::build_frame(int framesize) {
  assert(framesize > 0, "framesize must be > 0");
  if (framesize < ((1 << 9) + 2 * wordSize)) {
    // Small frame: one sub suffices and the stp offset is in range.
    sub(sp, sp, framesize);
    stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
    if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
  } else {
    // Large frame: push rfp/lr first, then lower sp (via rscratch1 when
    // the amount exceeds the 12-bit immediate range of sub).
    stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
    if (PreserveFramePointer) mov(rfp, sp);
    if (framesize < ((1 << 12) + 2 * wordSize))
      sub(sp, sp, framesize - 2 * wordSize);
    else {
      mov(rscratch1, framesize - 2 * wordSize);
      sub(sp, sp, rscratch1);
    }
  }
}

// Tear down a frame created by build_frame: restore rfp/lr and release
// 'framesize' bytes of stack.
void MacroAssembler::remove_frame(int framesize) {
  assert(framesize > 0, "framesize must be > 0");
  if (framesize < ((1 << 9) + 2 * wordSize)) {
    ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
    add(sp, sp, framesize);
  } else {
    if (framesize < ((1 << 12) + 2 * wordSize))
      add(sp, sp, framesize - 2 * wordSize);
    else {
      mov(rscratch1, framesize - 2 * wordSize);
      add(sp, sp, rscratch1);
    }
    ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
  }
}

#ifdef COMPILER2
// Member-function-pointer type over the ldrb/ldrh/ldrw/ldr family so a
// single code path can load either Latin1 or UTF-16 characters.
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

// Search for str1 in str2 and return index or -1
void MacroAssembler::string_indexof(Register str2, Register str1,
                                    Register cnt2, Register cnt1,
                                    Register tmp1, Register tmp2,
                                    Register tmp3, Register tmp4,
                                    Register tmp5, Register tmp6,
                                    int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
4416 4417 Register ch1 = rscratch1; 4418 Register ch2 = rscratch2; 4419 Register cnt1tmp = tmp1; 4420 Register cnt2tmp = tmp2; 4421 Register cnt1_neg = cnt1; 4422 Register cnt2_neg = cnt2; 4423 Register result_tmp = tmp4; 4424 4425 bool isL = ae == StrIntrinsicNode::LL; 4426 4427 bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL; 4428 bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU; 4429 int str1_chr_shift = str1_isL ? 0:1; 4430 int str2_chr_shift = str2_isL ? 0:1; 4431 int str1_chr_size = str1_isL ? 1:2; 4432 int str2_chr_size = str2_isL ? 1:2; 4433 chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb : 4434 (chr_insn)&MacroAssembler::ldrh; 4435 chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb : 4436 (chr_insn)&MacroAssembler::ldrh; 4437 chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw; 4438 chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr; 4439 4440 // Note, inline_string_indexOf() generates checks: 4441 // if (substr.count > string.count) return -1; 4442 // if (substr.count == 0) return 0; 4443 4444 // We have two strings, a source string in str2, cnt2 and a pattern string 4445 // in str1, cnt1. Find the 1st occurence of pattern in source or return -1. 4446 4447 // For larger pattern and source we use a simplified Boyer Moore algorithm. 4448 // With a small pattern and source we use linear scan. 4449 4450 if (icnt1 == -1) { 4451 sub(result_tmp, cnt2, cnt1); 4452 cmp(cnt1, (u1)8); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256 4453 br(LT, LINEARSEARCH); 4454 dup(v0, T16B, cnt1); // done in separate FPU pipeline. 
Almost no penalty 4455 subs(zr, cnt1, 256); 4456 lsr(tmp1, cnt2, 2); 4457 ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM 4458 br(GE, LINEARSTUB); 4459 } 4460 4461 // The Boyer Moore alogorithm is based on the description here:- 4462 // 4463 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm 4464 // 4465 // This describes and algorithm with 2 shift rules. The 'Bad Character' rule 4466 // and the 'Good Suffix' rule. 4467 // 4468 // These rules are essentially heuristics for how far we can shift the 4469 // pattern along the search string. 4470 // 4471 // The implementation here uses the 'Bad Character' rule only because of the 4472 // complexity of initialisation for the 'Good Suffix' rule. 4473 // 4474 // This is also known as the Boyer-Moore-Horspool algorithm:- 4475 // 4476 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm 4477 // 4478 // This particular implementation has few java-specific optimizations. 4479 // 4480 // #define ASIZE 256 4481 // 4482 // int bm(unsigned char *x, int m, unsigned char *y, int n) { 4483 // int i, j; 4484 // unsigned c; 4485 // unsigned char bc[ASIZE]; 4486 // 4487 // /* Preprocessing */ 4488 // for (i = 0; i < ASIZE; ++i) 4489 // bc[i] = m; 4490 // for (i = 0; i < m - 1; ) { 4491 // c = x[i]; 4492 // ++i; 4493 // // c < 256 for Latin1 string, so, no need for branch 4494 // #ifdef PATTERN_STRING_IS_LATIN1 4495 // bc[c] = m - i; 4496 // #else 4497 // if (c < ASIZE) bc[c] = m - i; 4498 // #endif 4499 // } 4500 // 4501 // /* Searching */ 4502 // j = 0; 4503 // while (j <= n - m) { 4504 // c = y[i+j]; 4505 // if (x[m-1] == c) 4506 // for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i); 4507 // if (i < 0) return j; 4508 // // c < 256 for Latin1 string, so, no need for branch 4509 // #ifdef SOURCE_STRING_IS_LATIN1 4510 // // LL case: (c< 256) always true. 
Remove branch 4511 // j += bc[y[j+m-1]]; 4512 // #endif 4513 // #ifndef PATTERN_STRING_IS_UTF 4514 // // UU case: need if (c<ASIZE) check. Skip 1 character if not. 4515 // if (c < ASIZE) 4516 // j += bc[y[j+m-1]]; 4517 // else 4518 // j += 1 4519 // #endif 4520 // #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF 4521 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not. 4522 // if (c < ASIZE) 4523 // j += bc[y[j+m-1]]; 4524 // else 4525 // j += m 4526 // #endif 4527 // } 4528 // } 4529 4530 if (icnt1 == -1) { 4531 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH, 4532 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP; 4533 Register cnt1end = tmp2; 4534 Register str2end = cnt2; 4535 Register skipch = tmp2; 4536 4537 // str1 length is >=8, so, we can read at least 1 register for cases when 4538 // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for 4539 // UL case. We'll re-read last character in inner pre-loop code to have 4540 // single outer pre-loop load 4541 const int firstStep = isL ? 
7 : 3; 4542 4543 const int ASIZE = 256; 4544 const int STORED_BYTES = 32; // amount of bytes stored per instruction 4545 sub(sp, sp, ASIZE); 4546 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations 4547 mov(ch1, sp); 4548 BIND(BM_INIT_LOOP); 4549 stpq(v0, v0, Address(post(ch1, STORED_BYTES))); 4550 subs(tmp5, tmp5, 1); 4551 br(GT, BM_INIT_LOOP); 4552 4553 sub(cnt1tmp, cnt1, 1); 4554 mov(tmp5, str2); 4555 add(str2end, str2, result_tmp, LSL, str2_chr_shift); 4556 sub(ch2, cnt1, 1); 4557 mov(tmp3, str1); 4558 BIND(BCLOOP); 4559 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size))); 4560 if (!str1_isL) { 4561 subs(zr, ch1, ASIZE); 4562 br(HS, BCSKIP); 4563 } 4564 strb(ch2, Address(sp, ch1)); 4565 BIND(BCSKIP); 4566 subs(ch2, ch2, 1); 4567 br(GT, BCLOOP); 4568 4569 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1 4570 if (str1_isL == str2_isL) { 4571 // load last 8 bytes (8LL/4UU symbols) 4572 ldr(tmp6, Address(tmp6, -wordSize)); 4573 } else { 4574 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols) 4575 // convert Latin1 to UTF. We'll have to wait until load completed, but 4576 // it's still faster than per-character loads+checks 4577 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1] 4578 ubfx(ch1, tmp6, 8, 8); // str1[N-2] 4579 ubfx(ch2, tmp6, 16, 8); // str1[N-3] 4580 andr(tmp6, tmp6, 0xFF); // str1[N-4] 4581 orr(ch2, ch1, ch2, LSL, 16); 4582 orr(tmp6, tmp6, tmp3, LSL, 48); 4583 orr(tmp6, tmp6, ch2, LSL, 16); 4584 } 4585 BIND(BMLOOPSTR2); 4586 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 4587 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8 4588 if (str1_isL == str2_isL) { 4589 // re-init tmp3. It's for free because it's executed in parallel with 4590 // load above. 
Alternative is to initialize it before loop, but it'll 4591 // affect performance on in-order systems with 2 or more ld/st pipelines 4592 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size)); 4593 } 4594 if (!isL) { // UU/UL case 4595 lsl(ch2, cnt1tmp, 1); // offset in bytes 4596 } 4597 cmp(tmp3, skipch); 4598 br(NE, BMSKIP); 4599 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2)); 4600 mov(ch1, tmp6); 4601 if (isL) { 4602 b(BMLOOPSTR1_AFTER_LOAD); 4603 } else { 4604 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8 4605 b(BMLOOPSTR1_CMP); 4606 } 4607 BIND(BMLOOPSTR1); 4608 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 4609 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 4610 BIND(BMLOOPSTR1_AFTER_LOAD); 4611 subs(cnt1tmp, cnt1tmp, 1); 4612 br(LT, BMLOOPSTR1_LASTCMP); 4613 BIND(BMLOOPSTR1_CMP); 4614 cmp(ch1, ch2); 4615 br(EQ, BMLOOPSTR1); 4616 BIND(BMSKIP); 4617 if (!isL) { 4618 // if we've met UTF symbol while searching Latin1 pattern, then we can 4619 // skip cnt1 symbols 4620 if (str1_isL != str2_isL) { 4621 mov(result_tmp, cnt1); 4622 } else { 4623 mov(result_tmp, 1); 4624 } 4625 subs(zr, skipch, ASIZE); 4626 br(HS, BMADV); 4627 } 4628 ldrb(result_tmp, Address(sp, skipch)); // load skip distance 4629 BIND(BMADV); 4630 sub(cnt1tmp, cnt1, 1); 4631 add(str2, str2, result_tmp, LSL, str2_chr_shift); 4632 cmp(str2, str2end); 4633 br(LE, BMLOOPSTR2); 4634 add(sp, sp, ASIZE); 4635 b(NOMATCH); 4636 BIND(BMLOOPSTR1_LASTCMP); 4637 cmp(ch1, ch2); 4638 br(NE, BMSKIP); 4639 BIND(BMMATCH); 4640 sub(result, str2, tmp5); 4641 if (!str2_isL) lsr(result, result, 1); 4642 add(sp, sp, ASIZE); 4643 b(DONE); 4644 4645 BIND(LINEARSTUB); 4646 cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm 4647 br(LT, LINEAR_MEDIUM); 4648 mov(result, zr); 4649 RuntimeAddress stub = NULL; 4650 if (isL) { 4651 stub = 
RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll()); 4652 assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated"); 4653 } else if (str1_isL) { 4654 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul()); 4655 assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated"); 4656 } else { 4657 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu()); 4658 assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated"); 4659 } 4660 trampoline_call(stub); 4661 b(DONE); 4662 } 4663 4664 BIND(LINEARSEARCH); 4665 { 4666 Label DO1, DO2, DO3; 4667 4668 Register str2tmp = tmp2; 4669 Register first = tmp3; 4670 4671 if (icnt1 == -1) 4672 { 4673 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT; 4674 4675 cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2)); 4676 br(LT, DOSHORT); 4677 BIND(LINEAR_MEDIUM); 4678 (this->*str1_load_1chr)(first, Address(str1)); 4679 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift))); 4680 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift); 4681 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4682 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4683 4684 BIND(FIRST_LOOP); 4685 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 4686 cmp(first, ch2); 4687 br(EQ, STR1_LOOP); 4688 BIND(STR2_NEXT); 4689 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4690 br(LE, FIRST_LOOP); 4691 b(NOMATCH); 4692 4693 BIND(STR1_LOOP); 4694 adds(cnt1tmp, cnt1_neg, str1_chr_size); 4695 add(cnt2tmp, cnt2_neg, str2_chr_size); 4696 br(GE, MATCH); 4697 4698 BIND(STR1_NEXT); 4699 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp)); 4700 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 4701 cmp(ch1, ch2); 4702 br(NE, STR2_NEXT); 4703 adds(cnt1tmp, cnt1tmp, str1_chr_size); 4704 add(cnt2tmp, cnt2tmp, str2_chr_size); 4705 br(LT, STR1_NEXT); 4706 b(MATCH); 4707 4708 BIND(DOSHORT); 4709 if (str1_isL == str2_isL) { 4710 cmp(cnt1, 
(u1)2); 4711 br(LT, DO1); 4712 br(GT, DO3); 4713 } 4714 } 4715 4716 if (icnt1 == 4) { 4717 Label CH1_LOOP; 4718 4719 (this->*load_4chr)(ch1, str1); 4720 sub(result_tmp, cnt2, 4); 4721 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4722 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4723 4724 BIND(CH1_LOOP); 4725 (this->*load_4chr)(ch2, Address(str2, cnt2_neg)); 4726 cmp(ch1, ch2); 4727 br(EQ, MATCH); 4728 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4729 br(LE, CH1_LOOP); 4730 b(NOMATCH); 4731 } 4732 4733 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) { 4734 Label CH1_LOOP; 4735 4736 BIND(DO2); 4737 (this->*load_2chr)(ch1, str1); 4738 if (icnt1 == 2) { 4739 sub(result_tmp, cnt2, 2); 4740 } 4741 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4742 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4743 BIND(CH1_LOOP); 4744 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 4745 cmp(ch1, ch2); 4746 br(EQ, MATCH); 4747 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4748 br(LE, CH1_LOOP); 4749 b(NOMATCH); 4750 } 4751 4752 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) { 4753 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP; 4754 4755 BIND(DO3); 4756 (this->*load_2chr)(first, str1); 4757 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size)); 4758 if (icnt1 == 3) { 4759 sub(result_tmp, cnt2, 3); 4760 } 4761 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4762 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4763 BIND(FIRST_LOOP); 4764 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 4765 cmpw(first, ch2); 4766 br(EQ, STR1_LOOP); 4767 BIND(STR2_NEXT); 4768 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4769 br(LE, FIRST_LOOP); 4770 b(NOMATCH); 4771 4772 BIND(STR1_LOOP); 4773 add(cnt2tmp, cnt2_neg, 2*str2_chr_size); 4774 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 4775 cmp(ch1, ch2); 4776 br(NE, STR2_NEXT); 4777 b(MATCH); 4778 } 4779 4780 if (icnt1 == -1 || icnt1 == 1) { 4781 Label CH1_LOOP, 
HAS_ZERO, DO1_SHORT, DO1_LOOP;

      // Pattern is a single character (icnt1 == 1, or runtime-length case
      // falling through with cnt1 == 1).
      BIND(DO1);
      (this->*str1_load_1chr)(ch1, str1);
      cmp(cnt2, (u1)8);
      br(LT, DO1_SHORT);

      // At least 8 bytes of str2 remain: scan a 64-bit word at a time using
      // the SWAR "find zero byte/halfword" trick on (ch1 ^ word).
      sub(result_tmp, cnt2, 8/str2_chr_size);
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

      // Replicate the pattern character into every byte/halfword lane of ch1.
      if (str2_isL) {
        orr(ch1, ch1, ch1, LSL, 8);
      }
      orr(ch1, ch1, ch1, LSL, 16);
      orr(ch1, ch1, ch1, LSL, 32);
      BIND(CH1_LOOP);
      ldr(ch2, Address(str2, cnt2_neg));
      // After the eor, a lane of ch2 is zero exactly where str2 matches the
      // pattern; (x - 0x01..01) & ~(x | 0x7f..7f) sets a lane's top bit iff
      // that lane is zero.
      eor(ch2, ch1, ch2);
      sub(tmp1, ch2, tmp3);
      orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
      bics(tmp1, tmp1, tmp2);
      br(NE, HAS_ZERO);
      adds(cnt2_neg, cnt2_neg, 8);
      br(LT, CH1_LOOP);

      // Tail: re-scan the (possibly overlapping) last full word once.
      cmp(cnt2_neg, (u1)8);
      mov(cnt2_neg, 0);
      br(LT, CH1_LOOP);
      b(NOMATCH);

      BIND(HAS_ZERO);
      // Locate the first matching lane: byte index = clz(rev(mask)) / 8.
      rev(tmp1, tmp1);
      clz(tmp1, tmp1);
      add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
      b(MATCH);

      BIND(DO1_SHORT);
      // Fewer than 8 bytes left: plain one-character-at-a-time loop.
      mov(result_tmp, cnt2);
      lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
      cmpw(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
  mov(result, -1);
  b(DONE);
  BIND(MATCH);
  // Convert the negative byte offset from the end back into a char index.
  add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

// Member-function-pointer types used to select a load (ldrb/ldrh) or
// zero-extend (uxtbw/uxthw) emitter depending on character width.
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

// Find the first occurrence of the 16-bit character 'ch' in the halfword
// string at str1 (cnt1 characters).  result receives the character index,
// or -1 if the character does not occur.  Clobbers str1, cnt1, ch, the
// tmp registers and rscratch1/rscratch2.
void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                      Register ch, Register result,
                      Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  // Replicate ch into all four halfword lanes for word-at-a-time search.
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
  ldr(ch1, Address(str1, cnt1_neg));
  // SWAR zero-halfword detection on (ch ^ word), as in string_indexof above.
  eor(ch1, ch, ch1);
  sub(tmp1, ch1, tmp3);
  orr(tmp2, ch1, 0x7fff7fff7fff7fff);
  bics(tmp1, tmp1, tmp2);
  br(NE, HAS_ZERO);
  adds(cnt1_neg, cnt1_neg, 8);
  br(LT, CH1_LOOP);

  // Tail: re-check the (possibly overlapping) last full word.
  cmp(cnt1_neg, (u1)8);
  mov(cnt1_neg, 0);
  br(LT, CH1_LOOP);
  b(NOMATCH);

  BIND(HAS_ZERO);
  // First matching lane: byte index = clz(rev(mask)) / 8.
  rev(tmp1, tmp1);
  clz(tmp1, tmp1);
  add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
  b(MATCH);

  BIND(DO1_SHORT);
  // Fewer than 4 characters: scalar loop.
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
  ldrh(ch1, Address(str1, cnt1_neg));
  cmpw(ch, ch1);
  br(EQ, MATCH);
  adds(cnt1_neg, cnt1_neg, 2);
  br(LT, DO1_LOOP);
  BIND(NOMATCH);
  mov(result, -1);
  b(DONE);
  BIND(MATCH);
  // Negative byte offset -> character index (halfwords).
  add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}

// Compare strings.
// Compare two strings lexicographically.  result receives the difference
// of the first differing character pair, or, if one string is a prefix of
// the other, the difference of the lengths in characters.
//
// ae encodes the operand encodings (StrIntrinsicNode::LL/LU/UL/UU):
// 'L' = Latin-1 (1 byte/char), 'U' = UTF-16 (2 bytes/char).  Inputs of
// STUB_THRESHOLD characters or more are delegated to the
// compare_long_string_* stubs.  vtmp3 is not used in this path.
void MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  const u1 STUB_THRESHOLD = 64 + 8;
  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // str1 is Latin-1 for LL and LU; str2 is Latin-1 for LL and UL.
  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;  // log2(bytes per character)
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  // vtmpZ is kept zero and zipped with Latin-1 bytes to widen them to
  // UTF-16 halfwords for mixed-encoding comparisons.
  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      // Same encoding and same address: the shared prefix is equal, so
      // result (already holding the length difference) is final.
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, STUB_THRESHOLD);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      // Point both strings at their last word; cnt2 becomes the negative
      // byte offset of the current position from the end.
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      // Load 4 Latin-1 chars from str1 and widen to UTF-16 by zipping
      // with the zeroed vtmpZ.
      ldrs(vtmp, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, STUB_THRESHOLD);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      // Mirror image of the LU case: widen str2's Latin-1 chars instead.
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldrs(vtmp, Address(str2));
      cmp(cnt2, STUB_THRESHOLD);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFFERENCE);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFFERENCE);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFFERENCE);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFFERENCE);
    // rev+clz gives the bit position of the lowest-addressed differing
    // byte; mask it down to a character boundary (8 or 16 bits).
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
  // Long strings: call the pre-generated out-of-line comparison stub for
  // this encoding combination.
  RuntimeAddress stub = NULL;
  switch(ae) {
    case StrIntrinsicNode::LL:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
      break;
    case StrIntrinsicNode::UU:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
      break;
    case StrIntrinsicNode::LU:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
      break;
    case StrIntrinsicNode::UL:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
      break;
    default:
      ShouldNotReachHere();
  }
  assert(stub.target() != NULL, "compare_long_string stub has not been generated");
  trampoline_call(stub);
  b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // arrange code to do most branches while loading and loading next characters
  // while comparing previous
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  // Software-pipelined: (tmp1, cnt1) hold the previous character pair,
  // (tmp2, rscratch1) the current one.
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}
#endif // COMPILER2

// This method checks if provided byte array contains byte with highest bit set.
void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
  // Simple and most common case of aligned small array which is not at the
  // end of memory page is placed here. All other cases are in stub.
  Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
  const uint64_t UPPER_BIT_MASK=0x8080808080808080;
  assert_different_registers(ary1, len, result);

  cmpw(len, 0);
  br(LE, SET_RESULT);
  cmpw(len, 4 * wordSize);
  br(GE, STUB_LONG); // size > 32 then go to stub

  // Check whether [ary1, ary1 + 32) might cross a page boundary; if so,
  // the word-sized loads below could fault, so use the stub instead.
  int shift = 64 - exact_log2(os::vm_page_size());
  lsl(rscratch1, ary1, shift);
  mov(rscratch2, (size_t)(4 * wordSize) << shift);
  adds(rscratch2, rscratch1, rscratch2); // At end of page?
5151 br(CS, STUB); // at the end of page then go to stub 5152 subs(len, len, wordSize); 5153 br(LT, END); 5154 5155 BIND(LOOP); 5156 ldr(rscratch1, Address(post(ary1, wordSize))); 5157 tst(rscratch1, UPPER_BIT_MASK); 5158 br(NE, SET_RESULT); 5159 subs(len, len, wordSize); 5160 br(GE, LOOP); 5161 cmpw(len, -wordSize); 5162 br(EQ, SET_RESULT); 5163 5164 BIND(END); 5165 ldr(result, Address(ary1)); 5166 sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes 5167 lslv(result, result, len); 5168 tst(result, UPPER_BIT_MASK); 5169 b(SET_RESULT); 5170 5171 BIND(STUB); 5172 RuntimeAddress has_neg = RuntimeAddress(StubRoutines::aarch64::has_negatives()); 5173 assert(has_neg.target() != NULL, "has_negatives stub has not been generated"); 5174 trampoline_call(has_neg); 5175 b(DONE); 5176 5177 BIND(STUB_LONG); 5178 RuntimeAddress has_neg_long = RuntimeAddress( 5179 StubRoutines::aarch64::has_negatives_long()); 5180 assert(has_neg_long.target() != NULL, "has_negatives stub has not been generated"); 5181 trampoline_call(has_neg_long); 5182 b(DONE); 5183 5184 BIND(SET_RESULT); 5185 cset(result, NE); // set true or false 5186 5187 BIND(DONE); 5188 } 5189 5190 void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3, 5191 Register tmp4, Register tmp5, Register result, 5192 Register cnt1, int elem_size) { 5193 Label DONE, SAME; 5194 Register tmp1 = rscratch1; 5195 Register tmp2 = rscratch2; 5196 Register cnt2 = tmp2; // cnt2 only used in array length compare 5197 int elem_per_word = wordSize/elem_size; 5198 int log_elem_size = exact_log2(elem_size); 5199 int length_offset = arrayOopDesc::length_offset_in_bytes(); 5200 int base_offset 5201 = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE); 5202 int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 
0 : 16);

  assert(elem_size == 1 || elem_size == 2, "must be char or byte");
  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);

#ifndef PRODUCT
  {
    const char kind = (elem_size == 2) ? 'U' : 'L';
    char comment[64];
    snprintf(comment, sizeof comment, "array_equals%c{", kind);
    BLOCK_COMMENT(comment);
  }
#endif

  // if (a1 == a2)
  //     return true;
  cmpoop(a1, a2); // May have read barriers for a1 and a2.
  br(EQ, SAME);

  if (UseSimpleArrayEquals) {
    Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
    // if (a1 == null || a2 == null)
    //     return false;
    // a1 & a2 == 0 means (some-pointer is null) or
    // (very-rare-or-even-probably-impossible-pointer-values)
    // so, we can save one branch in most cases
    tst(a1, a2);
    mov(result, false);
    br(EQ, A_MIGHT_BE_NULL);
    // if (a1.length != a2.length)
    //      return false;
    bind(A_IS_NOT_NULL);
    ldrw(cnt1, Address(a1, length_offset));
    ldrw(cnt2, Address(a2, length_offset));
    eorw(tmp5, cnt1, cnt2);
    cbnzw(tmp5, DONE);
    lea(a1, Address(a1, base_offset));
    lea(a2, Address(a2, base_offset));
    // Check for short strings, i.e. smaller than wordSize.
    subs(cnt1, cnt1, elem_per_word);
    br(Assembler::LT, SHORT);
    // Main 8 byte comparison loop.
    bind(NEXT_WORD); {
      ldr(tmp1, Address(post(a1, wordSize)));
      ldr(tmp2, Address(post(a2, wordSize)));
      subs(cnt1, cnt1, elem_per_word);
      eor(tmp5, tmp1, tmp2);
      cbnz(tmp5, DONE);
    } br(GT, NEXT_WORD);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
    // length == 4.
    if (log_elem_size > 0)
      lsl(cnt1, cnt1, log_elem_size);
    ldr(tmp3, Address(a1, cnt1));
    ldr(tmp4, Address(a2, cnt1));
    eor(tmp5, tmp3, tmp4);
    cbnz(tmp5, DONE);
    b(SAME);
    bind(A_MIGHT_BE_NULL);
    // in case both a1 and a2 are not-null, proceed with loads
    cbz(a1, DONE);
    cbz(a2, DONE);
    b(A_IS_NOT_NULL);
    bind(SHORT);

    // Tail compares: one word, one halfword, then (byte arrays only) one
    // byte, selected by the remaining-length bits.
    tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
    {
      ldrw(tmp1, Address(post(a1, 4)));
      ldrw(tmp2, Address(post(a2, 4)));
      eorw(tmp5, tmp1, tmp2);
      cbnzw(tmp5, DONE);
    }
    bind(TAIL03);
    tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
    {
      ldrh(tmp3, Address(post(a1, 2)));
      ldrh(tmp4, Address(post(a2, 2)));
      eorw(tmp5, tmp3, tmp4);
      cbnzw(tmp5, DONE);
    }
    bind(TAIL01);
    if (elem_size == 1) { // Only needed when comparing byte arrays.
      tbz(cnt1, 0, SAME); // 0-1 bytes left.
      {
        ldrb(tmp1, a1);
        ldrb(tmp2, a2);
        eorw(tmp5, tmp1, tmp2);
        cbnzw(tmp5, DONE);
      }
    }
  } else {
    Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
        CSET_EQ, LAST_CHECK;
    mov(result, false);
    cbz(a1, DONE);
    ldrw(cnt1, Address(a1, length_offset));
    cbz(a2, DONE);
    ldrw(cnt2, Address(a2, length_offset));
    // on most CPUs a2 is still "locked"(surprisingly) in ldrw and it's
    // faster to perform another branch before comparing a1 and a2
    cmp(cnt1, (u1)elem_per_word);
    br(LE, SHORT); // short or same
    ldr(tmp3, Address(pre(a1, base_offset)));
    subs(zr, cnt1, stubBytesThreshold);
    br(GE, STUB);
    ldr(tmp4, Address(pre(a2, base_offset)));
    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
    cmp(cnt2, cnt1);
    br(NE, DONE);

    // Main 16 byte comparison loop with 2 exits
    bind(NEXT_DWORD); {
      ldr(tmp1, Address(pre(a1, wordSize)));
      ldr(tmp2, Address(pre(a2, wordSize)));
      subs(cnt1, cnt1, 2 * elem_per_word);
      br(LE, TAIL);
      eor(tmp4, tmp3, tmp4);
      cbnz(tmp4, DONE);
      ldr(tmp3, Address(pre(a1, wordSize)));
      ldr(tmp4, Address(pre(a2, wordSize)));
      cmp(cnt1, (u1)elem_per_word);
      br(LE, TAIL2);
      cmp(tmp1, tmp2);
    } br(EQ, NEXT_DWORD);
    b(DONE);

    bind(TAIL);
    // tmp5 holds the negative count of remaining bits; the left shift
    // discards the bytes beyond the arrays before the final compare.
    eor(tmp4, tmp3, tmp4);
    eor(tmp2, tmp1, tmp2);
    lslv(tmp2, tmp2, tmp5);
    orr(tmp5, tmp4, tmp2);
    cmp(tmp5, zr);
    b(CSET_EQ);

    bind(TAIL2);
    eor(tmp2, tmp1, tmp2);
    cbnz(tmp2, DONE);
    b(LAST_CHECK);

    bind(STUB);
    // Long arrays: compare the first word inline, then call the
    // large_array_equals stub.
    ldr(tmp4, Address(pre(a2, base_offset)));
    cmp(cnt2, cnt1);
    br(NE, DONE);
    if (elem_size == 2) { // convert to byte counter
      lsl(cnt1, cnt1, 1);
    }
    eor(tmp5, tmp3, tmp4);
    cbnz(tmp5, DONE);
    RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
    assert(stub.target() != NULL, "array_equals_long stub has not been generated");
    trampoline_call(stub);
    b(DONE);

    bind(EARLY_OUT);
    // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
    // so, if a2 == null => return false(0), else return true, so we can return a2
    mov(result, a2);
    b(DONE);
    bind(SHORT);
    cmp(cnt2, cnt1);
    br(NE, DONE);
    cbz(cnt1, SAME);
    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
    ldr(tmp3, Address(a1, base_offset));
    ldr(tmp4, Address(a2, base_offset));
    bind(LAST_CHECK);
    eor(tmp4, tmp3, tmp4);
    lslv(tmp5, tmp4, tmp5);
    cmp(tmp5, zr);
    bind(CSET_EQ);
    cset(result, EQ);
    b(DONE);
  }

  bind(SAME);
  mov(result, true);
  // That's it.
  bind(DONE);

  BLOCK_COMMENT("} array_equals");
}

// Compare Strings

// For Strings we're passed the address of the first characters in a1
// and a2 and the length in cnt1.
// elem_size is the element size in bytes: either 1 or 2.
// There are two implementations.
// For arrays >= 8 bytes, all
// comparisons (including the final one, which may overlap) are
// performed 8 bytes at a time. For strings < 8 bytes, we compare a
// halfword, then a short, and then a byte.

void MacroAssembler::string_equals(Register a1, Register a2,
                                   Register result, Register cnt1, int elem_size)
{
  Label SAME, DONE, SHORT, NEXT_WORD;
  Register tmp1 = rscratch1;
  Register tmp2 = rscratch2;
  Register cnt2 = tmp2;  // cnt2 only used in array length compare

  assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);

#ifndef PRODUCT
  {
    const char kind = (elem_size == 2) ? 'U' : 'L';
    char comment[64];
    snprintf(comment, sizeof comment, "{string_equals%c", kind);
    BLOCK_COMMENT(comment);
  }
#endif

  mov(result, false);

  // Check for short strings, i.e. smaller than wordSize.
  subs(cnt1, cnt1, wordSize);
  br(Assembler::LT, SHORT);
  // Main 8 byte comparison loop.
  bind(NEXT_WORD); {
    ldr(tmp1, Address(post(a1, wordSize)));
    ldr(tmp2, Address(post(a2, wordSize)));
    subs(cnt1, cnt1, wordSize);
    eor(tmp1, tmp1, tmp2);
    cbnz(tmp1, DONE);
  } br(GT, NEXT_WORD);
  // Last longword.  In the case where length == 4 we compare the
  // same longword twice, but that's still faster than another
  // conditional branch.
  // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
  // length == 4.
  ldr(tmp1, Address(a1, cnt1));
  ldr(tmp2, Address(a2, cnt1));
  eor(tmp2, tmp1, tmp2);
  cbnz(tmp2, DONE);
  b(SAME);

  bind(SHORT);
  Label TAIL03, TAIL01;

  // Tail compares selected by the bits of the remaining byte count:
  // one word, one halfword, then (byte strings only) one byte.
  tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
  {
    ldrw(tmp1, Address(post(a1, 4)));
    ldrw(tmp2, Address(post(a2, 4)));
    eorw(tmp1, tmp1, tmp2);
    cbnzw(tmp1, DONE);
  }
  bind(TAIL03);
  tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
  {
    ldrh(tmp1, Address(post(a1, 2)));
    ldrh(tmp2, Address(post(a2, 2)));
    eorw(tmp1, tmp1, tmp2);
    cbnzw(tmp1, DONE);
  }
  bind(TAIL01);
  if (elem_size == 1) { // Only needed when comparing 1-byte elements
    tbz(cnt1, 0, SAME); // 0-1 bytes left.
    {
      ldrb(tmp1, a1);
      ldrb(tmp2, a2);
      eorw(tmp1, tmp1, tmp2);
      cbnzw(tmp1, DONE);
    }
  }
  // Arrays are equal.
  bind(SAME);
  mov(result, true);

  // That's it.
  bind(DONE);
  BLOCK_COMMENT("} string_equals");
}


// The size of the blocks erased by the zero_blocks stub.  We must
// handle anything smaller than this ourselves in zero_words().
const int MacroAssembler::zero_words_block_size = 8;

// zero_words() is used by C2 ClearArray patterns.  It is as small as
// possible, handling small word counts locally and delegating
// anything larger to the zero_blocks stub.  It is expanded many times
// in compiled code, so it is important to keep it short.

// ptr:   Address of a buffer to be zeroed.
// cnt:   Count in HeapWords.
//
// ptr, cnt, rscratch1, and rscratch2 are clobbered.
5492 void MacroAssembler::zero_words(Register ptr, Register cnt) 5493 { 5494 assert(is_power_of_2(zero_words_block_size), "adjust this"); 5495 assert(ptr == r10 && cnt == r11, "mismatch in register usage"); 5496 5497 BLOCK_COMMENT("zero_words {"); 5498 cmp(cnt, (u1)zero_words_block_size); 5499 Label around; 5500 br(LO, around); 5501 { 5502 RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks()); 5503 assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated"); 5504 if (StubRoutines::aarch64::complete()) { 5505 trampoline_call(zero_blocks); 5506 } else { 5507 bl(zero_blocks); 5508 } 5509 } 5510 bind(around); 5511 for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) { 5512 Label l; 5513 tbz(cnt, exact_log2(i), l); 5514 for (int j = 0; j < i; j += 2) { 5515 stp(zr, zr, post(ptr, 16)); 5516 } 5517 bind(l); 5518 } 5519 { 5520 Label l; 5521 tbz(cnt, 0, l); 5522 str(zr, Address(ptr)); 5523 bind(l); 5524 } 5525 BLOCK_COMMENT("} zero_words"); 5526 } 5527 5528 // base: Address of a buffer to be zeroed, 8 bytes aligned. 5529 // cnt: Immediate count in HeapWords. 
// Arrays of up to this many bytes are zeroed with fully unrolled
// straight-line stores; anything larger uses the unrolled loop below.
#define SmallArraySize (18 * BytesPerLong)

// Zero exactly cnt (a compile-time constant) 64-bit words starting at
// base.  base is clobbered only via rscratch1/rscratch2; the caller's
// base register itself is preserved.
void MacroAssembler::zero_words(Register base, u_int64_t cnt)
{
  BLOCK_COMMENT("zero_words {");
  int i = cnt & 1;  // store any odd word to start
  if (i) str(zr, Address(base));

  if (cnt <= SmallArraySize / BytesPerLong) {
    // Small case: emit straight-line stp stores, no loop at all.
    for (; i < (int)cnt; i += 2)
      stp(zr, zr, Address(base, i * wordSize));
  } else {
    const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
    // Peel off the remainder so the loop handles an exact multiple of
    // 2 * unroll words.
    int remainder = cnt % (2 * unroll);
    for (; i < remainder; i += 2)
      stp(zr, zr, Address(base, i * wordSize));

    Label loop;
    Register cnt_reg = rscratch1;
    Register loop_base = rscratch2;
    cnt = cnt - remainder;
    mov(cnt_reg, cnt);
    // adjust base and prebias by -2 * wordSize so we can pre-increment
    add(loop_base, base, (remainder - 2) * wordSize);
    bind(loop);
    sub(cnt_reg, cnt_reg, 2 * unroll);
    // unroll - 1 plain stores at positive offsets, then one
    // pre-indexed store that also advances loop_base by a full
    // iteration's worth (2 * unroll words).
    for (i = 1; i < unroll; i++)
      stp(zr, zr, Address(loop_base, 2 * i * wordSize));
    stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
    cbnz(cnt_reg, loop);
  }
  BLOCK_COMMENT("} zero_words");
}

// Zero blocks of memory by using DC ZVA.
//
// Aligns the base address first sufficently for DC ZVA, then uses
// DC ZVA repeatedly for every full block.  cnt is the size to be
// zeroed in HeapWords.  Returns the count of words left to be zeroed
// in cnt.
//
// NOTE: This is intended to be used in the zero_blocks() stub.  If
// you want to use it elsewhere, note that cnt must be >= 2*zva_length.
void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
  Register tmp = rscratch1;
  Register tmp2 = rscratch2;
  int zva_length = VM_Version::zva_length();
  Label initial_table_end, loop_zva;
  Label fini;

  // Base must be 16 byte aligned. If not just return and let caller handle it
  tst(base, 0x0f);
  br(Assembler::NE, fini);
  // Align base with ZVA length.
  neg(tmp, base);
  andr(tmp, tmp, zva_length - 1);

  // tmp: the number of bytes to be filled to align the base with ZVA length.
  add(base, base, tmp);
  sub(cnt, cnt, tmp, Assembler::ASR, 3);  // cnt -= tmp / 8 (bytes -> words)
  // Computed branch into the stp table below: each stp is 4 bytes of
  // code and zeroes 16 bytes, so back up tmp/4 bytes of code (tmp/16
  // instructions) from initial_table_end to zero exactly tmp bytes.
  adr(tmp2, initial_table_end);
  sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
  br(tmp2);

  // Alignment store table; entered part-way through by the branch above.
  // base has already been advanced, hence the negative offsets.
  for (int i = -zva_length + 16; i < 0; i += 16)
    stp(zr, zr, Address(base, i));
  bind(initial_table_end);

  sub(cnt, cnt, zva_length >> 3);
  bind(loop_zva);
  dc(Assembler::ZVA, base);        // zero one whole ZVA block
  subs(cnt, cnt, zva_length >> 3);
  add(base, base, zva_length);
  br(Assembler::GE, loop_zva);
  add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
  bind(fini);
}

// base:   Address of a buffer to be filled, 8 bytes aligned.
// cnt:    Count in 8-byte unit.
// value:  Value to be filled with.
// base will point to the end of the buffer after filling.
void MacroAssembler::fill_words(Register base, Register cnt, Register value)
{
//  Algorithm:
//
//    scratch1 = cnt & 7;
//    cnt -= scratch1;
//    p += scratch1;
//    switch (scratch1) {
//      do {
//        cnt -= 8;
//          p[-8] = v;
//        case 7:
//          p[-7] = v;
//        case 6:
//          p[-6] = v;
//          // ...
//        case 1:
//          p[-1] = v;
//        case 0:
//          p += 8;
//      } while (cnt);
//    }

  assert_different_registers(base, cnt, value, rscratch1, rscratch2);

  Label fini, skip, entry, loop;
  const int unroll = 8; // Number of stp instructions we'll unroll

  cbz(cnt, fini);
  // If base is only 8-byte aligned, store one word to reach 16-byte
  // alignment for the stp stores below.
  tbz(base, 3, skip);
  str(value, Address(post(base, 8)));
  sub(cnt, cnt, 1);
  bind(skip);

  // Duff's-device entry: jump into the middle of the unrolled stp
  // table according to cnt % (2 * unroll).  Each stp is 4 bytes of
  // code and stores 2 words, hence the LSL 1 scaling below.
  andr(rscratch1, cnt, (unroll-1) * 2);
  sub(cnt, cnt, rscratch1);
  add(base, base, rscratch1, Assembler::LSL, 3);
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
  br(rscratch2);

  bind(loop);
  add(base, base, unroll * 16);
  for (int i = -unroll; i < 0; i++)
    stp(value, value, Address(base, i * 16));
  bind(entry);
  subs(cnt, cnt, unroll * 2);
  br(Assembler::GE, loop);

  // Store the final odd word, if any.
  tbz(cnt, 0, fini);
  str(value, Address(post(base, 8)));
  bind(fini);
}

// Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
// java/lang/StringUTF16.compress.
// Encode a UTF-16 char[] (src) into an ISO-8859-1 byte[] (dst).
// Stops at the first char with a non-zero high byte.  On exit,
// result = number of chars successfully encoded (== initial len if
// all were Latin-1), and len = number of chars left unprocessed.
void MacroAssembler::encode_iso_array(Register src, Register dst,
                      Register len, Register result,
                      FloatRegister Vtmp1, FloatRegister Vtmp2,
                      FloatRegister Vtmp3, FloatRegister Vtmp4)
{
    Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
        NEXT_32_START, NEXT_32_PRFM_START;
    Register tmp1 = rscratch1, tmp2 = rscratch2;

    mov(result, len); // Save initial len

#ifndef BUILTIN_SIM
    cmp(len, (u1)8); // handle shortest strings first
    br(LT, LOOP_1);
    cmp(len, (u1)32);
    br(LT, NEXT_8);
    // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
    // to convert chars to bytes
    if (SoftwarePrefetchHintDistance >= 0) {
      // Variant with software prefetch while far enough from the end.
      ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
      subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
      br(LE, NEXT_32_START);
      b(NEXT_32_PRFM_START);
      BIND(NEXT_32_PRFM);
        ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
      BIND(NEXT_32_PRFM_START);
        prfm(Address(src, SoftwarePrefetchHintDistance));
        // OR all input together so any set high byte is detectable.
        orr(v4, T16B, Vtmp1, Vtmp2);
        orr(v5, T16B, Vtmp3, Vtmp4);
        uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);   // low bytes (the encoded output)
        uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
        uzp2(v5, T16B, v4, v5); // high bytes
        umov(tmp2, v5, D, 1);
        fmovd(tmp1, v5);
        orr(tmp1, tmp1, tmp2);
        cbnz(tmp1, LOOP_8);    // any non-Latin-1 char: fall back to 8-char loop
        stpq(Vtmp1, Vtmp3, dst);
        sub(len, len, 32);
        add(dst, dst, 32);
        add(src, src, 64);
        subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
        br(GE, NEXT_32_PRFM);
        cmp(len, (u1)32);
        br(LT, LOOP_8);
      BIND(NEXT_32);
        ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
      BIND(NEXT_32_START);
    } else {
      BIND(NEXT_32);
        ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
    }
    // 32 chars per iteration, no prefetch (or tail of the prefetch path).
    prfm(Address(src, SoftwarePrefetchHintDistance));
    uzp1(v4, T16B, Vtmp1, Vtmp2);          // low bytes (the encoded output)
    uzp1(v5, T16B, Vtmp3, Vtmp4);
    orr(Vtmp1, T16B, Vtmp1, Vtmp2);
    orr(Vtmp3, T16B, Vtmp3, Vtmp4);
    uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
    umov(tmp2, Vtmp1, D, 1);
    fmovd(tmp1, Vtmp1);
    orr(tmp1, tmp1, tmp2);
    cbnz(tmp1, LOOP_8);
    stpq(v4, v5, dst);
    sub(len, len, 32);
    add(dst, dst, 32);
    add(src, src, 64);
    cmp(len, (u1)32);
    br(GE, NEXT_32);
    cbz(len, DONE);

  BIND(LOOP_8);
    cmp(len, (u1)8);
    br(LT, LOOP_1);
  BIND(NEXT_8);
    // 8 chars per iteration.
    ld1(Vtmp1, T8H, src);
    uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
    uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
    fmovd(tmp1, Vtmp3);
    cbnz(tmp1, NEXT_1);              // non-Latin-1 char: 1-char loop finds it
    strd(Vtmp2, dst);

    sub(len, len, 8);
    add(dst, dst, 8);
    add(src, src, 16);
    cmp(len, (u1)8);
    br(GE, NEXT_8);

  BIND(LOOP_1);
#endif
    // Scalar loop: 1 char per iteration; also the sole path on BUILTIN_SIM.
    cbz(len, DONE);
  BIND(NEXT_1);
    ldrh(tmp1, Address(post(src, 2)));
    tst(tmp1, 0xff00);
    br(NE, SET_RESULT);              // stop at first non-Latin-1 char
    strb(tmp1, Address(post(dst, 1)));
    subs(len, len, 1);
    br(GT, NEXT_1);

  BIND(SET_RESULT);
    sub(result, result, len); // Return index where we stopped
                              // Return len == 0 if we processed all
                              // characters
  BIND(DONE);
}


// Inflate byte[] array to char[].
// Zero-extends each byte of src into a 16-bit char in dst; len is the
// byte count.  Large inputs are delegated to the
// large_byte_array_inflate stub when software prefetch is enabled.
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
                                        FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                        Register tmp4) {
  Label big, done, after_init, to_stub;

  assert_different_registers(src, dst, len, tmp4, rscratch1);

  fmovd(vtmp1, zr);       // vtmp1 = 0, the zip1 partner that supplies high bytes
  lsrw(tmp4, len, 3);     // tmp4 = number of full 8-byte groups
  bind(after_init);
  cbnzw(tmp4, big);
  // Short string: less than 8 bytes.
  {
    Label loop, tiny;

    cmpw(len, 4);
    br(LT, tiny);
    // Use SIMD to do 4 bytes.
    ldrs(vtmp2, post(src, 4));
    zip1(vtmp3, T8B, vtmp2, vtmp1);
    subw(len, len, 4);
    strd(vtmp3, post(dst, 8));

    cbzw(len, done);

    // Do the remaining bytes by steam.
    bind(loop);
    ldrb(tmp4, post(src, 1));
    strh(tmp4, post(dst, 2));   // zero-extended byte -> char
    subw(len, len, 1);

    bind(tiny);
    cbnz(len, loop);

    b(done);
  }

  if (SoftwarePrefetchHintDistance >= 0) {
    // Out-of-line call to the large-array stub; returns to after_init
    // to handle whatever tail the stub leaves behind.
    bind(to_stub);
      RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
      assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
      trampoline_call(stub);
      b(after_init);
  }

  // Unpack the bytes 8 at a time.
  bind(big);
  {
    Label loop, around, loop_last, loop_start;

    if (SoftwarePrefetchHintDistance >= 0) {
      const int large_loop_threshold = (64 + 16)/8;
      ldrd(vtmp2, post(src, 8));
      andw(len, len, 7);          // len now holds only the tail byte count
      cmp(tmp4, (u1)large_loop_threshold);
      br(GE, to_stub);
      b(loop_start);

      // 2x software-pipelined loop: 16 bytes inflated per iteration.
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      bind(loop_start);
      subs(tmp4, tmp4, 1);
      br(EQ, loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      ldrd(vtmp3, post(src, 8));
      st1(vtmp2, T8H, post(dst, 16));
      subs(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp3, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      br(NE, loop);
      b(around);
      bind(loop_last);
      // Odd trailing group already loaded into vtmp2.
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      st1(vtmp2, T8H, post(dst, 16));
      bind(around);
      cbz(len, done);
    } else {
      // Simple loop: 8 bytes inflated per iteration.
      andw(len, len, 7);
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      sub(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp2, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      cbnz(tmp4, loop);
    }
  }

  // Do the tail of up to 8 bytes.
  // Re-reads the last 8 source bytes and re-stores the last 16 dst
  // bytes, overlapping work already done (safe since len >= 8 here).
  add(src, src, len);
  ldrd(vtmp3, Address(src, -8));
  add(dst, dst, len, ext::uxtw, 1);
  zip1(vtmp3, T16B, vtmp3, vtmp1);
  strq(vtmp3, Address(dst, -16));

  bind(done);
}

// Compress char[] array to byte[].
// Compress a char[] (src) into a byte[] (dst), failing if any char is
// not Latin-1.  Returns (in result) the number of chars compressed if
// all of them were Latin-1, or 0 otherwise: encode_iso_array leaves
// len == 0 exactly when every char was processed, so csel picks
// result on EQ and zr otherwise.
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
                                         FloatRegister tmp1Reg, FloatRegister tmp2Reg,
                                         FloatRegister tmp3Reg, FloatRegister tmp4Reg,
                                         Register result) {
  encode_iso_array(src, dst, len, result,
                   tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
  cmp(len, zr);
  csel(result, result, zr, EQ);
}

// get_thread() can be called anywhere inside generated code so we
// need to save whatever non-callee save context might get clobbered
// by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
// the call setup code.
//
// aarch64_get_thread_helper() clobbers only r0, r1, and flags.
//
void MacroAssembler::get_thread(Register dst) {
  // Save r0, r1 and lr, except for whichever of them is dst itself.
  RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
  push(saved_regs, sp);

  mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
  blrt(lr, 1, 0, 1);
  // Helper returns the JavaThread* in c_rarg0 (r0).
  if (dst != c_rarg0) {
    mov(dst, c_rarg0);
  }

  pop(saved_regs, sp);
}

// C2 compiled method's prolog code
// Moved here from aarch64.ad to support Valhalla code belows
void MacroAssembler::verified_entry(Compile* C, int sp_inc) {

  // n.b. frame size includes space for return pc and rfp
  const long framesize = C->frame_size_in_bytes();
  assert(framesize % (2 * wordSize) == 0, "must preserve 2 * wordSize alignment");

  // insert a nop at the start of the prolog so we can patch in a
  // branch if we need to invalidate the method later
  nop();

  int bangsize = C->bang_size_in_bytes();
  if (C->need_stack_bang(bangsize) && UseStackBanging)
    generate_stack_overflow_check(bangsize);

  build_frame(framesize);

  if (NotifySimulator) {
    notify(Assembler::method_entry);
  }

  if (VerifyStackAtCalls) {
    Unimplemented();
  }
}


// DMS TODO: Need extra eyes to bring code below to good shape.
//
// Emit the entry-point code that unpacks value-type (inline-type)
// arguments from the unscalarized calling convention into the
// scalarized one expected by the compiled body, then falls through to
// verified_entry().  If receiver_only is true, only the receiver is
// unpacked (all other args are assumed already scalarized).
void MacroAssembler::unpack_value_args(Compile* C, bool receiver_only) {

  assert(C->has_scalarized_args(), "value type argument scalarization is disabled");
  Method* method = C->method()->get_Method();
  const GrowableArray<SigEntry>* sig_cc = method->adapter()->get_sig_cc();
  assert(sig_cc != NULL, "must have scalarized signature");

  // Get unscalarized calling convention
  BasicType* sig_bt = NEW_RESOURCE_ARRAY(BasicType, sig_cc->length());
  int args_passed = 0;
  if (!method->is_static()) {
    sig_bt[args_passed++] = T_OBJECT;
  }
  if (!receiver_only) {
    // Full signature: one slot per argument, plus a T_VOID filler
    // after each two-slot (long/double) type.
    for (SignatureStream ss(method->signature()); !ss.at_return_type(); ss.next()) {
      BasicType bt = ss.type();
      sig_bt[args_passed++] = bt;
      if (type2size[bt] == 2) {
        sig_bt[args_passed++] = T_VOID;
      }
    }
  } else {
    // Only unpack the receiver, all other arguments are already scalarized
    InstanceKlass* holder = method->method_holder();
    // rec_len = number of scalarized-signature entries occupied by the
    // receiver (1 if it is not a value class).
    int rec_len = holder->is_value() ? ValueKlass::cast(holder)->extended_sig()->length() : 1;
    // Copy scalarized signature but skip receiver, value type delimiters and reserved entries
    for (int i = 0; i < sig_cc->length(); i++) {
      if (!SigEntry::is_reserved_entry(sig_cc, i)) {
        if (SigEntry::skip_value_delimiters(sig_cc, i) && rec_len <= 0) {
          sig_bt[args_passed++] = sig_cc->at(i)._bt;
        }
        rec_len--;
      }
    }
  }

  VMRegPair* regs = NEW_RESOURCE_ARRAY(VMRegPair, args_passed);
  int args_on_stack = SharedRuntime::java_calling_convention(sig_bt, regs, args_passed, false);

  // Get scalarized calling convention
  // (sig_bt is reused as the output buffer for the scalarized types)
  int args_passed_cc = SigEntry::fill_sig_bt(sig_cc, sig_bt);
  VMRegPair* regs_cc = NEW_RESOURCE_ARRAY(VMRegPair, sig_cc->length());
  int args_on_stack_cc = SharedRuntime::java_calling_convention(sig_bt, regs_cc, args_passed_cc, false);

  // Check if we need to extend the stack for unpacking
  int sp_inc = (args_on_stack_cc - args_on_stack) * VMRegImpl::stack_slot_size;
  if (sp_inc > 0) {
    // Save the return address, adjust the stack (make sure it is properly
    // 16-byte aligned) and copy the return address to the new top of the stack.
    // pop(r13);
    sp_inc = align_up(sp_inc, StackAlignmentInBytes);
    // DMS CHECK: subptr(rsp, sp_inc);
    sub(sp, sp, sp_inc);
    // push(r13);
  } else {
    // The scalarized calling convention needs less stack space than the unscalarized one.
    // No need to extend the stack, the caller will take care of these adjustments.
    sp_inc = 0;
  }

  // Initialize register/stack slot states (make all writable)
  int max_stack = MAX2(args_on_stack + sp_inc/VMRegImpl::stack_slot_size, args_on_stack_cc);
  int max_reg = VMRegImpl::stack2reg(max_stack)->value();
  RegState* reg_state = NEW_RESOURCE_ARRAY(RegState, max_reg);
  for (int i = 0; i < max_reg; ++i) {
    reg_state[i] = reg_writable;
  }
  // Set all source registers/stack slots to readonly to prevent accidental overwriting
  for (int i = 0; i < args_passed; ++i) {
    VMReg reg = regs[i].first();
    if (!reg->is_valid()) continue;
    if (reg->is_stack()) {
      // Update source stack location by adding stack increment
      reg = VMRegImpl::stack2reg(reg->reg2stack() + sp_inc/VMRegImpl::stack_slot_size);
      regs[i] = reg;
    }
    assert(reg->value() >= 0 && reg->value() < max_reg, "reg value out of bounds");
    reg_state[reg->value()] = reg_readonly;
  }


  // Emit code for unpacking value type arguments
  // We try multiple times and eventually start spilling to resolve (circular) dependencies
  bool done = false;
  for (int i = 0; i < 2 * args_passed_cc && !done; ++i) {
    done = true;
    bool spill = (i > args_passed_cc); // Start spilling?
    // Iterate over all arguments (in reverse)
    for (int from_index = args_passed - 1, to_index = args_passed_cc - 1, sig_index = sig_cc->length() - 1; sig_index >= 0; sig_index--) {
      if (SigEntry::is_reserved_entry(sig_cc, sig_index)) {
        to_index--; // Skip reserved entry
      } else {
        assert(from_index >= 0, "index out of bounds");
        VMReg reg = regs[from_index].first();
        if (spill && reg->is_valid() && reg_state[reg->value()] == reg_readonly) {
          // Spill argument to be able to write the source and resolve circular dependencies
          VMReg spill_reg = r14->as_VMReg();
          bool res = move_helper(reg, spill_reg, T_DOUBLE, reg_state, sp_inc);
          assert(res, "Spilling should not fail");
          // Set spill_reg as new source and update state
          reg = spill_reg;
          regs[from_index].set1(reg);
          reg_state[reg->value()] = reg_readonly;
          spill = false; // Do not spill again in this round
        }
        BasicType bt = sig_cc->at(sig_index)._bt;
        if (SigEntry::skip_value_delimiters(sig_cc, sig_index)) {
          // Plain (non-value) argument: a simple register/slot move.
          assert(to_index >= 0, "index out of bounds");
          done &= move_helper(reg, regs_cc[to_index].first(), bt, reg_state, sp_inc);
          to_index--;
        } else if (!receiver_only || (from_index == 0 && bt == T_VOID)) {
          // End-delimiter of a value type: unpack its fields.
          done &= unpack_value_helper(sig_cc, sig_index, reg, regs_cc, to_index, reg_state, sp_inc);
        } else {
          continue;
        }
        from_index--;
      }
    }
  }
  guarantee(done, "Could not resolve circular dependency when unpacking value type arguments");

  // Emit code for verified entry and save increment for stack repair on return
  verified_entry(C, sp_inc);
}

// Emit a single move from 'from' to 'to' (register or stack slot),
// respecting reg_state: returns false (and emits nothing) if the
// destination is still readonly, i.e. another pending move must read
// it first.  On success marks 'from' writable and 'to' written.
// ret_off is the SP offset of the return address, asserted against.
bool MacroAssembler::move_helper(VMReg from, VMReg to, BasicType bt, RegState reg_state[], int ret_off) {
  if (reg_state[to->value()] == reg_written) {
    return true; // Already written
  }
  if (from != to && bt != T_VOID) {
    if (reg_state[to->value()] == reg_readonly) {
      return false; // Not yet writable
    }
    if (from->is_reg()) {
      if (to->is_reg()) {
        mov(to->as_Register(), from->as_Register());
      } else {
        // + wordSize skips over the saved return address on the stack.
        int st_off = to->reg2stack() * VMRegImpl::stack_slot_size + wordSize;
        assert(st_off != ret_off, "overwriting return address at %d", st_off);
        Address to_addr = Address(sp, st_off);
        str(from->as_Register(), to_addr);
      }
    } else {
      Address from_addr = Address(sp, from->reg2stack() * VMRegImpl::stack_slot_size + wordSize);
      if (to->is_reg()) {
        ldr(to->as_Register(), from_addr);
      } else {
        // Stack-to-stack move goes through rscratch1.
        int st_off = to->reg2stack() * VMRegImpl::stack_slot_size + wordSize;
        assert(st_off != ret_off, "overwriting return address at %d", st_off);
        ldr(rscratch1, from_addr);
        str(rscratch1, Address(sp, st_off));
      }
    }
  }
  // Update register states
  reg_state[from->value()] = reg_writable;
  reg_state[to->value()] = reg_written;
  return true;
}

// Emit code that loads the fields of one value-type argument (whose
// oop is in 'from') into their scalarized destinations in regs_to.
// sig_index points at the value type's end delimiter (T_VOID) and is
// walked backwards; sig_index and to_index are updated in place.
// Returns false if some destination was not yet writable; the caller
// retries on a later round.
bool MacroAssembler::unpack_value_helper(const GrowableArray<SigEntry>* sig, int& sig_index, VMReg from, VMRegPair* regs_to, int& to_index, RegState reg_state[], int ret_off) {
  Register fromReg = from->is_reg() ? from->as_Register() : noreg;
  assert(sig->at(sig_index)._bt == T_VOID, "should be at end delimiter");

  // vt tracks value-type nesting depth; the loop ends when the
  // matching start delimiter (T_VALUETYPE) brings it back to 0.
  int vt = 1;
  bool done = true;
  bool mark_done = true;
  do {
    sig_index--;
    BasicType bt = sig->at(sig_index)._bt;
    if (bt == T_VALUETYPE) {
      vt--;
    } else if (bt == T_VOID && sig->at(sig_index-1)._bt != T_LONG && sig->at(sig_index-1)._bt != T_DOUBLE) {
      // A T_VOID not preceded by long/double is a nested end delimiter.
      vt++;
    } else if (SigEntry::is_reserved_entry(sig, sig_index)) {
      to_index--; // Ignore this
    } else {

      assert(to_index >= 0, "invalid to_index");
      VMRegPair pair_to = regs_to[to_index--];
      VMReg r_1 = pair_to.first();
      VMReg r_2 = pair_to.second();

      if (bt == T_VOID) continue;   // long/double filler slot

      int idx = (int) r_1->value();
      if (reg_state[idx] == reg_readonly) {
        if (idx != from->value()) {
          mark_done = false;     // someone else still has to read idx
        }
        done = false;            // this field must be retried later
        continue;
      } else if (reg_state[idx] == reg_written) {
        continue;
      } else {
        assert(reg_state[idx] == reg_writable, "must be writable");
        reg_state[idx] = reg_written;
      }

      if (fromReg == noreg) {
        // Source oop lives on the stack: load it (once) into r10.
        int st_off = from->reg2stack() * VMRegImpl::stack_slot_size + wordSize;
        ldr(r10, Address(sp, st_off));
        fromReg = r10;
      }

      int off = sig->at(sig_index)._offset;
      assert(off > 0, "offset in object should be positive");

      Address fromAddr = Address(fromReg, off);

      if (r_1->is_stack()) {
        // Convert stack slot to an SP offset (+ wordSize to account for return address )
        // NOTE(review): unlike move_helper, st_off here does NOT add
        // wordSize despite the comment above — confirm intended.
        int st_off = r_1->reg2stack() * VMRegImpl::stack_slot_size;
        if (!r_2->is_valid()) {
          // sign extend???
          ldrsw(rscratch2, fromAddr);
          str(rscratch2, Address(sp, st_off));
        } else {
          ldr(rscratch2, fromAddr);
          str(rscratch2, Address(sp, st_off));
        }
      } else if (r_1->is_Register()) {  // Register argument
        Register r = r_1->as_Register();
        if (r_2->is_valid()) {
          ldr(r, fromAddr);     // two slots: full 64-bit load
        } else {
          ldrw(r, fromAddr);    // one slot: 32-bit load
        }
      } else {
        // Float register argument: 32-bit or 64-bit field load.
        if (!r_2->is_valid()) {
          ldrs(r_1->as_FloatRegister(), fromAddr);
        } else {
          ldrd(r_1->as_FloatRegister(), fromAddr);
        }
      }

    }
  } while (vt != 0);

  if (mark_done && reg_state[from->value()] != reg_written) {
    // This is okay because no one else will write to that slot
    reg_state[from->value()] = reg_writable;
  }
  return done;
}