1 /* 2 * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 
23 * 24 */ 25 26 #include <sys/types.h> 27 28 #include "precompiled.hpp" 29 #include "jvm.h" 30 #include "asm/assembler.hpp" 31 #include "asm/assembler.inline.hpp" 32 #include "gc/shared/barrierSet.hpp" 33 #include "gc/shared/cardTable.hpp" 34 #include "gc/shared/barrierSetAssembler.hpp" 35 #include "gc/shared/cardTableBarrierSet.hpp" 36 #include "interpreter/interpreter.hpp" 37 #include "compiler/disassembler.hpp" 38 #include "memory/resourceArea.hpp" 39 #include "nativeInst_aarch64.hpp" 40 #include "oops/accessDecorators.hpp" 41 #include "oops/compressedOops.inline.hpp" 42 #include "oops/klass.inline.hpp" 43 #include "oops/oop.hpp" 44 #include "opto/compile.hpp" 45 #include "opto/intrinsicnode.hpp" 46 #include "opto/node.hpp" 47 #include "runtime/biasedLocking.hpp" 48 #include "runtime/icache.hpp" 49 #include "runtime/interfaceSupport.inline.hpp" 50 #include "runtime/jniHandles.inline.hpp" 51 #include "runtime/sharedRuntime.hpp" 52 #include "runtime/thread.hpp" 53 54 #ifdef PRODUCT 55 #define BLOCK_COMMENT(str) /* nothing */ 56 #define STOP(error) stop(error) 57 #else 58 #define BLOCK_COMMENT(str) block_comment(str) 59 #define STOP(error) block_comment(error); stop(error) 60 #endif 61 62 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 63 64 // Patch any kind of instruction; there may be several instructions. 65 // Return the total length (in bytes) of the instructions. 
66 int MacroAssembler::pd_patch_instruction_size(address branch, address target) { 67 int instructions = 1; 68 assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant"); 69 long offset = (target - branch) >> 2; 70 unsigned insn = *(unsigned*)branch; 71 if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) { 72 // Load register (literal) 73 Instruction_aarch64::spatch(branch, 23, 5, offset); 74 } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) { 75 // Unconditional branch (immediate) 76 Instruction_aarch64::spatch(branch, 25, 0, offset); 77 } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) { 78 // Conditional branch (immediate) 79 Instruction_aarch64::spatch(branch, 23, 5, offset); 80 } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) { 81 // Compare & branch (immediate) 82 Instruction_aarch64::spatch(branch, 23, 5, offset); 83 } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) { 84 // Test & branch (immediate) 85 Instruction_aarch64::spatch(branch, 18, 5, offset); 86 } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) { 87 // PC-rel. addressing 88 offset = target-branch; 89 int shift = Instruction_aarch64::extract(insn, 31, 31); 90 if (shift) { 91 u_int64_t dest = (u_int64_t)target; 92 uint64_t pc_page = (uint64_t)branch >> 12; 93 uint64_t adr_page = (uint64_t)target >> 12; 94 unsigned offset_lo = dest & 0xfff; 95 offset = adr_page - pc_page; 96 97 // We handle 4 types of PC relative addressing 98 // 1 - adrp Rx, target_page 99 // ldr/str Ry, [Rx, #offset_in_page] 100 // 2 - adrp Rx, target_page 101 // add Ry, Rx, #offset_in_page 102 // 3 - adrp Rx, target_page (page aligned reloc, offset == 0) 103 // movk Rx, #imm16<<32 104 // 4 - adrp Rx, target_page (page aligned reloc, offset == 0) 105 // In the first 3 cases we must check that Rx is the same in the adrp and the 106 // subsequent ldr/str, add or movk instruction. 
Otherwise we could accidentally end 107 // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened 108 // to be followed by a random unrelated ldr/str, add or movk instruction. 109 // 110 unsigned insn2 = ((unsigned*)branch)[1]; 111 if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 && 112 Instruction_aarch64::extract(insn, 4, 0) == 113 Instruction_aarch64::extract(insn2, 9, 5)) { 114 // Load/store register (unsigned immediate) 115 unsigned size = Instruction_aarch64::extract(insn2, 31, 30); 116 Instruction_aarch64::patch(branch + sizeof (unsigned), 117 21, 10, offset_lo >> size); 118 guarantee(((dest >> size) << size) == dest, "misaligned target"); 119 instructions = 2; 120 } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 && 121 Instruction_aarch64::extract(insn, 4, 0) == 122 Instruction_aarch64::extract(insn2, 4, 0)) { 123 // add (immediate) 124 Instruction_aarch64::patch(branch + sizeof (unsigned), 125 21, 10, offset_lo); 126 instructions = 2; 127 } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 && 128 Instruction_aarch64::extract(insn, 4, 0) == 129 Instruction_aarch64::extract(insn2, 4, 0)) { 130 // movk #imm16<<32 131 Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32); 132 long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L); 133 long pc_page = (long)branch >> 12; 134 long adr_page = (long)dest >> 12; 135 offset = adr_page - pc_page; 136 instructions = 2; 137 } 138 } 139 int offset_lo = offset & 3; 140 offset >>= 2; 141 Instruction_aarch64::spatch(branch, 23, 5, offset); 142 Instruction_aarch64::patch(branch, 30, 29, offset_lo); 143 } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) { 144 u_int64_t dest = (u_int64_t)target; 145 // Move wide constant 146 assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch"); 147 assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch"); 148 
Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff); 149 Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff); 150 Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff); 151 assert(target_addr_for_insn(branch) == target, "should be"); 152 instructions = 3; 153 } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 && 154 Instruction_aarch64::extract(insn, 4, 0) == 0b11111) { 155 // nothing to do 156 assert(target == 0, "did not expect to relocate target for polling page load"); 157 } else { 158 ShouldNotReachHere(); 159 } 160 return instructions * NativeInstruction::instruction_size; 161 } 162 163 int MacroAssembler::patch_oop(address insn_addr, address o) { 164 int instructions; 165 unsigned insn = *(unsigned*)insn_addr; 166 assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch"); 167 168 // OOPs are either narrow (32 bits) or wide (48 bits). We encode 169 // narrow OOPs by setting the upper 16 bits in the first 170 // instruction. 171 if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) { 172 // Move narrow OOP 173 narrowOop n = CompressedOops::encode((oop)o); 174 Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16); 175 Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff); 176 instructions = 2; 177 } else { 178 // Move wide OOP 179 assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch"); 180 uintptr_t dest = (uintptr_t)o; 181 Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff); 182 Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff); 183 Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff); 184 instructions = 3; 185 } 186 return instructions * NativeInstruction::instruction_size; 187 } 188 189 int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) { 190 // Metatdata pointers are either narrow (32 bits) or wide (48 bits). 
191 // We encode narrow ones by setting the upper 16 bits in the first 192 // instruction. 193 NativeInstruction *insn = nativeInstruction_at(insn_addr); 194 assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 && 195 nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch"); 196 197 Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16); 198 Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff); 199 return 2 * NativeInstruction::instruction_size; 200 } 201 202 address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) { 203 long offset = 0; 204 if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) { 205 // Load register (literal) 206 offset = Instruction_aarch64::sextract(insn, 23, 5); 207 return address(((uint64_t)insn_addr + (offset << 2))); 208 } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) { 209 // Unconditional branch (immediate) 210 offset = Instruction_aarch64::sextract(insn, 25, 0); 211 } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) { 212 // Conditional branch (immediate) 213 offset = Instruction_aarch64::sextract(insn, 23, 5); 214 } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) { 215 // Compare & branch (immediate) 216 offset = Instruction_aarch64::sextract(insn, 23, 5); 217 } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) { 218 // Test & branch (immediate) 219 offset = Instruction_aarch64::sextract(insn, 18, 5); 220 } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) { 221 // PC-rel. addressing 222 offset = Instruction_aarch64::extract(insn, 30, 29); 223 offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2; 224 int shift = Instruction_aarch64::extract(insn, 31, 31) ? 
12 : 0; 225 if (shift) { 226 offset <<= shift; 227 uint64_t target_page = ((uint64_t)insn_addr) + offset; 228 target_page &= ((uint64_t)-1) << shift; 229 // Return the target address for the following sequences 230 // 1 - adrp Rx, target_page 231 // ldr/str Ry, [Rx, #offset_in_page] 232 // 2 - adrp Rx, target_page 233 // add Ry, Rx, #offset_in_page 234 // 3 - adrp Rx, target_page (page aligned reloc, offset == 0) 235 // movk Rx, #imm12<<32 236 // 4 - adrp Rx, target_page (page aligned reloc, offset == 0) 237 // 238 // In the first two cases we check that the register is the same and 239 // return the target_page + the offset within the page. 240 // Otherwise we assume it is a page aligned relocation and return 241 // the target page only. 242 // 243 unsigned insn2 = ((unsigned*)insn_addr)[1]; 244 if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 && 245 Instruction_aarch64::extract(insn, 4, 0) == 246 Instruction_aarch64::extract(insn2, 9, 5)) { 247 // Load/store register (unsigned immediate) 248 unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10); 249 unsigned int size = Instruction_aarch64::extract(insn2, 31, 30); 250 return address(target_page + (byte_offset << size)); 251 } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 && 252 Instruction_aarch64::extract(insn, 4, 0) == 253 Instruction_aarch64::extract(insn2, 4, 0)) { 254 // add (immediate) 255 unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10); 256 return address(target_page + byte_offset); 257 } else { 258 if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 && 259 Instruction_aarch64::extract(insn, 4, 0) == 260 Instruction_aarch64::extract(insn2, 4, 0)) { 261 target_page = (target_page & 0xffffffff) | 262 ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32); 263 } 264 return (address)target_page; 265 } 266 } else { 267 ShouldNotReachHere(); 268 } 269 } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) { 
270 u_int32_t *insns = (u_int32_t *)insn_addr; 271 // Move wide constant: movz, movk, movk. See movptr(). 272 assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch"); 273 assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch"); 274 return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5)) 275 + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16) 276 + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32)); 277 } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 && 278 Instruction_aarch64::extract(insn, 4, 0) == 0b11111) { 279 return 0; 280 } else { 281 ShouldNotReachHere(); 282 } 283 return address(((uint64_t)insn_addr + (offset << 2))); 284 } 285 286 void MacroAssembler::serialize_memory(Register thread, Register tmp) { 287 dsb(Assembler::SY); 288 } 289 290 void MacroAssembler::safepoint_poll(Label& slow_path) { 291 if (SafepointMechanism::uses_thread_local_poll()) { 292 ldr(rscratch1, Address(rthread, Thread::polling_page_offset())); 293 tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path); 294 } else { 295 unsigned long offset; 296 adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset); 297 ldrw(rscratch1, Address(rscratch1, offset)); 298 assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code"); 299 cbnz(rscratch1, slow_path); 300 } 301 } 302 303 // Just like safepoint_poll, but use an acquiring load for thread- 304 // local polling. 305 // 306 // We need an acquire here to ensure that any subsequent load of the 307 // global SafepointSynchronize::_state flag is ordered after this load 308 // of the local Thread::_polling page. We don't want this poll to 309 // return false (i.e. not safepointing) and a later poll of the global 310 // SafepointSynchronize::_state spuriously to return true. 
311 // 312 // This is to avoid a race when we're in a native->Java transition 313 // racing the code which wakes up from a safepoint. 314 // 315 void MacroAssembler::safepoint_poll_acquire(Label& slow_path) { 316 if (SafepointMechanism::uses_thread_local_poll()) { 317 lea(rscratch1, Address(rthread, Thread::polling_page_offset())); 318 ldar(rscratch1, rscratch1); 319 tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path); 320 } else { 321 safepoint_poll(slow_path); 322 } 323 } 324 325 void MacroAssembler::reset_last_Java_frame(bool clear_fp) { 326 // we must set sp to zero to clear frame 327 str(zr, Address(rthread, JavaThread::last_Java_sp_offset())); 328 329 // must clear fp, so that compiled frames are not confused; it is 330 // possible that we need it only for debugging 331 if (clear_fp) { 332 str(zr, Address(rthread, JavaThread::last_Java_fp_offset())); 333 } 334 335 // Always clear the pc because it could have been set by make_walkable() 336 str(zr, Address(rthread, JavaThread::last_Java_pc_offset())); 337 } 338 339 // Calls to C land 340 // 341 // When entering C land, the rfp, & resp of the last Java frame have to be recorded 342 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp 343 // has to be reset to 0. This is required to allow proper stack traversal. 
344 void MacroAssembler::set_last_Java_frame(Register last_java_sp, 345 Register last_java_fp, 346 Register last_java_pc, 347 Register scratch) { 348 349 if (last_java_pc->is_valid()) { 350 str(last_java_pc, Address(rthread, 351 JavaThread::frame_anchor_offset() 352 + JavaFrameAnchor::last_Java_pc_offset())); 353 } 354 355 // determine last_java_sp register 356 if (last_java_sp == sp) { 357 mov(scratch, sp); 358 last_java_sp = scratch; 359 } else if (!last_java_sp->is_valid()) { 360 last_java_sp = esp; 361 } 362 363 str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset())); 364 365 // last_java_fp is optional 366 if (last_java_fp->is_valid()) { 367 str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset())); 368 } 369 } 370 371 void MacroAssembler::set_last_Java_frame(Register last_java_sp, 372 Register last_java_fp, 373 address last_java_pc, 374 Register scratch) { 375 assert(last_java_pc != NULL, "must provide a valid PC"); 376 377 adr(scratch, last_java_pc); 378 str(scratch, Address(rthread, 379 JavaThread::frame_anchor_offset() 380 + JavaFrameAnchor::last_Java_pc_offset())); 381 382 set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch); 383 } 384 385 void MacroAssembler::set_last_Java_frame(Register last_java_sp, 386 Register last_java_fp, 387 Label &L, 388 Register scratch) { 389 if (L.is_bound()) { 390 set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch); 391 } else { 392 InstructionMark im(this); 393 L.add_patch_at(code(), locator()); 394 set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, scratch); 395 } 396 } 397 398 void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) { 399 assert(ReservedCodeCacheSize < 4*G, "branch out of range"); 400 assert(CodeCache::find_blob(entry.target()) != NULL, 401 "destination of far call not found in code cache"); 402 if (far_branches()) { 403 unsigned long offset; 404 // We can use ADRP here because we know that the total size of 
405 // the code cache cannot exceed 2Gb. 406 adrp(tmp, entry, offset); 407 add(tmp, tmp, offset); 408 if (cbuf) cbuf->set_insts_mark(); 409 blr(tmp); 410 } else { 411 if (cbuf) cbuf->set_insts_mark(); 412 bl(entry); 413 } 414 } 415 416 void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) { 417 assert(ReservedCodeCacheSize < 4*G, "branch out of range"); 418 assert(CodeCache::find_blob(entry.target()) != NULL, 419 "destination of far call not found in code cache"); 420 if (far_branches()) { 421 unsigned long offset; 422 // We can use ADRP here because we know that the total size of 423 // the code cache cannot exceed 2Gb. 424 adrp(tmp, entry, offset); 425 add(tmp, tmp, offset); 426 if (cbuf) cbuf->set_insts_mark(); 427 br(tmp); 428 } else { 429 if (cbuf) cbuf->set_insts_mark(); 430 b(entry); 431 } 432 } 433 434 void MacroAssembler::reserved_stack_check() { 435 // testing if reserved zone needs to be enabled 436 Label no_reserved_zone_enabling; 437 438 ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset())); 439 cmp(sp, rscratch1); 440 br(Assembler::LO, no_reserved_zone_enabling); 441 442 enter(); // LR and FP are live. 443 lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone)); 444 mov(c_rarg0, rthread); 445 blr(rscratch1); 446 leave(); 447 448 // We have already removed our own frame. 449 // throw_delayed_StackOverflowError will think that it's been 450 // called by our caller. 
451 lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry())); 452 br(rscratch1); 453 should_not_reach_here(); 454 455 bind(no_reserved_zone_enabling); 456 } 457 458 int MacroAssembler::biased_locking_enter(Register lock_reg, 459 Register obj_reg, 460 Register swap_reg, 461 Register tmp_reg, 462 bool swap_reg_contains_mark, 463 Label& done, 464 Label* slow_case, 465 BiasedLockingCounters* counters) { 466 assert(UseBiasedLocking, "why call this otherwise?"); 467 assert_different_registers(lock_reg, obj_reg, swap_reg); 468 469 if (PrintBiasedLockingStatistics && counters == NULL) 470 counters = BiasedLocking::counters(); 471 472 assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg); 473 assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout"); 474 Address mark_addr (obj_reg, oopDesc::mark_offset_in_bytes()); 475 Address klass_addr (obj_reg, oopDesc::klass_offset_in_bytes()); 476 Address saved_mark_addr(lock_reg, 0); 477 478 // Biased locking 479 // See whether the lock is currently biased toward our thread and 480 // whether the epoch is still valid 481 // Note that the runtime guarantees sufficient alignment of JavaThread 482 // pointers to allow age to be placed into low bits 483 // First check to see whether biasing is even enabled for this object 484 Label cas_label; 485 int null_check_offset = -1; 486 if (!swap_reg_contains_mark) { 487 null_check_offset = offset(); 488 ldr(swap_reg, mark_addr); 489 } 490 andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place); 491 cmp(tmp_reg, markOopDesc::biased_lock_pattern); 492 br(Assembler::NE, cas_label); 493 // The bias pattern is present in the object's header. Need to check 494 // whether the bias owner and the epoch are both still current. 
495 load_prototype_header(tmp_reg, obj_reg); 496 orr(tmp_reg, tmp_reg, rthread); 497 eor(tmp_reg, swap_reg, tmp_reg); 498 andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place)); 499 if (counters != NULL) { 500 Label around; 501 cbnz(tmp_reg, around); 502 atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2); 503 b(done); 504 bind(around); 505 } else { 506 cbz(tmp_reg, done); 507 } 508 509 Label try_revoke_bias; 510 Label try_rebias; 511 512 // At this point we know that the header has the bias pattern and 513 // that we are not the bias owner in the current epoch. We need to 514 // figure out more details about the state of the header in order to 515 // know what operations can be legally performed on the object's 516 // header. 517 518 // If the low three bits in the xor result aren't clear, that means 519 // the prototype header is no longer biased and we have to revoke 520 // the bias on this object. 521 andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place); 522 cbnz(rscratch1, try_revoke_bias); 523 524 // Biasing is still enabled for this data type. See whether the 525 // epoch of the current bias is still valid, meaning that the epoch 526 // bits of the mark word are equal to the epoch bits of the 527 // prototype header. (Note that the prototype header's epoch bits 528 // only change at a safepoint.) If not, attempt to rebias the object 529 // toward the current thread. Note that we must be absolutely sure 530 // that the current epoch is invalid in order to do this because 531 // otherwise the manipulations it performs on the mark word are 532 // illegal. 533 andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place); 534 cbnz(rscratch1, try_rebias); 535 536 // The epoch of the current bias is still valid but we know nothing 537 // about the owner; it might be set or it might be clear. Try to 538 // acquire the bias of the object using an atomic operation. 
If this 539 // fails we will go in to the runtime to revoke the object's bias. 540 // Note that we first construct the presumed unbiased header so we 541 // don't accidentally blow away another thread's valid bias. 542 { 543 Label here; 544 mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place); 545 andr(swap_reg, swap_reg, rscratch1); 546 orr(tmp_reg, swap_reg, rthread); 547 cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case); 548 // If the biasing toward our thread failed, this means that 549 // another thread succeeded in biasing it toward itself and we 550 // need to revoke that bias. The revocation will occur in the 551 // interpreter runtime in the slow case. 552 bind(here); 553 if (counters != NULL) { 554 atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()), 555 tmp_reg, rscratch1, rscratch2); 556 } 557 } 558 b(done); 559 560 bind(try_rebias); 561 // At this point we know the epoch has expired, meaning that the 562 // current "bias owner", if any, is actually invalid. Under these 563 // circumstances _only_, we are allowed to use the current header's 564 // value as the comparison value when doing the cas to acquire the 565 // bias in the current epoch. In other words, we allow transfer of 566 // the bias from one thread to another directly in this situation. 567 // 568 // FIXME: due to a lack of registers we currently blow away the age 569 // bits in this situation. Should attempt to preserve them. 570 { 571 Label here; 572 load_prototype_header(tmp_reg, obj_reg); 573 orr(tmp_reg, rthread, tmp_reg); 574 cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case); 575 // If the biasing toward our thread failed, then another thread 576 // succeeded in biasing it toward itself and we need to revoke that 577 // bias. The revocation will occur in the runtime in the slow case. 
578 bind(here); 579 if (counters != NULL) { 580 atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()), 581 tmp_reg, rscratch1, rscratch2); 582 } 583 } 584 b(done); 585 586 bind(try_revoke_bias); 587 // The prototype mark in the klass doesn't have the bias bit set any 588 // more, indicating that objects of this data type are not supposed 589 // to be biased any more. We are going to try to reset the mark of 590 // this object to the prototype value and fall through to the 591 // CAS-based locking scheme. Note that if our CAS fails, it means 592 // that another thread raced us for the privilege of revoking the 593 // bias of this particular object, so it's okay to continue in the 594 // normal locking code. 595 // 596 // FIXME: due to a lack of registers we currently blow away the age 597 // bits in this situation. Should attempt to preserve them. 598 { 599 Label here, nope; 600 load_prototype_header(tmp_reg, obj_reg); 601 cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope); 602 bind(here); 603 604 // Fall through to the normal CAS-based lock, because no matter what 605 // the result of the above CAS, some thread must have succeeded in 606 // removing the bias bit from the object's header. 607 if (counters != NULL) { 608 atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg, 609 rscratch1, rscratch2); 610 } 611 bind(nope); 612 } 613 614 bind(cas_label); 615 616 return null_check_offset; 617 } 618 619 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) { 620 assert(UseBiasedLocking, "why call this otherwise?"); 621 622 // Check for biased locking unlock case, which is a no-op 623 // Note: we do not have to check the thread ID for two reasons. 624 // First, the interpreter checks for IllegalMonitorStateException at 625 // a higher level. 
Second, if the bias was revoked while we held the 626 // lock, the object could not be rebiased toward another thread, so 627 // the bias bit would be clear. 628 ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 629 andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 630 cmp(temp_reg, markOopDesc::biased_lock_pattern); 631 br(Assembler::EQ, done); 632 } 633 634 static void pass_arg0(MacroAssembler* masm, Register arg) { 635 if (c_rarg0 != arg ) { 636 masm->mov(c_rarg0, arg); 637 } 638 } 639 640 static void pass_arg1(MacroAssembler* masm, Register arg) { 641 if (c_rarg1 != arg ) { 642 masm->mov(c_rarg1, arg); 643 } 644 } 645 646 static void pass_arg2(MacroAssembler* masm, Register arg) { 647 if (c_rarg2 != arg ) { 648 masm->mov(c_rarg2, arg); 649 } 650 } 651 652 static void pass_arg3(MacroAssembler* masm, Register arg) { 653 if (c_rarg3 != arg ) { 654 masm->mov(c_rarg3, arg); 655 } 656 } 657 658 void MacroAssembler::call_VM_base(Register oop_result, 659 Register java_thread, 660 Register last_java_sp, 661 address entry_point, 662 int number_of_arguments, 663 bool check_exceptions) { 664 // determine java_thread register 665 if (!java_thread->is_valid()) { 666 java_thread = rthread; 667 } 668 669 // determine last_java_sp register 670 if (!last_java_sp->is_valid()) { 671 last_java_sp = esp; 672 } 673 674 // debugging support 675 assert(number_of_arguments >= 0 , "cannot have negative number of arguments"); 676 assert(java_thread == rthread, "unexpected register"); 677 #ifdef ASSERT 678 // TraceBytecodes does not use r12 but saves it over the call, so don't verify 679 // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?"); 680 #endif // ASSERT 681 682 assert(java_thread != oop_result , "cannot use the same register for java_thread & oop_result"); 683 assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp"); 684 685 // push 
java thread (becomes first argument of C function) 686 687 mov(c_rarg0, java_thread); 688 689 // set last Java frame before call 690 assert(last_java_sp != rfp, "can't use rfp"); 691 692 Label l; 693 set_last_Java_frame(last_java_sp, rfp, l, rscratch1); 694 695 // do the call, remove parameters 696 MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l); 697 698 // reset last Java frame 699 // Only interpreter should have to clear fp 700 reset_last_Java_frame(true); 701 702 // C++ interp handles this in the interpreter 703 check_and_handle_popframe(java_thread); 704 check_and_handle_earlyret(java_thread); 705 706 if (check_exceptions) { 707 // check for pending exceptions (java_thread is set upon return) 708 ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset()))); 709 Label ok; 710 cbz(rscratch1, ok); 711 lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry())); 712 br(rscratch1); 713 bind(ok); 714 } 715 716 // get oop result if there is one and reset the value in the thread 717 if (oop_result->is_valid()) { 718 get_vm_result(oop_result, java_thread); 719 } 720 } 721 722 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) { 723 call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions); 724 } 725 726 // Maybe emit a call via a trampoline. If the code cache is small 727 // trampolines won't be emitted. 728 729 address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) { 730 assert(JavaThread::current()->is_Compiler_thread(), "just checking"); 731 assert(entry.rspec().type() == relocInfo::runtime_call_type 732 || entry.rspec().type() == relocInfo::opt_virtual_call_type 733 || entry.rspec().type() == relocInfo::static_call_type 734 || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type"); 735 736 // We need a trampoline if branches are far. 
737 if (far_branches()) { 738 // We don't want to emit a trampoline if C2 is generating dummy 739 // code during its branch shortening phase. 740 CompileTask* task = ciEnv::current()->task(); 741 bool in_scratch_emit_size = 742 (task != NULL && is_c2_compile(task->comp_level()) && 743 Compile::current()->in_scratch_emit_size()); 744 if (!in_scratch_emit_size) { 745 address stub = emit_trampoline_stub(offset(), entry.target()); 746 if (stub == NULL) { 747 return NULL; // CodeCache is full 748 } 749 } 750 } 751 752 if (cbuf) cbuf->set_insts_mark(); 753 relocate(entry.rspec()); 754 if (!far_branches()) { 755 bl(entry.target()); 756 } else { 757 bl(pc()); 758 } 759 // just need to return a non-null address 760 return pc(); 761 } 762 763 764 // Emit a trampoline stub for a call to a target which is too far away. 765 // 766 // code sequences: 767 // 768 // call-site: 769 // branch-and-link to <destination> or <trampoline stub> 770 // 771 // Related trampoline stub for this call site in the stub section: 772 // load the call target from the constant pool 773 // branch (LR still points to the call site above) 774 775 address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset, 776 address dest) { 777 address stub = start_a_stub(Compile::MAX_stubs_size/2); 778 if (stub == NULL) { 779 return NULL; // CodeBuffer::expand failed 780 } 781 782 // Create a trampoline stub relocation which relates this trampoline stub 783 // with the call instruction at insts_call_instruction_offset in the 784 // instructions code-section. 
// Pad with nops so the stub starts at an offset that is a multiple of wordSize.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call
  // - call
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  // The 64-bit destination is emitted as data right behind the two
  // instructions; the assert pins that layout to
  // NativeCallTrampolineStub::data_offset.
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}

// Emit the stub used by static calls: an isb, a metadata load into rmethod
// and an indirect branch through rscratch1.  Both the metadata (NULL) and
// the branch target (0) are emitted as placeholder values here.
void MacroAssembler::emit_static_call_stub() {
  // CompiledDirectStaticCall::set_to_interpreted knows the
  // exact layout of this stub.

  isb();
  mov_metadata(rmethod, (Metadata*)NULL);

  // Jump to the entry point of the i2c stub.
  movptr(rscratch1, 0);
  br(rscratch1);
}

// Normalise a C-style boolean held in x to exactly 0 or 1 (in place).
void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only!
(was bug) 826 tst(x, 0xff); 827 cset(x, Assembler::NE); 828 } 829 830 address MacroAssembler::ic_call(address entry, jint method_index) { 831 RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index); 832 // address const_ptr = long_constant((jlong)Universe::non_oop_word()); 833 // unsigned long offset; 834 // ldr_constant(rscratch2, const_ptr); 835 movptr(rscratch2, (uintptr_t)Universe::non_oop_word()); 836 return trampoline_call(Address(entry, rh)); 837 } 838 839 // Implementation of call_VM versions 840 841 void MacroAssembler::call_VM(Register oop_result, 842 address entry_point, 843 bool check_exceptions) { 844 call_VM_helper(oop_result, entry_point, 0, check_exceptions); 845 } 846 847 void MacroAssembler::call_VM(Register oop_result, 848 address entry_point, 849 Register arg_1, 850 bool check_exceptions) { 851 pass_arg1(this, arg_1); 852 call_VM_helper(oop_result, entry_point, 1, check_exceptions); 853 } 854 855 void MacroAssembler::call_VM(Register oop_result, 856 address entry_point, 857 Register arg_1, 858 Register arg_2, 859 bool check_exceptions) { 860 assert(arg_1 != c_rarg2, "smashed arg"); 861 pass_arg2(this, arg_2); 862 pass_arg1(this, arg_1); 863 call_VM_helper(oop_result, entry_point, 2, check_exceptions); 864 } 865 866 void MacroAssembler::call_VM(Register oop_result, 867 address entry_point, 868 Register arg_1, 869 Register arg_2, 870 Register arg_3, 871 bool check_exceptions) { 872 assert(arg_1 != c_rarg3, "smashed arg"); 873 assert(arg_2 != c_rarg3, "smashed arg"); 874 pass_arg3(this, arg_3); 875 876 assert(arg_1 != c_rarg2, "smashed arg"); 877 pass_arg2(this, arg_2); 878 879 pass_arg1(this, arg_1); 880 call_VM_helper(oop_result, entry_point, 3, check_exceptions); 881 } 882 883 void MacroAssembler::call_VM(Register oop_result, 884 Register last_java_sp, 885 address entry_point, 886 int number_of_arguments, 887 bool check_exceptions) { 888 call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, 
check_exceptions); 889 } 890 891 void MacroAssembler::call_VM(Register oop_result, 892 Register last_java_sp, 893 address entry_point, 894 Register arg_1, 895 bool check_exceptions) { 896 pass_arg1(this, arg_1); 897 call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions); 898 } 899 900 void MacroAssembler::call_VM(Register oop_result, 901 Register last_java_sp, 902 address entry_point, 903 Register arg_1, 904 Register arg_2, 905 bool check_exceptions) { 906 907 assert(arg_1 != c_rarg2, "smashed arg"); 908 pass_arg2(this, arg_2); 909 pass_arg1(this, arg_1); 910 call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions); 911 } 912 913 void MacroAssembler::call_VM(Register oop_result, 914 Register last_java_sp, 915 address entry_point, 916 Register arg_1, 917 Register arg_2, 918 Register arg_3, 919 bool check_exceptions) { 920 assert(arg_1 != c_rarg3, "smashed arg"); 921 assert(arg_2 != c_rarg3, "smashed arg"); 922 pass_arg3(this, arg_3); 923 assert(arg_1 != c_rarg2, "smashed arg"); 924 pass_arg2(this, arg_2); 925 pass_arg1(this, arg_1); 926 call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions); 927 } 928 929 930 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) { 931 ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset())); 932 str(zr, Address(java_thread, JavaThread::vm_result_offset())); 933 verify_oop(oop_result, "broken oop in call_VM_base"); 934 } 935 936 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) { 937 ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset())); 938 str(zr, Address(java_thread, JavaThread::vm_result_2_offset())); 939 } 940 941 void MacroAssembler::align(int modulus) { 942 while (offset() % modulus != 0) nop(); 943 } 944 945 // these are no-ops overridden by InterpreterMacroAssembler 946 947 void MacroAssembler::check_and_handle_earlyret(Register java_thread) { } 948 949 void 
MacroAssembler::check_and_handle_popframe(Register java_thread) { }


// Produce "*delayed_value_addr + offset" either as a constant or in tmp.
//
// If the word at delayed_value_addr is already non-zero when this code is
// generated, the sum is returned as a constant and nothing is emitted.
// Otherwise a load of the word into tmp is emitted (plus an add when
// offset != 0) and tmp is returned.
RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
//
// scan_temp is clobbered.  When return_method is true recv_klass is
// clobbered as well; when it is false no method is loaded.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                         bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
     "recv_klass can be destroyed when method isn't needed");
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  // scan_temp = vtable length (32-bit load)
  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

// %%% Could store the aligned, prescaled offset in the klassoop. 997 // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base)); 998 lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3))); 999 add(scan_temp, scan_temp, vtable_base); 1000 1001 if (return_method) { 1002 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 1003 assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 1004 // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off)); 1005 lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3))); 1006 if (itentry_off) 1007 add(recv_klass, recv_klass, itentry_off); 1008 } 1009 1010 // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) { 1011 // if (scan->interface() == intf) { 1012 // result = (klass + scan->offset() + itable_index); 1013 // } 1014 // } 1015 Label search, found_method; 1016 1017 for (int peel = 1; peel >= 0; peel--) { 1018 ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes())); 1019 cmp(intf_klass, method_result); 1020 1021 if (peel) { 1022 br(Assembler::EQ, found_method); 1023 } else { 1024 br(Assembler::NE, search); 1025 // (invert the test to fall through to found_method...) 1026 } 1027 1028 if (!peel) break; 1029 1030 bind(search); 1031 1032 // Check that the previous entry is non-null. A null entry means that 1033 // the receiver class doesn't implement the interface, and wasn't the 1034 // same as when the caller was compiled. 1035 cbz(method_result, L_no_such_interface); 1036 add(scan_temp, scan_temp, scan_step); 1037 } 1038 1039 bind(found_method); 1040 1041 // Got a hit. 
// scan_temp points at the matching itableOffsetEntry here.
  if (return_method) {
    // Replace scan_temp by the entry's 32-bit offset field, then load the
    // method from recv_klass (already adjusted above) plus that offset.
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}

// virtual method calling
//
// Load into method_result the vtable entry of recv_klass selected by
// vtable_index, which may be a register or a constant.  The constant case
// may use rscratch1 to form the address.
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    // method_result = recv_klass + (index << LogBytesPerWord), then load at
    // the fixed vtable offset.
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    // Constant index: fold it into the displacement.
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result,
        form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
  }
}

// Combined subtype check: fast path first, then the slow path.  Branches to
// L_success if sub_klass is a subtype of super_klass, otherwise falls
// through.
void MacroAssembler::check_klass_subtype(Register sub_klass,
                           Register super_klass,
                           Register temp_reg,
                           Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


// Fast part of the subtype check.  At most one of L_success, L_failure and
// L_slow_path may be NULL; a NULL label means "fall through".  A
// super_check_offset whose constant value is -1 means the offset must be
// loaded from super_klass into temp_reg.  rscratch1 is clobbered.
void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                        RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
// The offset will be loaded into temp_reg, so a real register is required.
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  // Any NULL label is redirected to the local fall-through label.
  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);
  cmp(super_klass, rscratch1); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
// This works in the same check above because of a tricky aliasing
// between the super_cache and the primary super display elements.
// (The 'super_check_addr' can address either, as the case requires.)
// Note that the cache is updated below if it does not help us find
// what we need immediately.
// So if it was a primary super, we can just fail immediately.
// Otherwise, it's the slow path for us (no success at this point).

  // The flags tested below come from the cmp(super_klass, rscratch1) above.
  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    // Offset known only at run time: it equals sc_offset exactly when the
    // compared slot was the secondary super cache.
    cmp(super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
1161 if (L_failure == &L_fallthrough) { 1162 br(Assembler::EQ, *L_success); 1163 } else { 1164 br(Assembler::NE, *L_failure); 1165 final_jmp(*L_success); 1166 } 1167 } 1168 1169 bind(L_fallthrough); 1170 1171 #undef final_jmp 1172 } 1173 1174 // These two are taken from x86, but they look generally useful 1175 1176 // scans count pointer sized words at [addr] for occurence of value, 1177 // generic 1178 void MacroAssembler::repne_scan(Register addr, Register value, Register count, 1179 Register scratch) { 1180 Label Lloop, Lexit; 1181 cbz(count, Lexit); 1182 bind(Lloop); 1183 ldr(scratch, post(addr, wordSize)); 1184 cmp(value, scratch); 1185 br(EQ, Lexit); 1186 sub(count, count, 1); 1187 cbnz(count, Lloop); 1188 bind(Lexit); 1189 } 1190 1191 // scans count 4 byte words at [addr] for occurence of value, 1192 // generic 1193 void MacroAssembler::repne_scanw(Register addr, Register value, Register count, 1194 Register scratch) { 1195 Label Lloop, Lexit; 1196 cbz(count, Lexit); 1197 bind(Lloop); 1198 ldrw(scratch, post(addr, wordSize)); 1199 cmpw(value, scratch); 1200 br(EQ, Lexit); 1201 sub(count, count, 1); 1202 cbnz(count, Lloop); 1203 bind(Lexit); 1204 } 1205 1206 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 1207 Register super_klass, 1208 Register temp_reg, 1209 Register temp2_reg, 1210 Label* L_success, 1211 Label* L_failure, 1212 bool set_cond_codes) { 1213 assert_different_registers(sub_klass, super_klass, temp_reg); 1214 if (temp2_reg != noreg) 1215 assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1); 1216 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg) 1217 1218 Label L_fallthrough; 1219 int label_nulls = 0; 1220 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 1221 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 1222 assert(label_nulls <= 1, "at most one NULL in the batch"); 1223 1224 // a couple of useful fields in sub_klass: 1225 int 
ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  BLOCK_COMMENT("check_klass_subtype_slow_path");

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
  assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)

  // r0, r2 and r5 are used by the scan below; save each one that the caller
  // did not hand over as a temp.
  RegSet pushed_registers;
  if (!IS_A_TEMP(r2))    pushed_registers += r2;
  if (!IS_A_TEMP(r5))    pushed_registers += r5;

  if (super_klass != r0 || UseCompressedOops) {
    if (!IS_A_TEMP(r0))   pushed_registers += r0;
  }

  push(pushed_registers, sp);

  // Get super_klass value into r0 (even if it was in r5 or r2).
  if (super_klass != r0) {
    mov(r0, super_klass);
  }

#ifndef PRODUCT
  // Non-product builds only: bump SharedRuntime::_partial_subtype_ctr
  // (plain load / add / store).
  mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
  Address pst_counter_addr(rscratch2);
  ldr(rscratch1, pst_counter_addr);
  add(rscratch1, rscratch1, 1);
  str(rscratch1, pst_counter_addr);
#endif //PRODUCT

  // We will consult the secondary-super array.
  ldr(r5, secondary_supers_addr);
  // Load the array length.
  ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  add(r5, r5, Array<Klass*>::base_offset_in_bytes());

  // repne_scan leaves the flags untouched when the count is zero, so make
  // sure they read "not equal" going in.
  cmp(sp, zr); // Clear Z flag; SP is never zero
  // Scan R2 words at [R5] for an occurrence of R0.
  // Set NZ/Z based on last compare.
  repne_scan(r5, r0, r2, rscratch1);

  // Unspill the temp.
// registers:
  pop(pushed_registers, sp);

  // The flags still hold the outcome of the scan: NE means not found.
  br(Assembler::NE, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  str(super_klass, super_cache_addr);

  if (L_success != &L_fallthrough) {
    b(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}


// When VerifyOops is set, emit a call to the verify_oop stub for the oop in
// reg; otherwise emit nothing.  's' is included in the message passed to the
// stub.  r0, rscratch1, rscratch2 and lr are saved on the stack around the
// call and restored afterwards.
void MacroAssembler::verify_oop(Register reg, const char* s) {
  if (!VerifyOops) return;

  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s", reg->name(), s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  // r0 = oop to check, rscratch1 = message string
  mov(r0, reg);
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop");
}

// Like verify_oop, but the oop to check is loaded from memory at addr.
void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops) return;

  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s", s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop_addr {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  // addr may contain sp so we will have to adjust it based on the
  // pushes that we just did.
1340 if (addr.uses(sp)) { 1341 lea(r0, addr); 1342 ldr(r0, Address(r0, 4 * wordSize)); 1343 } else { 1344 ldr(r0, addr); 1345 } 1346 mov(rscratch1, (address)b); 1347 1348 // call indirectly to solve generation ordering problem 1349 lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address())); 1350 ldr(rscratch2, Address(rscratch2)); 1351 blr(rscratch2); 1352 1353 ldp(rscratch2, lr, Address(post(sp, 2 * wordSize))); 1354 ldp(r0, rscratch1, Address(post(sp, 2 * wordSize))); 1355 1356 BLOCK_COMMENT("} verify_oop_addr"); 1357 } 1358 1359 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot, 1360 int extra_slot_offset) { 1361 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 1362 int stackElementSize = Interpreter::stackElementSize; 1363 int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0); 1364 #ifdef ASSERT 1365 int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1); 1366 assert(offset1 - offset == stackElementSize, "correct arithmetic"); 1367 #endif 1368 if (arg_slot.is_constant()) { 1369 return Address(esp, arg_slot.as_constant() * stackElementSize 1370 + offset); 1371 } else { 1372 add(rscratch1, esp, arg_slot.as_register(), 1373 ext::uxtx, exact_log2(stackElementSize)); 1374 return Address(rscratch1, offset); 1375 } 1376 } 1377 1378 void MacroAssembler::call_VM_leaf_base(address entry_point, 1379 int number_of_arguments, 1380 Label *retaddr) { 1381 Label E, L; 1382 1383 stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize))); 1384 1385 mov(rscratch1, entry_point); 1386 blr(rscratch1); 1387 if (retaddr) 1388 bind(*retaddr); 1389 1390 ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize))); 1391 maybe_isb(); 1392 } 1393 1394 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) { 1395 call_VM_leaf_base(entry_point, number_of_arguments); 1396 } 1397 1398 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) { 1399 pass_arg0(this, arg_0); 1400 
call_VM_leaf_base(entry_point, 1); 1401 } 1402 1403 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) { 1404 pass_arg0(this, arg_0); 1405 pass_arg1(this, arg_1); 1406 call_VM_leaf_base(entry_point, 2); 1407 } 1408 1409 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, 1410 Register arg_1, Register arg_2) { 1411 pass_arg0(this, arg_0); 1412 pass_arg1(this, arg_1); 1413 pass_arg2(this, arg_2); 1414 call_VM_leaf_base(entry_point, 3); 1415 } 1416 1417 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) { 1418 pass_arg0(this, arg_0); 1419 MacroAssembler::call_VM_leaf_base(entry_point, 1); 1420 } 1421 1422 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) { 1423 1424 assert(arg_0 != c_rarg1, "smashed arg"); 1425 pass_arg1(this, arg_1); 1426 pass_arg0(this, arg_0); 1427 MacroAssembler::call_VM_leaf_base(entry_point, 2); 1428 } 1429 1430 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) { 1431 assert(arg_0 != c_rarg2, "smashed arg"); 1432 assert(arg_1 != c_rarg2, "smashed arg"); 1433 pass_arg2(this, arg_2); 1434 assert(arg_0 != c_rarg1, "smashed arg"); 1435 pass_arg1(this, arg_1); 1436 pass_arg0(this, arg_0); 1437 MacroAssembler::call_VM_leaf_base(entry_point, 3); 1438 } 1439 1440 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) { 1441 assert(arg_0 != c_rarg3, "smashed arg"); 1442 assert(arg_1 != c_rarg3, "smashed arg"); 1443 assert(arg_2 != c_rarg3, "smashed arg"); 1444 pass_arg3(this, arg_3); 1445 assert(arg_0 != c_rarg2, "smashed arg"); 1446 assert(arg_1 != c_rarg2, "smashed arg"); 1447 pass_arg2(this, arg_2); 1448 assert(arg_0 != c_rarg1, "smashed arg"); 1449 pass_arg1(this, arg_1); 1450 pass_arg0(this, arg_0); 1451 MacroAssembler::call_VM_leaf_base(entry_point, 4); 1452 } 1453 1454 void 
MacroAssembler::null_check(Register reg, int offset) { 1455 if (needs_explicit_null_check(offset)) { 1456 // provoke OS NULL exception if reg = NULL by 1457 // accessing M[reg] w/o changing any registers 1458 // NOTE: this is plenty to provoke a segv 1459 ldr(zr, Address(reg)); 1460 } else { 1461 // nothing to do, (later) access of M[reg + offset] 1462 // will provoke OS NULL exception if reg = NULL 1463 } 1464 } 1465 1466 // MacroAssembler protected routines needed to implement 1467 // public methods 1468 1469 void MacroAssembler::mov(Register r, Address dest) { 1470 code_section()->relocate(pc(), dest.rspec()); 1471 u_int64_t imm64 = (u_int64_t)dest.target(); 1472 movptr(r, imm64); 1473 } 1474 1475 // Move a constant pointer into r. In AArch64 mode the virtual 1476 // address space is 48 bits in size, so we only need three 1477 // instructions to create a patchable instruction sequence that can 1478 // reach anywhere. 1479 void MacroAssembler::movptr(Register r, uintptr_t imm64) { 1480 #ifndef PRODUCT 1481 { 1482 char buffer[64]; 1483 snprintf(buffer, sizeof(buffer), "0x%"PRIX64, imm64); 1484 block_comment(buffer); 1485 } 1486 #endif 1487 assert(imm64 < (1ul << 48), "48-bit overflow in address constant"); 1488 movz(r, imm64 & 0xffff); 1489 imm64 >>= 16; 1490 movk(r, imm64 & 0xffff, 16); 1491 imm64 >>= 16; 1492 movk(r, imm64 & 0xffff, 32); 1493 } 1494 1495 // Macro to mov replicated immediate to vector register. 
1496 // Vd will get the following values for different arrangements in T 1497 // imm32 == hex 000000gh T8B: Vd = ghghghghghghghgh 1498 // imm32 == hex 000000gh T16B: Vd = ghghghghghghghghghghghghghghghgh 1499 // imm32 == hex 0000efgh T4H: Vd = efghefghefghefgh 1500 // imm32 == hex 0000efgh T8H: Vd = efghefghefghefghefghefghefghefgh 1501 // imm32 == hex abcdefgh T2S: Vd = abcdefghabcdefgh 1502 // imm32 == hex abcdefgh T4S: Vd = abcdefghabcdefghabcdefghabcdefgh 1503 // T1D/T2D: invalid 1504 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) { 1505 assert(T != T1D && T != T2D, "invalid arrangement"); 1506 if (T == T8B || T == T16B) { 1507 assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)"); 1508 movi(Vd, T, imm32 & 0xff, 0); 1509 return; 1510 } 1511 u_int32_t nimm32 = ~imm32; 1512 if (T == T4H || T == T8H) { 1513 assert((imm32 & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)"); 1514 imm32 &= 0xffff; 1515 nimm32 &= 0xffff; 1516 } 1517 u_int32_t x = imm32; 1518 int movi_cnt = 0; 1519 int movn_cnt = 0; 1520 while (x) { if (x & 0xff) movi_cnt++; x >>= 8; } 1521 x = nimm32; 1522 while (x) { if (x & 0xff) movn_cnt++; x >>= 8; } 1523 if (movn_cnt < movi_cnt) imm32 = nimm32; 1524 unsigned lsl = 0; 1525 while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; } 1526 if (movn_cnt < movi_cnt) 1527 mvni(Vd, T, imm32 & 0xff, lsl); 1528 else 1529 movi(Vd, T, imm32 & 0xff, lsl); 1530 imm32 >>= 8; lsl += 8; 1531 while (imm32) { 1532 while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; } 1533 if (movn_cnt < movi_cnt) 1534 bici(Vd, T, imm32 & 0xff, lsl); 1535 else 1536 orri(Vd, T, imm32 & 0xff, lsl); 1537 lsl += 8; imm32 >>= 8; 1538 } 1539 } 1540 1541 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64) 1542 { 1543 #ifndef PRODUCT 1544 { 1545 char buffer[64]; 1546 snprintf(buffer, sizeof(buffer), "0x%"PRIX64, imm64); 1547 block_comment(buffer); 1548 } 1549 #endif 1550 if 
(operand_valid_for_logical_immediate(false, imm64)) {
    orr(dst, zr, imm64);
  } else {
    // we can use a combination of MOVZ or MOVN with
    // MOVK to build up the constant
    u_int64_t imm_h[4];
    int zero_count = 0;
    int neg_count = 0;
    int i;
    // Split imm64 into four 16-bit halfwords and count how many are
    // all-zeros / all-ones.
    for (i = 0; i < 4; i++) {
      imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
      if (imm_h[i] == 0) {
        zero_count++;
      } else if (imm_h[i] == 0xffffL) {
        neg_count++;
      }
    }
    if (zero_count == 4) {
      // one MOVZ will do
      movz(dst, 0);
    } else if (neg_count == 4) {
      // one MOVN will do
      movn(dst, 0);
    } else if (zero_count == 3) {
      // one MOVZ of the single non-zero halfword
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0L) {
          movz(dst, (u_int32_t)imm_h[i], (i << 4));
          break;
        }
      }
    } else if (neg_count == 3) {
      // one MOVN will do
      for (int i = 0; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
          break;
        }
      }
    } else if (zero_count == 2) {
      // one MOVZ and one MOVK will do
      for (i = 0; i < 3; i++) {
        if (imm_h[i] != 0L) {
          movz(dst, (u_int32_t)imm_h[i], (i << 4));
          i++;
          break;
        }
      }
      // i now indexes the halfword after the one just emitted.
      for (;i < 4; i++) {
        if (imm_h[i] != 0L) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else if (neg_count == 2) {
      // one MOVN and one MOVK will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else if (zero_count == 1) {
      // one MOVZ and two MOVKs will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0L) {
          movz(dst, (u_int32_t)imm_h[i], (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0x0L) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
1630 } else if (neg_count == 1) { 1631 // one MOVN and two MOVKs will do 1632 for (i = 0; i < 4; i++) { 1633 if (imm_h[i] != 0xffffL) { 1634 movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4)); 1635 i++; 1636 break; 1637 } 1638 } 1639 for (;i < 4; i++) { 1640 if (imm_h[i] != 0xffffL) { 1641 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1642 } 1643 } 1644 } else { 1645 // use a MOVZ and 3 MOVKs (makes it easier to debug) 1646 movz(dst, (u_int32_t)imm_h[0], 0); 1647 for (i = 1; i < 4; i++) { 1648 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1649 } 1650 } 1651 } 1652 } 1653 1654 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32) 1655 { 1656 #ifndef PRODUCT 1657 { 1658 char buffer[64]; 1659 snprintf(buffer, sizeof(buffer), "0x%"PRIX32, imm32); 1660 block_comment(buffer); 1661 } 1662 #endif 1663 if (operand_valid_for_logical_immediate(true, imm32)) { 1664 orrw(dst, zr, imm32); 1665 } else { 1666 // we can use MOVZ, MOVN or two calls to MOVK to build up the 1667 // constant 1668 u_int32_t imm_h[2]; 1669 imm_h[0] = imm32 & 0xffff; 1670 imm_h[1] = ((imm32 >> 16) & 0xffff); 1671 if (imm_h[0] == 0) { 1672 movzw(dst, imm_h[1], 16); 1673 } else if (imm_h[0] == 0xffff) { 1674 movnw(dst, imm_h[1] ^ 0xffff, 16); 1675 } else if (imm_h[1] == 0) { 1676 movzw(dst, imm_h[0], 0); 1677 } else if (imm_h[1] == 0xffff) { 1678 movnw(dst, imm_h[0] ^ 0xffff, 0); 1679 } else { 1680 // use a MOVZ and MOVK (makes it easier to debug) 1681 movzw(dst, imm_h[0], 0); 1682 movkw(dst, imm_h[1], 16); 1683 } 1684 } 1685 } 1686 1687 // Form an address from base + offset in Rd. Rd may or may 1688 // not actually be used: you must use the Address that is returned. 1689 // It is up to you to ensure that the shift provided matches the size 1690 // of your data. 
1691 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) { 1692 if (Address::offset_ok_for_immed(byte_offset, shift)) 1693 // It fits; no need for any heroics 1694 return Address(base, byte_offset); 1695 1696 // Don't do anything clever with negative or misaligned offsets 1697 unsigned mask = (1 << shift) - 1; 1698 if (byte_offset < 0 || byte_offset & mask) { 1699 mov(Rd, byte_offset); 1700 add(Rd, base, Rd); 1701 return Address(Rd); 1702 } 1703 1704 // See if we can do this with two 12-bit offsets 1705 { 1706 unsigned long word_offset = byte_offset >> shift; 1707 unsigned long masked_offset = word_offset & 0xfff000; 1708 if (Address::offset_ok_for_immed(word_offset - masked_offset) 1709 && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) { 1710 add(Rd, base, masked_offset << shift); 1711 word_offset -= masked_offset; 1712 return Address(Rd, word_offset << shift); 1713 } 1714 } 1715 1716 // Do it the hard way 1717 mov(Rd, byte_offset); 1718 add(Rd, base, Rd); 1719 return Address(Rd); 1720 } 1721 1722 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) { 1723 if (UseLSE) { 1724 mov(tmp, 1); 1725 ldadd(Assembler::word, tmp, zr, counter_addr); 1726 return; 1727 } 1728 Label retry_load; 1729 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) 1730 prfm(Address(counter_addr), PSTL1STRM); 1731 bind(retry_load); 1732 // flush and load exclusive from the memory location 1733 ldxrw(tmp, counter_addr); 1734 addw(tmp, tmp, 1); 1735 // if we store+flush with no intervening write tmp wil be zero 1736 stxrw(tmp2, tmp, counter_addr); 1737 cbnzw(tmp2, retry_load); 1738 } 1739 1740 1741 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb, 1742 bool want_remainder, Register scratch) 1743 { 1744 // Full implementation of Java idiv and irem. 
The function 1745 // returns the (pc) offset of the div instruction - may be needed 1746 // for implicit exceptions. 1747 // 1748 // constraint : ra/rb =/= scratch 1749 // normal case 1750 // 1751 // input : ra: dividend 1752 // rb: divisor 1753 // 1754 // result: either 1755 // quotient (= ra idiv rb) 1756 // remainder (= ra irem rb) 1757 1758 assert(ra != scratch && rb != scratch, "reg cannot be scratch"); 1759 1760 int idivl_offset = offset(); 1761 if (! want_remainder) { 1762 sdivw(result, ra, rb); 1763 } else { 1764 sdivw(scratch, ra, rb); 1765 Assembler::msubw(result, scratch, rb, ra); 1766 } 1767 1768 return idivl_offset; 1769 } 1770 1771 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb, 1772 bool want_remainder, Register scratch) 1773 { 1774 // Full implementation of Java ldiv and lrem. The function 1775 // returns the (pc) offset of the div instruction - may be needed 1776 // for implicit exceptions. 1777 // 1778 // constraint : ra/rb =/= scratch 1779 // normal case 1780 // 1781 // input : ra: dividend 1782 // rb: divisor 1783 // 1784 // result: either 1785 // quotient (= ra idiv rb) 1786 // remainder (= ra irem rb) 1787 1788 assert(ra != scratch && rb != scratch, "reg cannot be scratch"); 1789 1790 int idivq_offset = offset(); 1791 if (! want_remainder) { 1792 sdiv(result, ra, rb); 1793 } else { 1794 sdiv(scratch, ra, rb); 1795 Assembler::msub(result, scratch, rb, ra); 1796 } 1797 1798 return idivq_offset; 1799 } 1800 1801 void MacroAssembler::membar(Membar_mask_bits order_constraint) { 1802 address prev = pc() - NativeMembar::instruction_size; 1803 address last = code()->last_insn(); 1804 if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) { 1805 NativeMembar *bar = NativeMembar_at(prev); 1806 // We are merging two memory barrier instructions. On AArch64 we 1807 // can do this simply by ORing them together. 
1808 bar->set_kind(bar->get_kind() | order_constraint); 1809 BLOCK_COMMENT("merged membar"); 1810 } else { 1811 code()->set_last_insn(pc()); 1812 dmb(Assembler::barrier(order_constraint)); 1813 } 1814 } 1815 1816 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) { 1817 if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) { 1818 merge_ldst(rt, adr, size_in_bytes, is_store); 1819 code()->clear_last_insn(); 1820 return true; 1821 } else { 1822 assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported."); 1823 const unsigned mask = size_in_bytes - 1; 1824 if (adr.getMode() == Address::base_plus_offset && 1825 (adr.offset() & mask) == 0) { // only supports base_plus_offset. 1826 code()->set_last_insn(pc()); 1827 } 1828 return false; 1829 } 1830 } 1831 1832 void MacroAssembler::ldr(Register Rx, const Address &adr) { 1833 // We always try to merge two adjacent loads into one ldp. 1834 if (!try_merge_ldst(Rx, adr, 8, false)) { 1835 Assembler::ldr(Rx, adr); 1836 } 1837 } 1838 1839 void MacroAssembler::ldrw(Register Rw, const Address &adr) { 1840 // We always try to merge two adjacent loads into one ldp. 1841 if (!try_merge_ldst(Rw, adr, 4, false)) { 1842 Assembler::ldrw(Rw, adr); 1843 } 1844 } 1845 1846 void MacroAssembler::str(Register Rx, const Address &adr) { 1847 // We always try to merge two adjacent stores into one stp. 1848 if (!try_merge_ldst(Rx, adr, 8, true)) { 1849 Assembler::str(Rx, adr); 1850 } 1851 } 1852 1853 void MacroAssembler::strw(Register Rw, const Address &adr) { 1854 // We always try to merge two adjacent stores into one stp. 
1855 if (!try_merge_ldst(Rw, adr, 4, true)) { 1856 Assembler::strw(Rw, adr); 1857 } 1858 } 1859 1860 // MacroAssembler routines found actually to be needed 1861 1862 void MacroAssembler::push(Register src) 1863 { 1864 str(src, Address(pre(esp, -1 * wordSize))); 1865 } 1866 1867 void MacroAssembler::pop(Register dst) 1868 { 1869 ldr(dst, Address(post(esp, 1 * wordSize))); 1870 } 1871 1872 // Note: load_unsigned_short used to be called load_unsigned_word. 1873 int MacroAssembler::load_unsigned_short(Register dst, Address src) { 1874 int off = offset(); 1875 ldrh(dst, src); 1876 return off; 1877 } 1878 1879 int MacroAssembler::load_unsigned_byte(Register dst, Address src) { 1880 int off = offset(); 1881 ldrb(dst, src); 1882 return off; 1883 } 1884 1885 int MacroAssembler::load_signed_short(Register dst, Address src) { 1886 int off = offset(); 1887 ldrsh(dst, src); 1888 return off; 1889 } 1890 1891 int MacroAssembler::load_signed_byte(Register dst, Address src) { 1892 int off = offset(); 1893 ldrsb(dst, src); 1894 return off; 1895 } 1896 1897 int MacroAssembler::load_signed_short32(Register dst, Address src) { 1898 int off = offset(); 1899 ldrshw(dst, src); 1900 return off; 1901 } 1902 1903 int MacroAssembler::load_signed_byte32(Register dst, Address src) { 1904 int off = offset(); 1905 ldrsbw(dst, src); 1906 return off; 1907 } 1908 1909 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) { 1910 switch (size_in_bytes) { 1911 case 8: ldr(dst, src); break; 1912 case 4: ldrw(dst, src); break; 1913 case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break; 1914 case 1: is_signed ? 
load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break; 1915 default: ShouldNotReachHere(); 1916 } 1917 } 1918 1919 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) { 1920 switch (size_in_bytes) { 1921 case 8: str(src, dst); break; 1922 case 4: strw(src, dst); break; 1923 case 2: strh(src, dst); break; 1924 case 1: strb(src, dst); break; 1925 default: ShouldNotReachHere(); 1926 } 1927 } 1928 1929 void MacroAssembler::decrementw(Register reg, int value) 1930 { 1931 if (value < 0) { incrementw(reg, -value); return; } 1932 if (value == 0) { return; } 1933 if (value < (1 << 12)) { subw(reg, reg, value); return; } 1934 /* else */ { 1935 guarantee(reg != rscratch2, "invalid dst for register decrement"); 1936 movw(rscratch2, (unsigned)value); 1937 subw(reg, reg, rscratch2); 1938 } 1939 } 1940 1941 void MacroAssembler::decrement(Register reg, int value) 1942 { 1943 if (value < 0) { increment(reg, -value); return; } 1944 if (value == 0) { return; } 1945 if (value < (1 << 12)) { sub(reg, reg, value); return; } 1946 /* else */ { 1947 assert(reg != rscratch2, "invalid dst for register decrement"); 1948 mov(rscratch2, (unsigned long)value); 1949 sub(reg, reg, rscratch2); 1950 } 1951 } 1952 1953 void MacroAssembler::decrementw(Address dst, int value) 1954 { 1955 assert(!dst.uses(rscratch1), "invalid dst for address decrement"); 1956 if (dst.getMode() == Address::literal) { 1957 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 1958 lea(rscratch2, dst); 1959 dst = Address(rscratch2); 1960 } 1961 ldrw(rscratch1, dst); 1962 decrementw(rscratch1, value); 1963 strw(rscratch1, dst); 1964 } 1965 1966 void MacroAssembler::decrement(Address dst, int value) 1967 { 1968 assert(!dst.uses(rscratch1), "invalid address for decrement"); 1969 if (dst.getMode() == Address::literal) { 1970 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 1971 lea(rscratch2, dst); 1972 dst = 
Address(rscratch2); 1973 } 1974 ldr(rscratch1, dst); 1975 decrement(rscratch1, value); 1976 str(rscratch1, dst); 1977 } 1978 1979 void MacroAssembler::incrementw(Register reg, int value) 1980 { 1981 if (value < 0) { decrementw(reg, -value); return; } 1982 if (value == 0) { return; } 1983 if (value < (1 << 12)) { addw(reg, reg, value); return; } 1984 /* else */ { 1985 assert(reg != rscratch2, "invalid dst for register increment"); 1986 movw(rscratch2, (unsigned)value); 1987 addw(reg, reg, rscratch2); 1988 } 1989 } 1990 1991 void MacroAssembler::increment(Register reg, int value) 1992 { 1993 if (value < 0) { decrement(reg, -value); return; } 1994 if (value == 0) { return; } 1995 if (value < (1 << 12)) { add(reg, reg, value); return; } 1996 /* else */ { 1997 assert(reg != rscratch2, "invalid dst for register increment"); 1998 movw(rscratch2, (unsigned)value); 1999 add(reg, reg, rscratch2); 2000 } 2001 } 2002 2003 void MacroAssembler::incrementw(Address dst, int value) 2004 { 2005 assert(!dst.uses(rscratch1), "invalid dst for address increment"); 2006 if (dst.getMode() == Address::literal) { 2007 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 2008 lea(rscratch2, dst); 2009 dst = Address(rscratch2); 2010 } 2011 ldrw(rscratch1, dst); 2012 incrementw(rscratch1, value); 2013 strw(rscratch1, dst); 2014 } 2015 2016 void MacroAssembler::increment(Address dst, int value) 2017 { 2018 assert(!dst.uses(rscratch1), "invalid dst for address increment"); 2019 if (dst.getMode() == Address::literal) { 2020 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 2021 lea(rscratch2, dst); 2022 dst = Address(rscratch2); 2023 } 2024 ldr(rscratch1, dst); 2025 increment(rscratch1, value); 2026 str(rscratch1, dst); 2027 } 2028 2029 2030 void MacroAssembler::pusha() { 2031 push(0x7fffffff, sp); 2032 } 2033 2034 void MacroAssembler::popa() { 2035 pop(0x7fffffff, sp); 2036 } 2037 2038 // Push lots of registers in the bit set supplied. 
Don't push sp. 2039 // Return the number of words pushed 2040 int MacroAssembler::push(unsigned int bitset, Register stack) { 2041 int words_pushed = 0; 2042 2043 // Scan bitset to accumulate register pairs 2044 unsigned char regs[32]; 2045 int count = 0; 2046 for (int reg = 0; reg <= 30; reg++) { 2047 if (1 & bitset) 2048 regs[count++] = reg; 2049 bitset >>= 1; 2050 } 2051 regs[count++] = zr->encoding_nocheck(); 2052 count &= ~1; // Only push an even nuber of regs 2053 2054 if (count) { 2055 stp(as_Register(regs[0]), as_Register(regs[1]), 2056 Address(pre(stack, -count * wordSize))); 2057 words_pushed += 2; 2058 } 2059 for (int i = 2; i < count; i += 2) { 2060 stp(as_Register(regs[i]), as_Register(regs[i+1]), 2061 Address(stack, i * wordSize)); 2062 words_pushed += 2; 2063 } 2064 2065 assert(words_pushed == count, "oops, pushed != count"); 2066 2067 return count; 2068 } 2069 2070 int MacroAssembler::pop(unsigned int bitset, Register stack) { 2071 int words_pushed = 0; 2072 2073 // Scan bitset to accumulate register pairs 2074 unsigned char regs[32]; 2075 int count = 0; 2076 for (int reg = 0; reg <= 30; reg++) { 2077 if (1 & bitset) 2078 regs[count++] = reg; 2079 bitset >>= 1; 2080 } 2081 regs[count++] = zr->encoding_nocheck(); 2082 count &= ~1; 2083 2084 for (int i = 2; i < count; i += 2) { 2085 ldp(as_Register(regs[i]), as_Register(regs[i+1]), 2086 Address(stack, i * wordSize)); 2087 words_pushed += 2; 2088 } 2089 if (count) { 2090 ldp(as_Register(regs[0]), as_Register(regs[1]), 2091 Address(post(stack, count * wordSize))); 2092 words_pushed += 2; 2093 } 2094 2095 assert(words_pushed == count, "oops, pushed != count"); 2096 2097 return count; 2098 } 2099 #ifdef ASSERT 2100 void MacroAssembler::verify_heapbase(const char* msg) { 2101 #if 0 2102 assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed"); 2103 assert (Universe::heap() != NULL, "java heap should be initialized"); 2104 if (CheckCompressedOops) { 2105 Label ok; 2106 push(1 << 
rscratch1->encoding(), sp); // cmpptr trashes rscratch1 2107 cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr())); 2108 br(Assembler::EQ, ok); 2109 stop(msg); 2110 bind(ok); 2111 pop(1 << rscratch1->encoding(), sp); 2112 } 2113 #endif 2114 } 2115 #endif 2116 2117 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) { 2118 Label done, not_weak; 2119 cbz(value, done); // Use NULL as-is. 2120 2121 STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u); 2122 tbz(r0, 0, not_weak); // Test for jweak tag. 2123 2124 // Resolve jweak. 2125 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value, 2126 Address(value, -JNIHandles::weak_tag_value), tmp, thread); 2127 verify_oop(value); 2128 b(done); 2129 2130 bind(not_weak); 2131 // Resolve (untagged) jobject. 2132 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread); 2133 verify_oop(value); 2134 bind(done); 2135 } 2136 2137 void MacroAssembler::stop(const char* msg) { 2138 address ip = pc(); 2139 pusha(); 2140 mov(c_rarg0, (address)msg); 2141 mov(c_rarg1, (address)ip); 2142 mov(c_rarg2, sp); 2143 mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 2144 blr(c_rarg3); 2145 hlt(0); 2146 } 2147 2148 void MacroAssembler::warn(const char* msg) { 2149 pusha(); 2150 mov(c_rarg0, (address)msg); 2151 mov(lr, CAST_FROM_FN_PTR(address, warning)); 2152 blr(lr); 2153 popa(); 2154 } 2155 2156 void MacroAssembler::unimplemented(const char* what) { 2157 const char* buf = NULL; 2158 { 2159 ResourceMark rm; 2160 stringStream ss; 2161 ss.print("unimplemented: %s", what); 2162 buf = code_string(ss.as_string()); 2163 } 2164 stop(buf); 2165 } 2166 2167 // If a constant does not fit in an immediate field, generate some 2168 // number of MOV instructions and then perform the operation. 
2169 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm, 2170 add_sub_imm_insn insn1, 2171 add_sub_reg_insn insn2) { 2172 assert(Rd != zr, "Rd = zr and not setting flags?"); 2173 if (operand_valid_for_add_sub_immediate((int)imm)) { 2174 (this->*insn1)(Rd, Rn, imm); 2175 } else { 2176 if (uabs(imm) < (1 << 24)) { 2177 (this->*insn1)(Rd, Rn, imm & -(1 << 12)); 2178 (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1)); 2179 } else { 2180 assert_different_registers(Rd, Rn); 2181 mov(Rd, (uint64_t)imm); 2182 (this->*insn2)(Rd, Rn, Rd, LSL, 0); 2183 } 2184 } 2185 } 2186 2187 // Seperate vsn which sets the flags. Optimisations are more restricted 2188 // because we must set the flags correctly. 2189 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm, 2190 add_sub_imm_insn insn1, 2191 add_sub_reg_insn insn2) { 2192 if (operand_valid_for_add_sub_immediate((int)imm)) { 2193 (this->*insn1)(Rd, Rn, imm); 2194 } else { 2195 assert_different_registers(Rd, Rn); 2196 assert(Rd != zr, "overflow in immediate operand"); 2197 mov(Rd, (uint64_t)imm); 2198 (this->*insn2)(Rd, Rn, Rd, LSL, 0); 2199 } 2200 } 2201 2202 2203 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) { 2204 if (increment.is_register()) { 2205 add(Rd, Rn, increment.as_register()); 2206 } else { 2207 add(Rd, Rn, increment.as_constant()); 2208 } 2209 } 2210 2211 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) { 2212 if (increment.is_register()) { 2213 addw(Rd, Rn, increment.as_register()); 2214 } else { 2215 addw(Rd, Rn, increment.as_constant()); 2216 } 2217 } 2218 2219 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) { 2220 if (decrement.is_register()) { 2221 sub(Rd, Rn, decrement.as_register()); 2222 } else { 2223 sub(Rd, Rn, decrement.as_constant()); 2224 } 2225 } 2226 2227 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) { 2228 if 
(decrement.is_register()) { 2229 subw(Rd, Rn, decrement.as_register()); 2230 } else { 2231 subw(Rd, Rn, decrement.as_constant()); 2232 } 2233 } 2234 2235 void MacroAssembler::reinit_heapbase() 2236 { 2237 if (UseCompressedOops) { 2238 if (Universe::is_fully_initialized()) { 2239 mov(rheapbase, Universe::narrow_ptrs_base()); 2240 } else { 2241 lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr())); 2242 ldr(rheapbase, Address(rheapbase)); 2243 } 2244 } 2245 } 2246 2247 // this simulates the behaviour of the x86 cmpxchg instruction using a 2248 // load linked/store conditional pair. we use the acquire/release 2249 // versions of these instructions so that we flush pending writes as 2250 // per Java semantics. 2251 2252 // n.b the x86 version assumes the old value to be compared against is 2253 // in rax and updates rax with the value located in memory if the 2254 // cmpxchg fails. we supply a register for the old value explicitly 2255 2256 // the aarch64 load linked/store conditional instructions do not 2257 // accept an offset. so, unlike x86, we must provide a plain register 2258 // to identify the memory word to be compared/exchanged rather than a 2259 // register+offset Address. 
2260 2261 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp, 2262 Label &succeed, Label *fail) { 2263 // oldv holds comparison value 2264 // newv holds value to write in exchange 2265 // addr identifies memory word to compare against/update 2266 if (UseLSE) { 2267 mov(tmp, oldv); 2268 casal(Assembler::xword, oldv, newv, addr); 2269 cmp(tmp, oldv); 2270 br(Assembler::EQ, succeed); 2271 membar(AnyAny); 2272 } else { 2273 Label retry_load, nope; 2274 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) 2275 prfm(Address(addr), PSTL1STRM); 2276 bind(retry_load); 2277 // flush and load exclusive from the memory location 2278 // and fail if it is not what we expect 2279 ldaxr(tmp, addr); 2280 cmp(tmp, oldv); 2281 br(Assembler::NE, nope); 2282 // if we store+flush with no intervening write tmp wil be zero 2283 stlxr(tmp, newv, addr); 2284 cbzw(tmp, succeed); 2285 // retry so we only ever return after a load fails to compare 2286 // ensures we don't return a stale value after a failed write. 
2287 b(retry_load); 2288 // if the memory word differs we return it in oldv and signal a fail 2289 bind(nope); 2290 membar(AnyAny); 2291 mov(oldv, tmp); 2292 } 2293 if (fail) 2294 b(*fail); 2295 } 2296 2297 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp, 2298 Label &succeed, Label *fail) { 2299 assert(oopDesc::mark_offset_in_bytes() == 0, "assumption"); 2300 cmpxchgptr(oldv, newv, obj, tmp, succeed, fail); 2301 } 2302 2303 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp, 2304 Label &succeed, Label *fail) { 2305 // oldv holds comparison value 2306 // newv holds value to write in exchange 2307 // addr identifies memory word to compare against/update 2308 // tmp returns 0/1 for success/failure 2309 if (UseLSE) { 2310 mov(tmp, oldv); 2311 casal(Assembler::word, oldv, newv, addr); 2312 cmp(tmp, oldv); 2313 br(Assembler::EQ, succeed); 2314 membar(AnyAny); 2315 } else { 2316 Label retry_load, nope; 2317 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) 2318 prfm(Address(addr), PSTL1STRM); 2319 bind(retry_load); 2320 // flush and load exclusive from the memory location 2321 // and fail if it is not what we expect 2322 ldaxrw(tmp, addr); 2323 cmp(tmp, oldv); 2324 br(Assembler::NE, nope); 2325 // if we store+flush with no intervening write tmp wil be zero 2326 stlxrw(tmp, newv, addr); 2327 cbzw(tmp, succeed); 2328 // retry so we only ever return after a load fails to compare 2329 // ensures we don't return a stale value after a failed write. 2330 b(retry_load); 2331 // if the memory word differs we return it in oldv and signal a fail 2332 bind(nope); 2333 membar(AnyAny); 2334 mov(oldv, tmp); 2335 } 2336 if (fail) 2337 b(*fail); 2338 } 2339 2340 // A generic CAS; success or failure is in the EQ flag. A weak CAS 2341 // doesn't retry and may fail spuriously. If the oldval is wanted, 2342 // Pass a register for the result, otherwise pass noreg. 
2343 2344 // Clobbers rscratch1 2345 void MacroAssembler::cmpxchg(Register addr, Register expected, 2346 Register new_val, 2347 enum operand_size size, 2348 bool acquire, bool release, 2349 bool weak, 2350 Register result) { 2351 if (result == noreg) result = rscratch1; 2352 BLOCK_COMMENT("cmpxchg {"); 2353 if (UseLSE) { 2354 mov(result, expected); 2355 lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true); 2356 compare_eq(result, expected, size); 2357 } else { 2358 Label retry_load, done; 2359 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) 2360 prfm(Address(addr), PSTL1STRM); 2361 bind(retry_load); 2362 load_exclusive(result, addr, size, acquire); 2363 compare_eq(result, expected, size); 2364 br(Assembler::NE, done); 2365 store_exclusive(rscratch1, new_val, addr, size, release); 2366 if (weak) { 2367 cmpw(rscratch1, 0u); // If the store fails, return NE to our caller. 2368 } else { 2369 cbnzw(rscratch1, retry_load); 2370 } 2371 bind(done); 2372 } 2373 BLOCK_COMMENT("} cmpxchg"); 2374 } 2375 2376 // A generic comparison. Only compares for equality, clobbers rscratch1. 2377 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) { 2378 if (size == xword) { 2379 cmp(rm, rn); 2380 } else if (size == word) { 2381 cmpw(rm, rn); 2382 } else if (size == halfword) { 2383 eorw(rscratch1, rm, rn); 2384 ands(zr, rscratch1, 0xffff); 2385 } else if (size == byte) { 2386 eorw(rscratch1, rm, rn); 2387 ands(zr, rscratch1, 0xff); 2388 } else { 2389 ShouldNotReachHere(); 2390 } 2391 } 2392 2393 2394 static bool different(Register a, RegisterOrConstant b, Register c) { 2395 if (b.is_constant()) 2396 return a != c; 2397 else 2398 return a != b.as_register() && a != c && b.as_register() != c; 2399 } 2400 2401 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz) \ 2402 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \ 2403 if (UseLSE) { \ 2404 prev = prev->is_valid() ? 
prev : zr; \ 2405 if (incr.is_register()) { \ 2406 AOP(sz, incr.as_register(), prev, addr); \ 2407 } else { \ 2408 mov(rscratch2, incr.as_constant()); \ 2409 AOP(sz, rscratch2, prev, addr); \ 2410 } \ 2411 return; \ 2412 } \ 2413 Register result = rscratch2; \ 2414 if (prev->is_valid()) \ 2415 result = different(prev, incr, addr) ? prev : rscratch2; \ 2416 \ 2417 Label retry_load; \ 2418 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) \ 2419 prfm(Address(addr), PSTL1STRM); \ 2420 bind(retry_load); \ 2421 LDXR(result, addr); \ 2422 OP(rscratch1, result, incr); \ 2423 STXR(rscratch2, rscratch1, addr); \ 2424 cbnzw(rscratch2, retry_load); \ 2425 if (prev->is_valid() && prev != result) { \ 2426 IOP(prev, rscratch1, incr); \ 2427 } \ 2428 } 2429 2430 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword) 2431 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word) 2432 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword) 2433 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word) 2434 2435 #undef ATOMIC_OP 2436 2437 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz) \ 2438 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \ 2439 if (UseLSE) { \ 2440 prev = prev->is_valid() ? prev : zr; \ 2441 AOP(sz, newv, prev, addr); \ 2442 return; \ 2443 } \ 2444 Register result = rscratch2; \ 2445 if (prev->is_valid()) \ 2446 result = different(prev, newv, addr) ? 
prev : rscratch2; \ 2447 \ 2448 Label retry_load; \ 2449 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) \ 2450 prfm(Address(addr), PSTL1STRM); \ 2451 bind(retry_load); \ 2452 LDXR(result, addr); \ 2453 STXR(rscratch1, newv, addr); \ 2454 cbnzw(rscratch1, retry_load); \ 2455 if (prev->is_valid() && prev != result) \ 2456 mov(prev, result); \ 2457 } 2458 2459 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword) 2460 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word) 2461 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword) 2462 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word) 2463 2464 #undef ATOMIC_XCHG 2465 2466 #ifndef PRODUCT 2467 extern "C" void findpc(intptr_t x); 2468 #endif 2469 2470 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) 2471 { 2472 // In order to get locks to work, we need to fake a in_VM state 2473 if (ShowMessageBoxOnError ) { 2474 JavaThread* thread = JavaThread::current(); 2475 JavaThreadState saved_state = thread->thread_state(); 2476 thread->set_thread_state(_thread_in_vm); 2477 #ifndef PRODUCT 2478 if (CountBytecodes || TraceBytecodes || StopInterpreterAt) { 2479 ttyLocker ttyl; 2480 BytecodeCounter::print(); 2481 } 2482 #endif 2483 if (os::message_box(msg, "Execution stopped, print registers?")) { 2484 ttyLocker ttyl; 2485 tty->print_cr(" pc = 0x%016lx", pc); 2486 #ifndef PRODUCT 2487 tty->cr(); 2488 findpc(pc); 2489 tty->cr(); 2490 #endif 2491 tty->print_cr(" r0 = 0x%016lx", regs[0]); 2492 tty->print_cr(" r1 = 0x%016lx", regs[1]); 2493 tty->print_cr(" r2 = 0x%016lx", regs[2]); 2494 tty->print_cr(" r3 = 0x%016lx", regs[3]); 2495 tty->print_cr(" r4 = 0x%016lx", regs[4]); 2496 tty->print_cr(" r5 = 0x%016lx", regs[5]); 2497 tty->print_cr(" r6 = 0x%016lx", regs[6]); 2498 tty->print_cr(" r7 = 0x%016lx", regs[7]); 2499 tty->print_cr(" r8 = 0x%016lx", regs[8]); 2500 tty->print_cr(" r9 = 0x%016lx", regs[9]); 2501 tty->print_cr("r10 = 0x%016lx", regs[10]); 2502 tty->print_cr("r11 = 0x%016lx", 
regs[11]); 2503 tty->print_cr("r12 = 0x%016lx", regs[12]); 2504 tty->print_cr("r13 = 0x%016lx", regs[13]); 2505 tty->print_cr("r14 = 0x%016lx", regs[14]); 2506 tty->print_cr("r15 = 0x%016lx", regs[15]); 2507 tty->print_cr("r16 = 0x%016lx", regs[16]); 2508 tty->print_cr("r17 = 0x%016lx", regs[17]); 2509 tty->print_cr("r18 = 0x%016lx", regs[18]); 2510 tty->print_cr("r19 = 0x%016lx", regs[19]); 2511 tty->print_cr("r20 = 0x%016lx", regs[20]); 2512 tty->print_cr("r21 = 0x%016lx", regs[21]); 2513 tty->print_cr("r22 = 0x%016lx", regs[22]); 2514 tty->print_cr("r23 = 0x%016lx", regs[23]); 2515 tty->print_cr("r24 = 0x%016lx", regs[24]); 2516 tty->print_cr("r25 = 0x%016lx", regs[25]); 2517 tty->print_cr("r26 = 0x%016lx", regs[26]); 2518 tty->print_cr("r27 = 0x%016lx", regs[27]); 2519 tty->print_cr("r28 = 0x%016lx", regs[28]); 2520 tty->print_cr("r30 = 0x%016lx", regs[30]); 2521 tty->print_cr("r31 = 0x%016lx", regs[31]); 2522 BREAKPOINT; 2523 } 2524 ThreadStateTransition::transition(thread, _thread_in_vm, saved_state); 2525 } else { 2526 ttyLocker ttyl; 2527 ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", 2528 msg); 2529 assert(false, "DEBUG MESSAGE: %s", msg); 2530 } 2531 } 2532 2533 void MacroAssembler::push_call_clobbered_registers() { 2534 int step = 4 * wordSize; 2535 push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp); 2536 sub(sp, sp, step); 2537 mov(rscratch1, -step); 2538 // Push v0-v7, v16-v31. 
2539 for (int i = 31; i>= 4; i -= 4) { 2540 if (i <= v7->encoding() || i >= v16->encoding()) 2541 st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1), 2542 as_FloatRegister(i), T1D, Address(post(sp, rscratch1))); 2543 } 2544 st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2), 2545 as_FloatRegister(3), T1D, Address(sp)); 2546 } 2547 2548 void MacroAssembler::pop_call_clobbered_registers() { 2549 for (int i = 0; i < 32; i += 4) { 2550 if (i <= v7->encoding() || i >= v16->encoding()) 2551 ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), 2552 as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize))); 2553 } 2554 2555 pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp); 2556 } 2557 2558 void MacroAssembler::push_CPU_state(bool save_vectors) { 2559 int step = (save_vectors ? 8 : 4) * wordSize; 2560 push(0x3fffffff, sp); // integer registers except lr & sp 2561 mov(rscratch1, -step); 2562 sub(sp, sp, step); 2563 for (int i = 28; i >= 4; i -= 4) { 2564 st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), 2565 as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1))); 2566 } 2567 st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp); 2568 } 2569 2570 void MacroAssembler::pop_CPU_state(bool restore_vectors) { 2571 int step = (restore_vectors ? 8 : 4) * wordSize; 2572 for (int i = 0; i <= 28; i += 4) 2573 ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), 2574 as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step))); 2575 pop(0x3fffffff, sp); // integer registers except lr & sp 2576 } 2577 2578 /** 2579 * Helpers for multiply_to_len(). 
2580 */ 2581 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo, 2582 Register src1, Register src2) { 2583 adds(dest_lo, dest_lo, src1); 2584 adc(dest_hi, dest_hi, zr); 2585 adds(dest_lo, dest_lo, src2); 2586 adc(final_dest_hi, dest_hi, zr); 2587 } 2588 2589 // Generate an address from (r + r1 extend offset). "size" is the 2590 // size of the operand. The result may be in rscratch2. 2591 Address MacroAssembler::offsetted_address(Register r, Register r1, 2592 Address::extend ext, int offset, int size) { 2593 if (offset || (ext.shift() % size != 0)) { 2594 lea(rscratch2, Address(r, r1, ext)); 2595 return Address(rscratch2, offset); 2596 } else { 2597 return Address(r, r1, ext); 2598 } 2599 } 2600 2601 Address MacroAssembler::spill_address(int size, int offset, Register tmp) 2602 { 2603 assert(offset >= 0, "spill to negative address?"); 2604 // Offset reachable ? 2605 // Not aligned - 9 bits signed offset 2606 // Aligned - 12 bits unsigned offset shifted 2607 Register base = sp; 2608 if ((offset & (size-1)) && offset >= (1<<8)) { 2609 add(tmp, base, offset & ((1<<12)-1)); 2610 base = tmp; 2611 offset &= -1u<<12; 2612 } 2613 2614 if (offset >= (1<<12) * size) { 2615 add(tmp, base, offset & (((1<<12)-1)<<12)); 2616 base = tmp; 2617 offset &= ~(((1<<12)-1)<<12); 2618 } 2619 2620 return Address(base, offset); 2621 } 2622 2623 // Checks whether offset is aligned. 2624 // Returns true if it is, else false. 2625 bool MacroAssembler::merge_alignment_check(Register base, 2626 size_t size, 2627 long cur_offset, 2628 long prev_offset) const { 2629 if (AvoidUnalignedAccesses) { 2630 if (base == sp) { 2631 // Checks whether low offset if aligned to pair of registers. 2632 long pair_mask = size * 2 - 1; 2633 long offset = prev_offset > cur_offset ? cur_offset : prev_offset; 2634 return (offset & pair_mask) == 0; 2635 } else { // If base is not sp, we can't guarantee the access is aligned. 
2636 return false; 2637 } 2638 } else { 2639 long mask = size - 1; 2640 // Load/store pair instruction only supports element size aligned offset. 2641 return (cur_offset & mask) == 0 && (prev_offset & mask) == 0; 2642 } 2643 } 2644 2645 // Checks whether current and previous loads/stores can be merged. 2646 // Returns true if it can be merged, else false. 2647 bool MacroAssembler::ldst_can_merge(Register rt, 2648 const Address &adr, 2649 size_t cur_size_in_bytes, 2650 bool is_store) const { 2651 address prev = pc() - NativeInstruction::instruction_size; 2652 address last = code()->last_insn(); 2653 2654 if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) { 2655 return false; 2656 } 2657 2658 if (adr.getMode() != Address::base_plus_offset || prev != last) { 2659 return false; 2660 } 2661 2662 NativeLdSt* prev_ldst = NativeLdSt_at(prev); 2663 size_t prev_size_in_bytes = prev_ldst->size_in_bytes(); 2664 2665 assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging."); 2666 assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging."); 2667 2668 if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) { 2669 return false; 2670 } 2671 2672 long max_offset = 63 * prev_size_in_bytes; 2673 long min_offset = -64 * prev_size_in_bytes; 2674 2675 assert(prev_ldst->is_not_pre_post_index(), "pre-index or post-index is not supported to be merged."); 2676 2677 // Only same base can be merged. 
2678 if (adr.base() != prev_ldst->base()) { 2679 return false; 2680 } 2681 2682 long cur_offset = adr.offset(); 2683 long prev_offset = prev_ldst->offset(); 2684 size_t diff = abs(cur_offset - prev_offset); 2685 if (diff != prev_size_in_bytes) { 2686 return false; 2687 } 2688 2689 // Following cases can not be merged: 2690 // ldr x2, [x2, #8] 2691 // ldr x3, [x2, #16] 2692 // or: 2693 // ldr x2, [x3, #8] 2694 // ldr x2, [x3, #16] 2695 // If t1 and t2 is the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL. 2696 if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) { 2697 return false; 2698 } 2699 2700 long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset; 2701 // Offset range must be in ldp/stp instruction's range. 2702 if (low_offset > max_offset || low_offset < min_offset) { 2703 return false; 2704 } 2705 2706 if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) { 2707 return true; 2708 } 2709 2710 return false; 2711 } 2712 2713 // Merge current load/store with previous load/store into ldp/stp. 2714 void MacroAssembler::merge_ldst(Register rt, 2715 const Address &adr, 2716 size_t cur_size_in_bytes, 2717 bool is_store) { 2718 2719 assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store) == true, "cur and prev must be able to be merged."); 2720 2721 Register rt_low, rt_high; 2722 address prev = pc() - NativeInstruction::instruction_size; 2723 NativeLdSt* prev_ldst = NativeLdSt_at(prev); 2724 2725 long offset; 2726 2727 if (adr.offset() < prev_ldst->offset()) { 2728 offset = adr.offset(); 2729 rt_low = rt; 2730 rt_high = prev_ldst->target(); 2731 } else { 2732 offset = prev_ldst->offset(); 2733 rt_low = prev_ldst->target(); 2734 rt_high = rt; 2735 } 2736 2737 Address adr_p = Address(prev_ldst->base(), offset); 2738 // Overwrite previous generated binary. 
2739 code_section()->set_end(prev); 2740 2741 const int sz = prev_ldst->size_in_bytes(); 2742 assert(sz == 8 || sz == 4, "only supports 64/32bit merging."); 2743 if (!is_store) { 2744 BLOCK_COMMENT("merged ldr pair"); 2745 if (sz == 8) { 2746 ldp(rt_low, rt_high, adr_p); 2747 } else { 2748 ldpw(rt_low, rt_high, adr_p); 2749 } 2750 } else { 2751 BLOCK_COMMENT("merged str pair"); 2752 if (sz == 8) { 2753 stp(rt_low, rt_high, adr_p); 2754 } else { 2755 stpw(rt_low, rt_high, adr_p); 2756 } 2757 } 2758 } 2759 2760 /** 2761 * Multiply 64 bit by 64 bit first loop. 2762 */ 2763 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart, 2764 Register y, Register y_idx, Register z, 2765 Register carry, Register product, 2766 Register idx, Register kdx) { 2767 // 2768 // jlong carry, x[], y[], z[]; 2769 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) { 2770 // huge_128 product = y[idx] * x[xstart] + carry; 2771 // z[kdx] = (jlong)product; 2772 // carry = (jlong)(product >>> 64); 2773 // } 2774 // z[xstart] = carry; 2775 // 2776 2777 Label L_first_loop, L_first_loop_exit; 2778 Label L_one_x, L_one_y, L_multiply; 2779 2780 subsw(xstart, xstart, 1); 2781 br(Assembler::MI, L_one_x); 2782 2783 lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt))); 2784 ldr(x_xstart, Address(rscratch1)); 2785 ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian 2786 2787 bind(L_first_loop); 2788 subsw(idx, idx, 1); 2789 br(Assembler::MI, L_first_loop_exit); 2790 subsw(idx, idx, 1); 2791 br(Assembler::MI, L_one_y); 2792 lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt))); 2793 ldr(y_idx, Address(rscratch1)); 2794 ror(y_idx, y_idx, 32); // convert big-endian to little-endian 2795 bind(L_multiply); 2796 2797 // AArch64 has a multiply-accumulate instruction that we can't use 2798 // here because it has no way to process carries, so we have to use 2799 // separate add and adc instructions. Bah. 
2800 umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product 2801 mul(product, x_xstart, y_idx); 2802 adds(product, product, carry); 2803 adc(carry, rscratch1, zr); // x_xstart * y_idx + carry -> carry:product 2804 2805 subw(kdx, kdx, 2); 2806 ror(product, product, 32); // back to big-endian 2807 str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong)); 2808 2809 b(L_first_loop); 2810 2811 bind(L_one_y); 2812 ldrw(y_idx, Address(y, 0)); 2813 b(L_multiply); 2814 2815 bind(L_one_x); 2816 ldrw(x_xstart, Address(x, 0)); 2817 b(L_first_loop); 2818 2819 bind(L_first_loop_exit); 2820 } 2821 2822 /** 2823 * Multiply 128 bit by 128. Unrolled inner loop. 2824 * 2825 */ 2826 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z, 2827 Register carry, Register carry2, 2828 Register idx, Register jdx, 2829 Register yz_idx1, Register yz_idx2, 2830 Register tmp, Register tmp3, Register tmp4, 2831 Register tmp6, Register product_hi) { 2832 2833 // jlong carry, x[], y[], z[]; 2834 // int kdx = ystart+1; 2835 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 2836 // huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry; 2837 // jlong carry2 = (jlong)(tmp3 >>> 64); 2838 // huge_128 tmp4 = (y[idx] * product_hi) + z[kdx+idx] + carry2; 2839 // carry = (jlong)(tmp4 >>> 64); 2840 // z[kdx+idx+1] = (jlong)tmp3; 2841 // z[kdx+idx] = (jlong)tmp4; 2842 // } 2843 // idx += 2; 2844 // if (idx > 0) { 2845 // yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry; 2846 // z[kdx+idx] = (jlong)yz_idx1; 2847 // carry = (jlong)(yz_idx1 >>> 64); 2848 // } 2849 // 2850 2851 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 2852 2853 lsrw(jdx, idx, 2); 2854 2855 bind(L_third_loop); 2856 2857 subsw(jdx, jdx, 1); 2858 br(Assembler::MI, L_third_loop_exit); 2859 subw(idx, idx, 4); 2860 2861 lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt))); 2862 2863 ldp(yz_idx2, yz_idx1, Address(rscratch1, 0)); 2864 
2865 lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt))); 2866 2867 ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian 2868 ror(yz_idx2, yz_idx2, 32); 2869 2870 ldp(rscratch2, rscratch1, Address(tmp6, 0)); 2871 2872 mul(tmp3, product_hi, yz_idx1); // yz_idx1 * product_hi -> tmp4:tmp3 2873 umulh(tmp4, product_hi, yz_idx1); 2874 2875 ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian 2876 ror(rscratch2, rscratch2, 32); 2877 2878 mul(tmp, product_hi, yz_idx2); // yz_idx2 * product_hi -> carry2:tmp 2879 umulh(carry2, product_hi, yz_idx2); 2880 2881 // propagate sum of both multiplications into carry:tmp4:tmp3 2882 adds(tmp3, tmp3, carry); 2883 adc(tmp4, tmp4, zr); 2884 adds(tmp3, tmp3, rscratch1); 2885 adcs(tmp4, tmp4, tmp); 2886 adc(carry, carry2, zr); 2887 adds(tmp4, tmp4, rscratch2); 2888 adc(carry, carry, zr); 2889 2890 ror(tmp3, tmp3, 32); // convert little-endian to big-endian 2891 ror(tmp4, tmp4, 32); 2892 stp(tmp4, tmp3, Address(tmp6, 0)); 2893 2894 b(L_third_loop); 2895 bind (L_third_loop_exit); 2896 2897 andw (idx, idx, 0x3); 2898 cbz(idx, L_post_third_loop_done); 2899 2900 Label L_check_1; 2901 subsw(idx, idx, 2); 2902 br(Assembler::MI, L_check_1); 2903 2904 lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt))); 2905 ldr(yz_idx1, Address(rscratch1, 0)); 2906 ror(yz_idx1, yz_idx1, 32); 2907 mul(tmp3, product_hi, yz_idx1); // yz_idx1 * product_hi -> tmp4:tmp3 2908 umulh(tmp4, product_hi, yz_idx1); 2909 lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt))); 2910 ldr(yz_idx2, Address(rscratch1, 0)); 2911 ror(yz_idx2, yz_idx2, 32); 2912 2913 add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2); 2914 2915 ror(tmp3, tmp3, 32); 2916 str(tmp3, Address(rscratch1, 0)); 2917 2918 bind (L_check_1); 2919 2920 andw (idx, idx, 0x1); 2921 subsw(idx, idx, 1); 2922 br(Assembler::MI, L_post_third_loop_done); 2923 ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt))); 2924 mul(tmp3, tmp4, product_hi); // tmp4 * 
product_hi -> carry2:tmp3 2925 umulh(carry2, tmp4, product_hi); 2926 ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt))); 2927 2928 add2_with_carry(carry2, tmp3, tmp4, carry); 2929 2930 strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt))); 2931 extr(carry, carry2, tmp3, 32); 2932 2933 bind(L_post_third_loop_done); 2934 } 2935 2936 /** 2937 * Code for BigInteger::multiplyToLen() instrinsic. 2938 * 2939 * r0: x 2940 * r1: xlen 2941 * r2: y 2942 * r3: ylen 2943 * r4: z 2944 * r5: zlen 2945 * r10: tmp1 2946 * r11: tmp2 2947 * r12: tmp3 2948 * r13: tmp4 2949 * r14: tmp5 2950 * r15: tmp6 2951 * r16: tmp7 2952 * 2953 */ 2954 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, 2955 Register z, Register zlen, 2956 Register tmp1, Register tmp2, Register tmp3, Register tmp4, 2957 Register tmp5, Register tmp6, Register product_hi) { 2958 2959 assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6); 2960 2961 const Register idx = tmp1; 2962 const Register kdx = tmp2; 2963 const Register xstart = tmp3; 2964 2965 const Register y_idx = tmp4; 2966 const Register carry = tmp5; 2967 const Register product = xlen; 2968 const Register x_xstart = zlen; // reuse register 2969 2970 // First Loop. 
2971 // 2972 // final static long LONG_MASK = 0xffffffffL; 2973 // int xstart = xlen - 1; 2974 // int ystart = ylen - 1; 2975 // long carry = 0; 2976 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) { 2977 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry; 2978 // z[kdx] = (int)product; 2979 // carry = product >>> 32; 2980 // } 2981 // z[xstart] = (int)carry; 2982 // 2983 2984 movw(idx, ylen); // idx = ylen; 2985 movw(kdx, zlen); // kdx = xlen+ylen; 2986 mov(carry, zr); // carry = 0; 2987 2988 Label L_done; 2989 2990 movw(xstart, xlen); 2991 subsw(xstart, xstart, 1); 2992 br(Assembler::MI, L_done); 2993 2994 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx); 2995 2996 Label L_second_loop; 2997 cbzw(kdx, L_second_loop); 2998 2999 Label L_carry; 3000 subw(kdx, kdx, 1); 3001 cbzw(kdx, L_carry); 3002 3003 strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt))); 3004 lsr(carry, carry, 32); 3005 subw(kdx, kdx, 1); 3006 3007 bind(L_carry); 3008 strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt))); 3009 3010 // Second and third (nested) loops. 
3011 // 3012 // for (int i = xstart-1; i >= 0; i--) { // Second loop 3013 // carry = 0; 3014 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop 3015 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) + 3016 // (z[k] & LONG_MASK) + carry; 3017 // z[k] = (int)product; 3018 // carry = product >>> 32; 3019 // } 3020 // z[i] = (int)carry; 3021 // } 3022 // 3023 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi 3024 3025 const Register jdx = tmp1; 3026 3027 bind(L_second_loop); 3028 mov(carry, zr); // carry = 0; 3029 movw(jdx, ylen); // j = ystart+1 3030 3031 subsw(xstart, xstart, 1); // i = xstart-1; 3032 br(Assembler::MI, L_done); 3033 3034 str(z, Address(pre(sp, -4 * wordSize))); 3035 3036 Label L_last_x; 3037 lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j 3038 subsw(xstart, xstart, 1); // i = xstart-1; 3039 br(Assembler::MI, L_last_x); 3040 3041 lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt))); 3042 ldr(product_hi, Address(rscratch1)); 3043 ror(product_hi, product_hi, 32); // convert big-endian to little-endian 3044 3045 Label L_third_loop_prologue; 3046 bind(L_third_loop_prologue); 3047 3048 str(ylen, Address(sp, wordSize)); 3049 stp(x, xstart, Address(sp, 2 * wordSize)); 3050 multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product, 3051 tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi); 3052 ldp(z, ylen, Address(post(sp, 2 * wordSize))); 3053 ldp(x, xlen, Address(post(sp, 2 * wordSize))); // copy old xstart -> xlen 3054 3055 addw(tmp3, xlen, 1); 3056 strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt))); 3057 subsw(tmp3, tmp3, 1); 3058 br(Assembler::MI, L_done); 3059 3060 lsr(carry, carry, 32); 3061 strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt))); 3062 b(L_second_loop); 3063 3064 // Next infrequent code is moved outside loops. 
3065 bind(L_last_x); 3066 ldrw(product_hi, Address(x, 0)); 3067 b(L_third_loop_prologue); 3068 3069 bind(L_done); 3070 } 3071 3072 // Code for BigInteger::mulAdd instrinsic 3073 // out = r0 3074 // in = r1 3075 // offset = r2 (already out.length-offset) 3076 // len = r3 3077 // k = r4 3078 // 3079 // pseudo code from java implementation: 3080 // carry = 0; 3081 // offset = out.length-offset - 1; 3082 // for (int j=len-1; j >= 0; j--) { 3083 // product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry; 3084 // out[offset--] = (int)product; 3085 // carry = product >>> 32; 3086 // } 3087 // return (int)carry; 3088 void MacroAssembler::mul_add(Register out, Register in, Register offset, 3089 Register len, Register k) { 3090 Label LOOP, END; 3091 // pre-loop 3092 cmp(len, zr); // cmp, not cbz/cbnz: to use condition twice => less branches 3093 csel(out, zr, out, Assembler::EQ); 3094 br(Assembler::EQ, END); 3095 add(in, in, len, LSL, 2); // in[j+1] address 3096 add(offset, out, offset, LSL, 2); // out[offset + 1] address 3097 mov(out, zr); // used to keep carry now 3098 BIND(LOOP); 3099 ldrw(rscratch1, Address(pre(in, -4))); 3100 madd(rscratch1, rscratch1, k, out); 3101 ldrw(rscratch2, Address(pre(offset, -4))); 3102 add(rscratch1, rscratch1, rscratch2); 3103 strw(rscratch1, Address(offset)); 3104 lsr(out, rscratch1, 32); 3105 subs(len, len, 1); 3106 br(Assembler::NE, LOOP); 3107 BIND(END); 3108 } 3109 3110 /** 3111 * Emits code to update CRC-32 with a byte value according to constants in table 3112 * 3113 * @param [in,out]crc Register containing the crc. 3114 * @param [in]val Register containing the byte to fold into the CRC. 3115 * @param [in]table Register containing the table of crc constants. 
3116 * 3117 * uint32_t crc; 3118 * val = crc_table[(val ^ crc) & 0xFF]; 3119 * crc = val ^ (crc >> 8); 3120 * 3121 */ 3122 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 3123 eor(val, val, crc); 3124 andr(val, val, 0xff); 3125 ldrw(val, Address(table, val, Address::lsl(2))); 3126 eor(crc, val, crc, Assembler::LSR, 8); 3127 } 3128 3129 /** 3130 * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3 3131 * 3132 * @param [in,out]crc Register containing the crc. 3133 * @param [in]v Register containing the 32-bit to fold into the CRC. 3134 * @param [in]table0 Register containing table 0 of crc constants. 3135 * @param [in]table1 Register containing table 1 of crc constants. 3136 * @param [in]table2 Register containing table 2 of crc constants. 3137 * @param [in]table3 Register containing table 3 of crc constants. 3138 * 3139 * uint32_t crc; 3140 * v = crc ^ v 3141 * crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24] 3142 * 3143 */ 3144 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp, 3145 Register table0, Register table1, Register table2, Register table3, 3146 bool upper) { 3147 eor(v, crc, v, upper ? LSR:LSL, upper ? 
32:0); 3148 uxtb(tmp, v); 3149 ldrw(crc, Address(table3, tmp, Address::lsl(2))); 3150 ubfx(tmp, v, 8, 8); 3151 ldrw(tmp, Address(table2, tmp, Address::lsl(2))); 3152 eor(crc, crc, tmp); 3153 ubfx(tmp, v, 16, 8); 3154 ldrw(tmp, Address(table1, tmp, Address::lsl(2))); 3155 eor(crc, crc, tmp); 3156 ubfx(tmp, v, 24, 8); 3157 ldrw(tmp, Address(table0, tmp, Address::lsl(2))); 3158 eor(crc, crc, tmp); 3159 } 3160 3161 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf, 3162 Register len, Register tmp0, Register tmp1, Register tmp2, 3163 Register tmp3) { 3164 Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit; 3165 assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3); 3166 3167 mvnw(crc, crc); 3168 3169 subs(len, len, 128); 3170 br(Assembler::GE, CRC_by64_pre); 3171 BIND(CRC_less64); 3172 adds(len, len, 128-32); 3173 br(Assembler::GE, CRC_by32_loop); 3174 BIND(CRC_less32); 3175 adds(len, len, 32-4); 3176 br(Assembler::GE, CRC_by4_loop); 3177 adds(len, len, 4); 3178 br(Assembler::GT, CRC_by1_loop); 3179 b(L_exit); 3180 3181 BIND(CRC_by32_loop); 3182 ldp(tmp0, tmp1, Address(post(buf, 16))); 3183 subs(len, len, 32); 3184 crc32x(crc, crc, tmp0); 3185 ldr(tmp2, Address(post(buf, 8))); 3186 crc32x(crc, crc, tmp1); 3187 ldr(tmp3, Address(post(buf, 8))); 3188 crc32x(crc, crc, tmp2); 3189 crc32x(crc, crc, tmp3); 3190 br(Assembler::GE, CRC_by32_loop); 3191 cmn(len, 32); 3192 br(Assembler::NE, CRC_less32); 3193 b(L_exit); 3194 3195 BIND(CRC_by4_loop); 3196 ldrw(tmp0, Address(post(buf, 4))); 3197 subs(len, len, 4); 3198 crc32w(crc, crc, tmp0); 3199 br(Assembler::GE, CRC_by4_loop); 3200 adds(len, len, 4); 3201 br(Assembler::LE, L_exit); 3202 BIND(CRC_by1_loop); 3203 ldrb(tmp0, Address(post(buf, 1))); 3204 subs(len, len, 1); 3205 crc32b(crc, crc, tmp0); 3206 br(Assembler::GT, CRC_by1_loop); 3207 b(L_exit); 3208 3209 BIND(CRC_by64_pre); 3210 sub(buf, buf, 8); 3211 ldp(tmp0, tmp1, Address(buf, 
8)); 3212 crc32x(crc, crc, tmp0); 3213 ldr(tmp2, Address(buf, 24)); 3214 crc32x(crc, crc, tmp1); 3215 ldr(tmp3, Address(buf, 32)); 3216 crc32x(crc, crc, tmp2); 3217 ldr(tmp0, Address(buf, 40)); 3218 crc32x(crc, crc, tmp3); 3219 ldr(tmp1, Address(buf, 48)); 3220 crc32x(crc, crc, tmp0); 3221 ldr(tmp2, Address(buf, 56)); 3222 crc32x(crc, crc, tmp1); 3223 ldr(tmp3, Address(pre(buf, 64))); 3224 3225 b(CRC_by64_loop); 3226 3227 align(CodeEntryAlignment); 3228 BIND(CRC_by64_loop); 3229 subs(len, len, 64); 3230 crc32x(crc, crc, tmp2); 3231 ldr(tmp0, Address(buf, 8)); 3232 crc32x(crc, crc, tmp3); 3233 ldr(tmp1, Address(buf, 16)); 3234 crc32x(crc, crc, tmp0); 3235 ldr(tmp2, Address(buf, 24)); 3236 crc32x(crc, crc, tmp1); 3237 ldr(tmp3, Address(buf, 32)); 3238 crc32x(crc, crc, tmp2); 3239 ldr(tmp0, Address(buf, 40)); 3240 crc32x(crc, crc, tmp3); 3241 ldr(tmp1, Address(buf, 48)); 3242 crc32x(crc, crc, tmp0); 3243 ldr(tmp2, Address(buf, 56)); 3244 crc32x(crc, crc, tmp1); 3245 ldr(tmp3, Address(pre(buf, 64))); 3246 br(Assembler::GE, CRC_by64_loop); 3247 3248 // post-loop 3249 crc32x(crc, crc, tmp2); 3250 crc32x(crc, crc, tmp3); 3251 3252 sub(len, len, 64); 3253 add(buf, buf, 8); 3254 cmn(len, 128); 3255 br(Assembler::NE, CRC_less64); 3256 BIND(L_exit); 3257 mvnw(crc, crc); 3258 } 3259 3260 /** 3261 * @param crc register containing existing CRC (32-bit) 3262 * @param buf register pointing to input byte buffer (byte*) 3263 * @param len register containing number of bytes 3264 * @param table register that will contain address of CRC table 3265 * @param tmp scratch register 3266 */ 3267 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, 3268 Register table0, Register table1, Register table2, Register table3, 3269 Register tmp, Register tmp2, Register tmp3) { 3270 Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit; 3271 unsigned long offset; 3272 3273 if (UseCRC32) { 3274 kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, 
table3); 3275 return; 3276 } 3277 3278 mvnw(crc, crc); 3279 3280 adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset); 3281 if (offset) add(table0, table0, offset); 3282 add(table1, table0, 1*256*sizeof(juint)); 3283 add(table2, table0, 2*256*sizeof(juint)); 3284 add(table3, table0, 3*256*sizeof(juint)); 3285 3286 if (UseNeon) { 3287 cmp(len, 64); 3288 br(Assembler::LT, L_by16); 3289 eor(v16, T16B, v16, v16); 3290 3291 Label L_fold; 3292 3293 add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants 3294 3295 ld1(v0, v1, T2D, post(buf, 32)); 3296 ld1r(v4, T2D, post(tmp, 8)); 3297 ld1r(v5, T2D, post(tmp, 8)); 3298 ld1r(v6, T2D, post(tmp, 8)); 3299 ld1r(v7, T2D, post(tmp, 8)); 3300 mov(v16, T4S, 0, crc); 3301 3302 eor(v0, T16B, v0, v16); 3303 sub(len, len, 64); 3304 3305 BIND(L_fold); 3306 pmull(v22, T8H, v0, v5, T8B); 3307 pmull(v20, T8H, v0, v7, T8B); 3308 pmull(v23, T8H, v0, v4, T8B); 3309 pmull(v21, T8H, v0, v6, T8B); 3310 3311 pmull2(v18, T8H, v0, v5, T16B); 3312 pmull2(v16, T8H, v0, v7, T16B); 3313 pmull2(v19, T8H, v0, v4, T16B); 3314 pmull2(v17, T8H, v0, v6, T16B); 3315 3316 uzp1(v24, T8H, v20, v22); 3317 uzp2(v25, T8H, v20, v22); 3318 eor(v20, T16B, v24, v25); 3319 3320 uzp1(v26, T8H, v16, v18); 3321 uzp2(v27, T8H, v16, v18); 3322 eor(v16, T16B, v26, v27); 3323 3324 ushll2(v22, T4S, v20, T8H, 8); 3325 ushll(v20, T4S, v20, T4H, 8); 3326 3327 ushll2(v18, T4S, v16, T8H, 8); 3328 ushll(v16, T4S, v16, T4H, 8); 3329 3330 eor(v22, T16B, v23, v22); 3331 eor(v18, T16B, v19, v18); 3332 eor(v20, T16B, v21, v20); 3333 eor(v16, T16B, v17, v16); 3334 3335 uzp1(v17, T2D, v16, v20); 3336 uzp2(v21, T2D, v16, v20); 3337 eor(v17, T16B, v17, v21); 3338 3339 ushll2(v20, T2D, v17, T4S, 16); 3340 ushll(v16, T2D, v17, T2S, 16); 3341 3342 eor(v20, T16B, v20, v22); 3343 eor(v16, T16B, v16, v18); 3344 3345 uzp1(v17, T2D, v20, v16); 3346 uzp2(v21, T2D, v20, v16); 3347 eor(v28, T16B, v17, v21); 3348 3349 pmull(v22, T8H, v1, v5, T8B); 3350 pmull(v20, T8H, v1, 
v7, T8B); 3351 pmull(v23, T8H, v1, v4, T8B); 3352 pmull(v21, T8H, v1, v6, T8B); 3353 3354 pmull2(v18, T8H, v1, v5, T16B); 3355 pmull2(v16, T8H, v1, v7, T16B); 3356 pmull2(v19, T8H, v1, v4, T16B); 3357 pmull2(v17, T8H, v1, v6, T16B); 3358 3359 ld1(v0, v1, T2D, post(buf, 32)); 3360 3361 uzp1(v24, T8H, v20, v22); 3362 uzp2(v25, T8H, v20, v22); 3363 eor(v20, T16B, v24, v25); 3364 3365 uzp1(v26, T8H, v16, v18); 3366 uzp2(v27, T8H, v16, v18); 3367 eor(v16, T16B, v26, v27); 3368 3369 ushll2(v22, T4S, v20, T8H, 8); 3370 ushll(v20, T4S, v20, T4H, 8); 3371 3372 ushll2(v18, T4S, v16, T8H, 8); 3373 ushll(v16, T4S, v16, T4H, 8); 3374 3375 eor(v22, T16B, v23, v22); 3376 eor(v18, T16B, v19, v18); 3377 eor(v20, T16B, v21, v20); 3378 eor(v16, T16B, v17, v16); 3379 3380 uzp1(v17, T2D, v16, v20); 3381 uzp2(v21, T2D, v16, v20); 3382 eor(v16, T16B, v17, v21); 3383 3384 ushll2(v20, T2D, v16, T4S, 16); 3385 ushll(v16, T2D, v16, T2S, 16); 3386 3387 eor(v20, T16B, v22, v20); 3388 eor(v16, T16B, v16, v18); 3389 3390 uzp1(v17, T2D, v20, v16); 3391 uzp2(v21, T2D, v20, v16); 3392 eor(v20, T16B, v17, v21); 3393 3394 shl(v16, T2D, v28, 1); 3395 shl(v17, T2D, v20, 1); 3396 3397 eor(v0, T16B, v0, v16); 3398 eor(v1, T16B, v1, v17); 3399 3400 subs(len, len, 32); 3401 br(Assembler::GE, L_fold); 3402 3403 mov(crc, 0); 3404 mov(tmp, v0, T1D, 0); 3405 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3406 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3407 mov(tmp, v0, T1D, 1); 3408 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3409 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3410 mov(tmp, v1, T1D, 0); 3411 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3412 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3413 mov(tmp, v1, T1D, 1); 3414 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3415 update_word_crc32(crc, tmp, tmp2, 
table0, table1, table2, table3, true); 3416 3417 add(len, len, 32); 3418 } 3419 3420 BIND(L_by16); 3421 subs(len, len, 16); 3422 br(Assembler::GE, L_by16_loop); 3423 adds(len, len, 16-4); 3424 br(Assembler::GE, L_by4_loop); 3425 adds(len, len, 4); 3426 br(Assembler::GT, L_by1_loop); 3427 b(L_exit); 3428 3429 BIND(L_by4_loop); 3430 ldrw(tmp, Address(post(buf, 4))); 3431 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3); 3432 subs(len, len, 4); 3433 br(Assembler::GE, L_by4_loop); 3434 adds(len, len, 4); 3435 br(Assembler::LE, L_exit); 3436 BIND(L_by1_loop); 3437 subs(len, len, 1); 3438 ldrb(tmp, Address(post(buf, 1))); 3439 update_byte_crc32(crc, tmp, table0); 3440 br(Assembler::GT, L_by1_loop); 3441 b(L_exit); 3442 3443 align(CodeEntryAlignment); 3444 BIND(L_by16_loop); 3445 subs(len, len, 16); 3446 ldp(tmp, tmp3, Address(post(buf, 16))); 3447 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3448 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3449 update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false); 3450 update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true); 3451 br(Assembler::GE, L_by16_loop); 3452 adds(len, len, 16-4); 3453 br(Assembler::GE, L_by4_loop); 3454 adds(len, len, 4); 3455 br(Assembler::GT, L_by1_loop); 3456 BIND(L_exit); 3457 mvnw(crc, crc); 3458 } 3459 3460 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf, 3461 Register len, Register tmp0, Register tmp1, Register tmp2, 3462 Register tmp3) { 3463 Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit; 3464 assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3); 3465 3466 subs(len, len, 128); 3467 br(Assembler::GE, CRC_by64_pre); 3468 BIND(CRC_less64); 3469 adds(len, len, 128-32); 3470 br(Assembler::GE, CRC_by32_loop); 3471 BIND(CRC_less32); 3472 adds(len, len, 32-4); 3473 br(Assembler::GE, CRC_by4_loop); 
3474 adds(len, len, 4); 3475 br(Assembler::GT, CRC_by1_loop); 3476 b(L_exit); 3477 3478 BIND(CRC_by32_loop); 3479 ldp(tmp0, tmp1, Address(post(buf, 16))); 3480 subs(len, len, 32); 3481 crc32cx(crc, crc, tmp0); 3482 ldr(tmp2, Address(post(buf, 8))); 3483 crc32cx(crc, crc, tmp1); 3484 ldr(tmp3, Address(post(buf, 8))); 3485 crc32cx(crc, crc, tmp2); 3486 crc32cx(crc, crc, tmp3); 3487 br(Assembler::GE, CRC_by32_loop); 3488 cmn(len, 32); 3489 br(Assembler::NE, CRC_less32); 3490 b(L_exit); 3491 3492 BIND(CRC_by4_loop); 3493 ldrw(tmp0, Address(post(buf, 4))); 3494 subs(len, len, 4); 3495 crc32cw(crc, crc, tmp0); 3496 br(Assembler::GE, CRC_by4_loop); 3497 adds(len, len, 4); 3498 br(Assembler::LE, L_exit); 3499 BIND(CRC_by1_loop); 3500 ldrb(tmp0, Address(post(buf, 1))); 3501 subs(len, len, 1); 3502 crc32cb(crc, crc, tmp0); 3503 br(Assembler::GT, CRC_by1_loop); 3504 b(L_exit); 3505 3506 BIND(CRC_by64_pre); 3507 sub(buf, buf, 8); 3508 ldp(tmp0, tmp1, Address(buf, 8)); 3509 crc32cx(crc, crc, tmp0); 3510 ldr(tmp2, Address(buf, 24)); 3511 crc32cx(crc, crc, tmp1); 3512 ldr(tmp3, Address(buf, 32)); 3513 crc32cx(crc, crc, tmp2); 3514 ldr(tmp0, Address(buf, 40)); 3515 crc32cx(crc, crc, tmp3); 3516 ldr(tmp1, Address(buf, 48)); 3517 crc32cx(crc, crc, tmp0); 3518 ldr(tmp2, Address(buf, 56)); 3519 crc32cx(crc, crc, tmp1); 3520 ldr(tmp3, Address(pre(buf, 64))); 3521 3522 b(CRC_by64_loop); 3523 3524 align(CodeEntryAlignment); 3525 BIND(CRC_by64_loop); 3526 subs(len, len, 64); 3527 crc32cx(crc, crc, tmp2); 3528 ldr(tmp0, Address(buf, 8)); 3529 crc32cx(crc, crc, tmp3); 3530 ldr(tmp1, Address(buf, 16)); 3531 crc32cx(crc, crc, tmp0); 3532 ldr(tmp2, Address(buf, 24)); 3533 crc32cx(crc, crc, tmp1); 3534 ldr(tmp3, Address(buf, 32)); 3535 crc32cx(crc, crc, tmp2); 3536 ldr(tmp0, Address(buf, 40)); 3537 crc32cx(crc, crc, tmp3); 3538 ldr(tmp1, Address(buf, 48)); 3539 crc32cx(crc, crc, tmp0); 3540 ldr(tmp2, Address(buf, 56)); 3541 crc32cx(crc, crc, tmp1); 3542 ldr(tmp3, Address(pre(buf, 64))); 3543 
br(Assembler::GE, CRC_by64_loop); 3544 3545 // post-loop 3546 crc32cx(crc, crc, tmp2); 3547 crc32cx(crc, crc, tmp3); 3548 3549 sub(len, len, 64); 3550 add(buf, buf, 8); 3551 cmn(len, 128); 3552 br(Assembler::NE, CRC_less64); 3553 BIND(L_exit); 3554 } 3555 3556 /** 3557 * @param crc register containing existing CRC (32-bit) 3558 * @param buf register pointing to input byte buffer (byte*) 3559 * @param len register containing number of bytes 3560 * @param table register that will contain address of CRC table 3561 * @param tmp scratch register 3562 */ 3563 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len, 3564 Register table0, Register table1, Register table2, Register table3, 3565 Register tmp, Register tmp2, Register tmp3) { 3566 kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3); 3567 } 3568 3569 3570 SkipIfEqual::SkipIfEqual( 3571 MacroAssembler* masm, const bool* flag_addr, bool value) { 3572 _masm = masm; 3573 unsigned long offset; 3574 _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset); 3575 _masm->ldrb(rscratch1, Address(rscratch1, offset)); 3576 _masm->cbzw(rscratch1, _label); 3577 } 3578 3579 SkipIfEqual::~SkipIfEqual() { 3580 _masm->bind(_label); 3581 } 3582 3583 void MacroAssembler::addptr(const Address &dst, int32_t src) { 3584 Address adr; 3585 switch(dst.getMode()) { 3586 case Address::base_plus_offset: 3587 // This is the expected mode, although we allow all the other 3588 // forms below. 
3589 adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord); 3590 break; 3591 default: 3592 lea(rscratch2, dst); 3593 adr = Address(rscratch2); 3594 break; 3595 } 3596 ldr(rscratch1, adr); 3597 add(rscratch1, rscratch1, src); 3598 str(rscratch1, adr); 3599 } 3600 3601 void MacroAssembler::cmpptr(Register src1, Address src2) { 3602 unsigned long offset; 3603 adrp(rscratch1, src2, offset); 3604 ldr(rscratch1, Address(rscratch1, offset)); 3605 cmp(src1, rscratch1); 3606 } 3607 3608 void MacroAssembler::cmpoop(Register obj1, Register obj2) { 3609 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3610 bs->obj_equals(this, obj1, obj2); 3611 } 3612 3613 void MacroAssembler::load_klass(Register dst, Register src) { 3614 if (UseCompressedClassPointers) { 3615 ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes())); 3616 decode_klass_not_null(dst); 3617 } else { 3618 ldr(dst, Address(src, oopDesc::klass_offset_in_bytes())); 3619 } 3620 } 3621 3622 // ((OopHandle)result).resolve(); 3623 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) { 3624 // OopHandle::resolve is an indirection. 
3625 access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg); 3626 } 3627 3628 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) { 3629 const int mirror_offset = in_bytes(Klass::java_mirror_offset()); 3630 ldr(dst, Address(rmethod, Method::const_offset())); 3631 ldr(dst, Address(dst, ConstMethod::constants_offset())); 3632 ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes())); 3633 ldr(dst, Address(dst, mirror_offset)); 3634 resolve_oop_handle(dst, tmp); 3635 } 3636 3637 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) { 3638 if (UseCompressedClassPointers) { 3639 ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes())); 3640 if (Universe::narrow_klass_base() == NULL) { 3641 cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift()); 3642 return; 3643 } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0 3644 && Universe::narrow_klass_shift() == 0) { 3645 // Only the bottom 32 bits matter 3646 cmpw(trial_klass, tmp); 3647 return; 3648 } 3649 decode_klass_not_null(tmp); 3650 } else { 3651 ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes())); 3652 } 3653 cmp(trial_klass, tmp); 3654 } 3655 3656 void MacroAssembler::load_prototype_header(Register dst, Register src) { 3657 load_klass(dst, src); 3658 ldr(dst, Address(dst, Klass::prototype_header_offset())); 3659 } 3660 3661 void MacroAssembler::store_klass(Register dst, Register src) { 3662 // FIXME: Should this be a store release? concurrent gcs assumes 3663 // klass length is valid if klass field is not null. 
3664 if (UseCompressedClassPointers) { 3665 encode_klass_not_null(src); 3666 strw(src, Address(dst, oopDesc::klass_offset_in_bytes())); 3667 } else { 3668 str(src, Address(dst, oopDesc::klass_offset_in_bytes())); 3669 } 3670 } 3671 3672 void MacroAssembler::store_klass_gap(Register dst, Register src) { 3673 if (UseCompressedClassPointers) { 3674 // Store to klass gap in destination 3675 strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes())); 3676 } 3677 } 3678 3679 // Algorithm must match CompressedOops::encode. 3680 void MacroAssembler::encode_heap_oop(Register d, Register s) { 3681 #ifdef ASSERT 3682 verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?"); 3683 #endif 3684 verify_oop(s, "broken oop in encode_heap_oop"); 3685 if (Universe::narrow_oop_base() == NULL) { 3686 if (Universe::narrow_oop_shift() != 0) { 3687 assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3688 lsr(d, s, LogMinObjAlignmentInBytes); 3689 } else { 3690 mov(d, s); 3691 } 3692 } else { 3693 subs(d, s, rheapbase); 3694 csel(d, d, zr, Assembler::HS); 3695 lsr(d, d, LogMinObjAlignmentInBytes); 3696 3697 /* Old algorithm: is this any worse? 
3698 Label nonnull; 3699 cbnz(r, nonnull); 3700 sub(r, r, rheapbase); 3701 bind(nonnull); 3702 lsr(r, r, LogMinObjAlignmentInBytes); 3703 */ 3704 } 3705 } 3706 3707 void MacroAssembler::encode_heap_oop_not_null(Register r) { 3708 #ifdef ASSERT 3709 verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?"); 3710 if (CheckCompressedOops) { 3711 Label ok; 3712 cbnz(r, ok); 3713 stop("null oop passed to encode_heap_oop_not_null"); 3714 bind(ok); 3715 } 3716 #endif 3717 verify_oop(r, "broken oop in encode_heap_oop_not_null"); 3718 if (Universe::narrow_oop_base() != NULL) { 3719 sub(r, r, rheapbase); 3720 } 3721 if (Universe::narrow_oop_shift() != 0) { 3722 assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3723 lsr(r, r, LogMinObjAlignmentInBytes); 3724 } 3725 } 3726 3727 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) { 3728 #ifdef ASSERT 3729 verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?"); 3730 if (CheckCompressedOops) { 3731 Label ok; 3732 cbnz(src, ok); 3733 stop("null oop passed to encode_heap_oop_not_null2"); 3734 bind(ok); 3735 } 3736 #endif 3737 verify_oop(src, "broken oop in encode_heap_oop_not_null2"); 3738 3739 Register data = src; 3740 if (Universe::narrow_oop_base() != NULL) { 3741 sub(dst, src, rheapbase); 3742 data = dst; 3743 } 3744 if (Universe::narrow_oop_shift() != 0) { 3745 assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3746 lsr(dst, data, LogMinObjAlignmentInBytes); 3747 data = dst; 3748 } 3749 if (data == src) 3750 mov(dst, src); 3751 } 3752 3753 void MacroAssembler::decode_heap_oop(Register d, Register s) { 3754 #ifdef ASSERT 3755 verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?"); 3756 #endif 3757 if (Universe::narrow_oop_base() == NULL) { 3758 if (Universe::narrow_oop_shift() != 0 || d != s) { 3759 lsl(d, s, Universe::narrow_oop_shift()); 3760 } 3761 } else { 
3762 Label done; 3763 if (d != s) 3764 mov(d, s); 3765 cbz(s, done); 3766 add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes); 3767 bind(done); 3768 } 3769 verify_oop(d, "broken oop in decode_heap_oop"); 3770 } 3771 3772 void MacroAssembler::decode_heap_oop_not_null(Register r) { 3773 assert (UseCompressedOops, "should only be used for compressed headers"); 3774 assert (Universe::heap() != NULL, "java heap should be initialized"); 3775 // Cannot assert, unverified entry point counts instructions (see .ad file) 3776 // vtableStubs also counts instructions in pd_code_size_limit. 3777 // Also do not verify_oop as this is called by verify_oop. 3778 if (Universe::narrow_oop_shift() != 0) { 3779 assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3780 if (Universe::narrow_oop_base() != NULL) { 3781 add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes); 3782 } else { 3783 add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes); 3784 } 3785 } else { 3786 assert (Universe::narrow_oop_base() == NULL, "sanity"); 3787 } 3788 } 3789 3790 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) { 3791 assert (UseCompressedOops, "should only be used for compressed headers"); 3792 assert (Universe::heap() != NULL, "java heap should be initialized"); 3793 // Cannot assert, unverified entry point counts instructions (see .ad file) 3794 // vtableStubs also counts instructions in pd_code_size_limit. 3795 // Also do not verify_oop as this is called by verify_oop. 
3796 if (Universe::narrow_oop_shift() != 0) { 3797 assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3798 if (Universe::narrow_oop_base() != NULL) { 3799 add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes); 3800 } else { 3801 add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes); 3802 } 3803 } else { 3804 assert (Universe::narrow_oop_base() == NULL, "sanity"); 3805 if (dst != src) { 3806 mov(dst, src); 3807 } 3808 } 3809 } 3810 3811 void MacroAssembler::encode_klass_not_null(Register dst, Register src) { 3812 if (Universe::narrow_klass_base() == NULL) { 3813 if (Universe::narrow_klass_shift() != 0) { 3814 assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); 3815 lsr(dst, src, LogKlassAlignmentInBytes); 3816 } else { 3817 if (dst != src) mov(dst, src); 3818 } 3819 return; 3820 } 3821 3822 if (use_XOR_for_compressed_class_base) { 3823 if (Universe::narrow_klass_shift() != 0) { 3824 eor(dst, src, (uint64_t)Universe::narrow_klass_base()); 3825 lsr(dst, dst, LogKlassAlignmentInBytes); 3826 } else { 3827 eor(dst, src, (uint64_t)Universe::narrow_klass_base()); 3828 } 3829 return; 3830 } 3831 3832 if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0 3833 && Universe::narrow_klass_shift() == 0) { 3834 movw(dst, src); 3835 return; 3836 } 3837 3838 #ifdef ASSERT 3839 verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?"); 3840 #endif 3841 3842 Register rbase = dst; 3843 if (dst == src) rbase = rheapbase; 3844 mov(rbase, (uint64_t)Universe::narrow_klass_base()); 3845 sub(dst, src, rbase); 3846 if (Universe::narrow_klass_shift() != 0) { 3847 assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); 3848 lsr(dst, dst, LogKlassAlignmentInBytes); 3849 } 3850 if (dst == src) reinit_heapbase(); 3851 } 3852 3853 void MacroAssembler::encode_klass_not_null(Register r) { 3854 encode_klass_not_null(r, r); 3855 } 3856 3857 void 
MacroAssembler::decode_klass_not_null(Register dst, Register src) { 3858 Register rbase = dst; 3859 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 3860 3861 if (Universe::narrow_klass_base() == NULL) { 3862 if (Universe::narrow_klass_shift() != 0) { 3863 assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); 3864 lsl(dst, src, LogKlassAlignmentInBytes); 3865 } else { 3866 if (dst != src) mov(dst, src); 3867 } 3868 return; 3869 } 3870 3871 if (use_XOR_for_compressed_class_base) { 3872 if (Universe::narrow_klass_shift() != 0) { 3873 lsl(dst, src, LogKlassAlignmentInBytes); 3874 eor(dst, dst, (uint64_t)Universe::narrow_klass_base()); 3875 } else { 3876 eor(dst, src, (uint64_t)Universe::narrow_klass_base()); 3877 } 3878 return; 3879 } 3880 3881 if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0 3882 && Universe::narrow_klass_shift() == 0) { 3883 if (dst != src) 3884 movw(dst, src); 3885 movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32); 3886 return; 3887 } 3888 3889 // Cannot assert, unverified entry point counts instructions (see .ad file) 3890 // vtableStubs also counts instructions in pd_code_size_limit. 3891 // Also do not verify_oop as this is called by verify_oop. 
3892 if (dst == src) rbase = rheapbase; 3893 mov(rbase, (uint64_t)Universe::narrow_klass_base()); 3894 if (Universe::narrow_klass_shift() != 0) { 3895 assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); 3896 add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes); 3897 } else { 3898 add(dst, rbase, src); 3899 } 3900 if (dst == src) reinit_heapbase(); 3901 } 3902 3903 void MacroAssembler::decode_klass_not_null(Register r) { 3904 decode_klass_not_null(r, r); 3905 } 3906 3907 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) { 3908 #ifdef ASSERT 3909 { 3910 ThreadInVMfromUnknown tiv; 3911 assert (UseCompressedOops, "should only be used for compressed oops"); 3912 assert (Universe::heap() != NULL, "java heap should be initialized"); 3913 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 3914 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop"); 3915 } 3916 #endif 3917 int oop_index = oop_recorder()->find_index(obj); 3918 InstructionMark im(this); 3919 RelocationHolder rspec = oop_Relocation::spec(oop_index); 3920 code_section()->relocate(inst_mark(), rspec); 3921 movz(dst, 0xDEAD, 16); 3922 movk(dst, 0xBEEF); 3923 } 3924 3925 void MacroAssembler::set_narrow_klass(Register dst, Klass* k) { 3926 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 3927 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 3928 int index = oop_recorder()->find_index(k); 3929 assert(! 
Universe::heap()->is_in_reserved(k), "should not be an oop"); 3930 3931 InstructionMark im(this); 3932 RelocationHolder rspec = metadata_Relocation::spec(index); 3933 code_section()->relocate(inst_mark(), rspec); 3934 narrowKlass nk = Klass::encode_klass(k); 3935 movz(dst, (nk >> 16), 16); 3936 movk(dst, nk & 0xffff); 3937 } 3938 3939 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, 3940 Register dst, Address src, 3941 Register tmp1, Register thread_tmp) { 3942 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3943 decorators = AccessInternal::decorator_fixup(decorators); 3944 bool as_raw = (decorators & AS_RAW) != 0; 3945 if (as_raw) { 3946 bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp); 3947 } else { 3948 bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp); 3949 } 3950 } 3951 3952 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, 3953 Address dst, Register src, 3954 Register tmp1, Register thread_tmp) { 3955 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3956 decorators = AccessInternal::decorator_fixup(decorators); 3957 bool as_raw = (decorators & AS_RAW) != 0; 3958 if (as_raw) { 3959 bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp); 3960 } else { 3961 bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp); 3962 } 3963 } 3964 3965 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1, 3966 Register thread_tmp, DecoratorSet decorators) { 3967 access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp); 3968 } 3969 3970 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1, 3971 Register thread_tmp, DecoratorSet decorators) { 3972 access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp); 3973 } 3974 3975 void MacroAssembler::store_heap_oop(Address dst, 
Register src, Register tmp1, 3976 Register thread_tmp, DecoratorSet decorators) { 3977 access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp); 3978 } 3979 3980 // Used for storing NULLs. 3981 void MacroAssembler::store_heap_oop_null(Address dst) { 3982 access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg); 3983 } 3984 3985 Address MacroAssembler::allocate_metadata_address(Metadata* obj) { 3986 assert(oop_recorder() != NULL, "this assembler needs a Recorder"); 3987 int index = oop_recorder()->allocate_metadata_index(obj); 3988 RelocationHolder rspec = metadata_Relocation::spec(index); 3989 return Address((address)obj, rspec); 3990 } 3991 3992 // Move an oop into a register. immediate is true if we want 3993 // immediate instrcutions, i.e. we are not going to patch this 3994 // instruction while the code is being executed by another thread. In 3995 // that case we can use move immediates rather than the constant pool. 3996 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) { 3997 int oop_index; 3998 if (obj == NULL) { 3999 oop_index = oop_recorder()->allocate_oop_index(obj); 4000 } else { 4001 #ifdef ASSERT 4002 { 4003 ThreadInVMfromUnknown tiv; 4004 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop"); 4005 } 4006 #endif 4007 oop_index = oop_recorder()->find_index(obj); 4008 } 4009 RelocationHolder rspec = oop_Relocation::spec(oop_index); 4010 if (! immediate) { 4011 address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address 4012 ldr_constant(dst, Address(dummy, rspec)); 4013 } else 4014 mov(dst, Address((address)obj, rspec)); 4015 } 4016 4017 // Move a metadata address into a register. 
4018 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) { 4019 int oop_index; 4020 if (obj == NULL) { 4021 oop_index = oop_recorder()->allocate_metadata_index(obj); 4022 } else { 4023 oop_index = oop_recorder()->find_index(obj); 4024 } 4025 RelocationHolder rspec = metadata_Relocation::spec(oop_index); 4026 mov(dst, Address((address)obj, rspec)); 4027 } 4028 4029 Address MacroAssembler::constant_oop_address(jobject obj) { 4030 #ifdef ASSERT 4031 { 4032 ThreadInVMfromUnknown tiv; 4033 assert(oop_recorder() != NULL, "this assembler needs an OopRecorder"); 4034 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop"); 4035 } 4036 #endif 4037 int oop_index = oop_recorder()->find_index(obj); 4038 return Address((address)obj, oop_Relocation::spec(oop_index)); 4039 } 4040 4041 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes. 4042 void MacroAssembler::tlab_allocate(Register obj, 4043 Register var_size_in_bytes, 4044 int con_size_in_bytes, 4045 Register t1, 4046 Register t2, 4047 Label& slow_case) { 4048 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 4049 bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case); 4050 } 4051 4052 // Defines obj, preserves var_size_in_bytes 4053 void MacroAssembler::eden_allocate(Register obj, 4054 Register var_size_in_bytes, 4055 int con_size_in_bytes, 4056 Register t1, 4057 Label& slow_case) { 4058 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 4059 bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case); 4060 } 4061 4062 // Zero words; len is in bytes 4063 // Destroys all registers except addr 4064 // len must be a nonzero multiple of wordSize 4065 void MacroAssembler::zero_memory(Register addr, Register len, Register t1) { 4066 assert_different_registers(addr, len, t1, rscratch1, rscratch2); 4067 4068 #ifdef ASSERT 4069 { Label L; 4070 tst(len, BytesPerWord - 
1); 4071 br(Assembler::EQ, L); 4072 stop("len is not a multiple of BytesPerWord"); 4073 bind(L); 4074 } 4075 #endif 4076 4077 #ifndef PRODUCT 4078 block_comment("zero memory"); 4079 #endif 4080 4081 Label loop; 4082 Label entry; 4083 4084 // Algorithm: 4085 // 4086 // scratch1 = cnt & 7; 4087 // cnt -= scratch1; 4088 // p += scratch1; 4089 // switch (scratch1) { 4090 // do { 4091 // cnt -= 8; 4092 // p[-8] = 0; 4093 // case 7: 4094 // p[-7] = 0; 4095 // case 6: 4096 // p[-6] = 0; 4097 // // ... 4098 // case 1: 4099 // p[-1] = 0; 4100 // case 0: 4101 // p += 8; 4102 // } while (cnt); 4103 // } 4104 4105 const int unroll = 8; // Number of str(zr) instructions we'll unroll 4106 4107 lsr(len, len, LogBytesPerWord); 4108 andr(rscratch1, len, unroll - 1); // tmp1 = cnt % unroll 4109 sub(len, len, rscratch1); // cnt -= unroll 4110 // t1 always points to the end of the region we're about to zero 4111 add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord); 4112 adr(rscratch2, entry); 4113 sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2); 4114 br(rscratch2); 4115 bind(loop); 4116 sub(len, len, unroll); 4117 for (int i = -unroll; i < 0; i++) 4118 Assembler::str(zr, Address(t1, i * wordSize)); 4119 bind(entry); 4120 add(t1, t1, unroll * wordSize); 4121 cbnz(len, loop); 4122 } 4123 4124 void MacroAssembler::verify_tlab() { 4125 #ifdef ASSERT 4126 if (UseTLAB && VerifyOops) { 4127 Label next, ok; 4128 4129 stp(rscratch2, rscratch1, Address(pre(sp, -16))); 4130 4131 ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset()))); 4132 ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset()))); 4133 cmp(rscratch2, rscratch1); 4134 br(Assembler::HS, next); 4135 STOP("assert(top >= start)"); 4136 should_not_reach_here(); 4137 4138 bind(next); 4139 ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset()))); 4140 ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset()))); 4141 cmp(rscratch2, rscratch1); 4142 
br(Assembler::HS, ok); 4143 STOP("assert(top <= end)"); 4144 should_not_reach_here(); 4145 4146 bind(ok); 4147 ldp(rscratch2, rscratch1, Address(post(sp, 16))); 4148 } 4149 #endif 4150 } 4151 4152 // Writes to stack successive pages until offset reached to check for 4153 // stack overflow + shadow pages. This clobbers tmp. 4154 void MacroAssembler::bang_stack_size(Register size, Register tmp) { 4155 assert_different_registers(tmp, size, rscratch1); 4156 mov(tmp, sp); 4157 // Bang stack for total size given plus shadow page size. 4158 // Bang one page at a time because large size can bang beyond yellow and 4159 // red zones. 4160 Label loop; 4161 mov(rscratch1, os::vm_page_size()); 4162 bind(loop); 4163 lea(tmp, Address(tmp, -os::vm_page_size())); 4164 subsw(size, size, rscratch1); 4165 str(size, Address(tmp)); 4166 br(Assembler::GT, loop); 4167 4168 // Bang down shadow pages too. 4169 // At this point, (tmp-0) is the last address touched, so don't 4170 // touch it again. (It was touched as (tmp-pagesize) but then tmp 4171 // was post-decremented.) Skip this address by starting at i=1, and 4172 // touch a few more pages below. N.B. It is important to touch all 4173 // the way down to and including i=StackShadowPages. 4174 for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) { 4175 // this could be any sized move but this is can be a debugging crumb 4176 // so the bigger the better. 4177 lea(tmp, Address(tmp, -os::vm_page_size())); 4178 str(size, Address(tmp)); 4179 } 4180 } 4181 4182 4183 // Move the address of the polling page into dest. 
4184 void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) { 4185 if (SafepointMechanism::uses_thread_local_poll()) { 4186 ldr(dest, Address(rthread, Thread::polling_page_offset())); 4187 } else { 4188 unsigned long off; 4189 adrp(dest, Address(page, rtype), off); 4190 assert(off == 0, "polling page must be page aligned"); 4191 } 4192 } 4193 4194 // Move the address of the polling page into r, then read the polling 4195 // page. 4196 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) { 4197 get_polling_page(r, page, rtype); 4198 return read_polling_page(r, rtype); 4199 } 4200 4201 // Read the polling page. The address of the polling page must 4202 // already be in r. 4203 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) { 4204 InstructionMark im(this); 4205 code_section()->relocate(inst_mark(), rtype); 4206 ldrw(zr, Address(r, 0)); 4207 return inst_mark(); 4208 } 4209 4210 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) { 4211 relocInfo::relocType rtype = dest.rspec().reloc()->type(); 4212 unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12; 4213 unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12; 4214 unsigned long dest_page = (unsigned long)dest.target() >> 12; 4215 long offset_low = dest_page - low_page; 4216 long offset_high = dest_page - high_page; 4217 4218 assert(is_valid_AArch64_address(dest.target()), "bad address"); 4219 assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address"); 4220 4221 InstructionMark im(this); 4222 code_section()->relocate(inst_mark(), dest.rspec()); 4223 // 8143067: Ensure that the adrp can reach the dest from anywhere within 4224 // the code cache so that if it is relocated we know it will still reach 4225 if (offset_high >= -(1<<20) && offset_low < (1<<20)) { 4226 _adrp(reg1, dest.target()); 4227 } else 
{ 4228 unsigned long target = (unsigned long)dest.target(); 4229 unsigned long adrp_target 4230 = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL); 4231 4232 _adrp(reg1, (address)adrp_target); 4233 movk(reg1, target >> 32, 32); 4234 } 4235 byte_offset = (unsigned long)dest.target() & 0xfff; 4236 } 4237 4238 void MacroAssembler::load_byte_map_base(Register reg) { 4239 jbyte *byte_map_base = 4240 ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base(); 4241 4242 if (is_valid_AArch64_address((address)byte_map_base)) { 4243 // Strictly speaking the byte_map_base isn't an address at all, 4244 // and it might even be negative. 4245 unsigned long offset; 4246 adrp(reg, ExternalAddress((address)byte_map_base), offset); 4247 // We expect offset to be zero with most collectors. 4248 if (offset != 0) { 4249 add(reg, reg, offset); 4250 } 4251 } else { 4252 mov(reg, (uint64_t)byte_map_base); 4253 } 4254 } 4255 4256 void MacroAssembler::build_frame(int framesize) { 4257 assert(framesize > 0, "framesize must be > 0"); 4258 if (framesize < ((1 << 9) + 2 * wordSize)) { 4259 sub(sp, sp, framesize); 4260 stp(rfp, lr, Address(sp, framesize - 2 * wordSize)); 4261 if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize); 4262 } else { 4263 stp(rfp, lr, Address(pre(sp, -2 * wordSize))); 4264 if (PreserveFramePointer) mov(rfp, sp); 4265 if (framesize < ((1 << 12) + 2 * wordSize)) 4266 sub(sp, sp, framesize - 2 * wordSize); 4267 else { 4268 mov(rscratch1, framesize - 2 * wordSize); 4269 sub(sp, sp, rscratch1); 4270 } 4271 } 4272 } 4273 4274 void MacroAssembler::remove_frame(int framesize) { 4275 assert(framesize > 0, "framesize must be > 0"); 4276 if (framesize < ((1 << 9) + 2 * wordSize)) { 4277 ldp(rfp, lr, Address(sp, framesize - 2 * wordSize)); 4278 add(sp, sp, framesize); 4279 } else { 4280 if (framesize < ((1 << 12) + 2 * wordSize)) 4281 add(sp, sp, framesize - 2 * wordSize); 4282 else { 4283 mov(rscratch1, framesize - 2 * 
wordSize); 4284 add(sp, sp, rscratch1); 4285 } 4286 ldp(rfp, lr, Address(post(sp, 2 * wordSize))); 4287 } 4288 } 4289 4290 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 4291 4292 // Search for str1 in str2 and return index or -1 4293 void MacroAssembler::string_indexof(Register str2, Register str1, 4294 Register cnt2, Register cnt1, 4295 Register tmp1, Register tmp2, 4296 Register tmp3, Register tmp4, 4297 Register tmp5, Register tmp6, 4298 int icnt1, Register result, int ae) { 4299 // NOTE: tmp5, tmp6 can be zr depending on specific method version 4300 Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH; 4301 4302 Register ch1 = rscratch1; 4303 Register ch2 = rscratch2; 4304 Register cnt1tmp = tmp1; 4305 Register cnt2tmp = tmp2; 4306 Register cnt1_neg = cnt1; 4307 Register cnt2_neg = cnt2; 4308 Register result_tmp = tmp4; 4309 4310 bool isL = ae == StrIntrinsicNode::LL; 4311 4312 bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL; 4313 bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU; 4314 int str1_chr_shift = str1_isL ? 0:1; 4315 int str2_chr_shift = str2_isL ? 0:1; 4316 int str1_chr_size = str1_isL ? 1:2; 4317 int str2_chr_size = str2_isL ? 1:2; 4318 chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb : 4319 (chr_insn)&MacroAssembler::ldrh; 4320 chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb : 4321 (chr_insn)&MacroAssembler::ldrh; 4322 chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw; 4323 chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr; 4324 4325 // Note, inline_string_indexOf() generates checks: 4326 // if (substr.count > string.count) return -1; 4327 // if (substr.count == 0) return 0; 4328 4329 // We have two strings, a source string in str2, cnt2 and a pattern string 4330 // in str1, cnt1. 
Find the 1st occurence of pattern in source or return -1. 4331 4332 // For larger pattern and source we use a simplified Boyer Moore algorithm. 4333 // With a small pattern and source we use linear scan. 4334 4335 if (icnt1 == -1) { 4336 sub(result_tmp, cnt2, cnt1); 4337 cmp(cnt1, 8); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256 4338 br(LT, LINEARSEARCH); 4339 dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty 4340 cmp(cnt1, 256); 4341 lsr(tmp1, cnt2, 2); 4342 ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM 4343 br(GE, LINEARSTUB); 4344 } 4345 4346 // The Boyer Moore alogorithm is based on the description here:- 4347 // 4348 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm 4349 // 4350 // This describes and algorithm with 2 shift rules. The 'Bad Character' rule 4351 // and the 'Good Suffix' rule. 4352 // 4353 // These rules are essentially heuristics for how far we can shift the 4354 // pattern along the search string. 4355 // 4356 // The implementation here uses the 'Bad Character' rule only because of the 4357 // complexity of initialisation for the 'Good Suffix' rule. 4358 // 4359 // This is also known as the Boyer-Moore-Horspool algorithm:- 4360 // 4361 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm 4362 // 4363 // This particular implementation has few java-specific optimizations. 
4364 // 4365 // #define ASIZE 256 4366 // 4367 // int bm(unsigned char *x, int m, unsigned char *y, int n) { 4368 // int i, j; 4369 // unsigned c; 4370 // unsigned char bc[ASIZE]; 4371 // 4372 // /* Preprocessing */ 4373 // for (i = 0; i < ASIZE; ++i) 4374 // bc[i] = m; 4375 // for (i = 0; i < m - 1; ) { 4376 // c = x[i]; 4377 // ++i; 4378 // // c < 256 for Latin1 string, so, no need for branch 4379 // #ifdef PATTERN_STRING_IS_LATIN1 4380 // bc[c] = m - i; 4381 // #else 4382 // if (c < ASIZE) bc[c] = m - i; 4383 // #endif 4384 // } 4385 // 4386 // /* Searching */ 4387 // j = 0; 4388 // while (j <= n - m) { 4389 // c = y[i+j]; 4390 // if (x[m-1] == c) 4391 // for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i); 4392 // if (i < 0) return j; 4393 // // c < 256 for Latin1 string, so, no need for branch 4394 // #ifdef SOURCE_STRING_IS_LATIN1 4395 // // LL case: (c< 256) always true. Remove branch 4396 // j += bc[y[j+m-1]]; 4397 // #endif 4398 // #ifndef PATTERN_STRING_IS_UTF 4399 // // UU case: need if (c<ASIZE) check. Skip 1 character if not. 4400 // if (c < ASIZE) 4401 // j += bc[y[j+m-1]]; 4402 // else 4403 // j += 1 4404 // #endif 4405 // #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF 4406 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not. 4407 // if (c < ASIZE) 4408 // j += bc[y[j+m-1]]; 4409 // else 4410 // j += m 4411 // #endif 4412 // } 4413 // } 4414 4415 if (icnt1 == -1) { 4416 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH, 4417 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP; 4418 Register cnt1end = tmp2; 4419 Register str2end = cnt2; 4420 Register skipch = tmp2; 4421 4422 // str1 length is >=8, so, we can read at least 1 register for cases when 4423 // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for 4424 // UL case. We'll re-read last character in inner pre-loop code to have 4425 // single outer pre-loop load 4426 const int firstStep = isL ? 
7 : 3; 4427 4428 const int ASIZE = 256; 4429 const int STORED_BYTES = 32; // amount of bytes stored per instruction 4430 sub(sp, sp, ASIZE); 4431 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations 4432 mov(ch1, sp); 4433 BIND(BM_INIT_LOOP); 4434 stpq(v0, v0, Address(post(ch1, STORED_BYTES))); 4435 subs(tmp5, tmp5, 1); 4436 br(GT, BM_INIT_LOOP); 4437 4438 sub(cnt1tmp, cnt1, 1); 4439 mov(tmp5, str2); 4440 add(str2end, str2, result_tmp, LSL, str2_chr_shift); 4441 sub(ch2, cnt1, 1); 4442 mov(tmp3, str1); 4443 BIND(BCLOOP); 4444 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size))); 4445 if (!str1_isL) { 4446 cmp(ch1, ASIZE); 4447 br(HS, BCSKIP); 4448 } 4449 strb(ch2, Address(sp, ch1)); 4450 BIND(BCSKIP); 4451 subs(ch2, ch2, 1); 4452 br(GT, BCLOOP); 4453 4454 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1 4455 if (str1_isL == str2_isL) { 4456 // load last 8 bytes (8LL/4UU symbols) 4457 ldr(tmp6, Address(tmp6, -wordSize)); 4458 } else { 4459 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols) 4460 // convert Latin1 to UTF. We'll have to wait until load completed, but 4461 // it's still faster than per-character loads+checks 4462 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1] 4463 ubfx(ch1, tmp6, 8, 8); // str1[N-2] 4464 ubfx(ch2, tmp6, 16, 8); // str1[N-3] 4465 andr(tmp6, tmp6, 0xFF); // str1[N-4] 4466 orr(ch2, ch1, ch2, LSL, 16); 4467 orr(tmp6, tmp6, tmp3, LSL, 48); 4468 orr(tmp6, tmp6, ch2, LSL, 16); 4469 } 4470 BIND(BMLOOPSTR2); 4471 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 4472 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8 4473 if (str1_isL == str2_isL) { 4474 // re-init tmp3. It's for free because it's executed in parallel with 4475 // load above. 
Alternative is to initialize it before loop, but it'll 4476 // affect performance on in-order systems with 2 or more ld/st pipelines 4477 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size)); 4478 } 4479 if (!isL) { // UU/UL case 4480 lsl(ch2, cnt1tmp, 1); // offset in bytes 4481 } 4482 cmp(tmp3, skipch); 4483 br(NE, BMSKIP); 4484 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2)); 4485 mov(ch1, tmp6); 4486 if (isL) { 4487 b(BMLOOPSTR1_AFTER_LOAD); 4488 } else { 4489 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8 4490 b(BMLOOPSTR1_CMP); 4491 } 4492 BIND(BMLOOPSTR1); 4493 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 4494 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 4495 BIND(BMLOOPSTR1_AFTER_LOAD); 4496 subs(cnt1tmp, cnt1tmp, 1); 4497 br(LT, BMLOOPSTR1_LASTCMP); 4498 BIND(BMLOOPSTR1_CMP); 4499 cmp(ch1, ch2); 4500 br(EQ, BMLOOPSTR1); 4501 BIND(BMSKIP); 4502 if (!isL) { 4503 // if we've met UTF symbol while searching Latin1 pattern, then we can 4504 // skip cnt1 symbols 4505 if (str1_isL != str2_isL) { 4506 mov(result_tmp, cnt1); 4507 } else { 4508 mov(result_tmp, 1); 4509 } 4510 cmp(skipch, ASIZE); 4511 br(HS, BMADV); 4512 } 4513 ldrb(result_tmp, Address(sp, skipch)); // load skip distance 4514 BIND(BMADV); 4515 sub(cnt1tmp, cnt1, 1); 4516 add(str2, str2, result_tmp, LSL, str2_chr_shift); 4517 cmp(str2, str2end); 4518 br(LE, BMLOOPSTR2); 4519 add(sp, sp, ASIZE); 4520 b(NOMATCH); 4521 BIND(BMLOOPSTR1_LASTCMP); 4522 cmp(ch1, ch2); 4523 br(NE, BMSKIP); 4524 BIND(BMMATCH); 4525 sub(result, str2, tmp5); 4526 if (!str2_isL) lsr(result, result, 1); 4527 add(sp, sp, ASIZE); 4528 b(DONE); 4529 4530 BIND(LINEARSTUB); 4531 cmp(cnt1, 16); // small patterns still should be handled by simple algorithm 4532 br(LT, LINEAR_MEDIUM); 4533 mov(result, zr); 4534 RuntimeAddress stub = NULL; 4535 if (isL) { 4536 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll()); 
4537 assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated"); 4538 } else if (str1_isL) { 4539 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul()); 4540 assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated"); 4541 } else { 4542 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu()); 4543 assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated"); 4544 } 4545 trampoline_call(stub); 4546 b(DONE); 4547 } 4548 4549 BIND(LINEARSEARCH); 4550 { 4551 Label DO1, DO2, DO3; 4552 4553 Register str2tmp = tmp2; 4554 Register first = tmp3; 4555 4556 if (icnt1 == -1) 4557 { 4558 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT; 4559 4560 cmp(cnt1, str1_isL == str2_isL ? 4 : 2); 4561 br(LT, DOSHORT); 4562 BIND(LINEAR_MEDIUM); 4563 (this->*str1_load_1chr)(first, Address(str1)); 4564 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift))); 4565 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift); 4566 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4567 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4568 4569 BIND(FIRST_LOOP); 4570 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 4571 cmp(first, ch2); 4572 br(EQ, STR1_LOOP); 4573 BIND(STR2_NEXT); 4574 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4575 br(LE, FIRST_LOOP); 4576 b(NOMATCH); 4577 4578 BIND(STR1_LOOP); 4579 adds(cnt1tmp, cnt1_neg, str1_chr_size); 4580 add(cnt2tmp, cnt2_neg, str2_chr_size); 4581 br(GE, MATCH); 4582 4583 BIND(STR1_NEXT); 4584 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp)); 4585 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 4586 cmp(ch1, ch2); 4587 br(NE, STR2_NEXT); 4588 adds(cnt1tmp, cnt1tmp, str1_chr_size); 4589 add(cnt2tmp, cnt2tmp, str2_chr_size); 4590 br(LT, STR1_NEXT); 4591 b(MATCH); 4592 4593 BIND(DOSHORT); 4594 if (str1_isL == str2_isL) { 4595 cmp(cnt1, 2); 4596 br(LT, DO1); 4597 br(GT, DO3); 4598 } 4599 } 4600 4601 if (icnt1 
== 4) { 4602 Label CH1_LOOP; 4603 4604 (this->*load_4chr)(ch1, str1); 4605 sub(result_tmp, cnt2, 4); 4606 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4607 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4608 4609 BIND(CH1_LOOP); 4610 (this->*load_4chr)(ch2, Address(str2, cnt2_neg)); 4611 cmp(ch1, ch2); 4612 br(EQ, MATCH); 4613 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4614 br(LE, CH1_LOOP); 4615 b(NOMATCH); 4616 } 4617 4618 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) { 4619 Label CH1_LOOP; 4620 4621 BIND(DO2); 4622 (this->*load_2chr)(ch1, str1); 4623 if (icnt1 == 2) { 4624 sub(result_tmp, cnt2, 2); 4625 } 4626 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4627 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4628 BIND(CH1_LOOP); 4629 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 4630 cmp(ch1, ch2); 4631 br(EQ, MATCH); 4632 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4633 br(LE, CH1_LOOP); 4634 b(NOMATCH); 4635 } 4636 4637 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) { 4638 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP; 4639 4640 BIND(DO3); 4641 (this->*load_2chr)(first, str1); 4642 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size)); 4643 if (icnt1 == 3) { 4644 sub(result_tmp, cnt2, 3); 4645 } 4646 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4647 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4648 BIND(FIRST_LOOP); 4649 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 4650 cmpw(first, ch2); 4651 br(EQ, STR1_LOOP); 4652 BIND(STR2_NEXT); 4653 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4654 br(LE, FIRST_LOOP); 4655 b(NOMATCH); 4656 4657 BIND(STR1_LOOP); 4658 add(cnt2tmp, cnt2_neg, 2*str2_chr_size); 4659 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 4660 cmp(ch1, ch2); 4661 br(NE, STR2_NEXT); 4662 b(MATCH); 4663 } 4664 4665 if (icnt1 == -1 || icnt1 == 1) { 4666 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP; 4667 4668 BIND(DO1); 4669 
(this->*str1_load_1chr)(ch1, str1);
      cmp(cnt2, 8);
      br(LT, DO1_SHORT);

      sub(result_tmp, cnt2, 8/str2_chr_size);
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

      if (str2_isL) {
        orr(ch1, ch1, ch1, LSL, 8);
      }
      orr(ch1, ch1, ch1, LSL, 16);
      orr(ch1, ch1, ch1, LSL, 32);
      BIND(CH1_LOOP);
        ldr(ch2, Address(str2, cnt2_neg));
        eor(ch2, ch1, ch2);
        sub(tmp1, ch2, tmp3);
        orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
        bics(tmp1, tmp1, tmp2);
        br(NE, HAS_ZERO);
        adds(cnt2_neg, cnt2_neg, 8);
        br(LT, CH1_LOOP);

        cmp(cnt2_neg, 8);
        mov(cnt2_neg, 0);
        br(LT, CH1_LOOP);
        b(NOMATCH);

      BIND(HAS_ZERO);
        rev(tmp1, tmp1);
        clz(tmp1, tmp1);
        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
        b(MATCH);

      BIND(DO1_SHORT);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmpw(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

// Pointer-to-member types naming a MacroAssembler emitter: chr_insn for an
// emitter taking (Register, Address), uxt_insn for one taking two Registers.
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

// Emit code that searches a sequence of 16-bit characters for a single
// character.
//
// str1:   address of the first character (loaded with ldrh / 8-byte ldr).
// cnt1:   number of characters; scaled by 2 to form byte offsets.
// ch:     the character to look for.
// result: receives the character index of the first match, or -1.
// tmp1, tmp2, tmp3: temporaries.
//
// Besides the temporaries, the emitted code overwrites str1, cnt1 (reused
// as the negative byte index cnt1_neg), ch (replicated into all four
// halfwords on the long path), rscratch1 and rscratch2.
void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                         Register ch, Register result,
                                         Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;   // same register as cnt1: negative byte offset from the (adjusted) str1
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  // Empty input: nothing can match.
  cbz(cnt1, NOMATCH);

  // Fewer than 4 characters: use the one-character-at-a-time loop.
  cmp(cnt1, 4);
  br(LT, DO1_SHORT);

  // Replicate the 16-bit character into all four halfwords of ch.
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  // Point str1 at the last 4 characters and index it with a negative
  // offset that counts up towards zero.
  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    // Halfwords equal to the searched character become zero.
    eor(ch1, ch, ch1);
    // (x - 0x0001...) & ~(x | 0x7fff...) is non-zero iff some halfword
    // of x is zero.
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // The index reached or passed zero.  If it overshot by less than 8
    // bytes, run one more iteration at offset 0, i.e. on the final 4
    // characters; this may re-examine characters already checked.
    cmp(cnt1_neg, 8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    // Turn the lowest flagged bit into a byte offset within the loaded
    // word (byte-reverse, count leading zeros, divide by 8).
    // NOTE(review): presumes the 8-byte load is little-endian - confirm.
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    // Short input: point str1 past the end and walk a negative byte
    // offset up to zero, one character per iteration.
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // result_tmp holds the character index that byte offset 0 corresponds
    // to; add the (negative) byte offset converted back to characters.
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}

// Compare strings.
// Emit code that compares two strings and leaves a signed result in
// 'result'.
//
// str1, str2: addresses of the first characters.
// cnt1, cnt2: lengths in bytes (converted to characters below).
// result:     difference of the first pair of differing characters
//             (str1 char - str2 char); if the strings agree over the
//             shorter length, the difference of the lengths in characters.
// tmp1, tmp2: general-purpose temporaries.
// vtmp1, vtmp2: SIMD temporaries used on the mixed-encoding (LU / UL)
//             paths; vtmp3 is not referenced in this function.
// ae:         StrIntrinsicNode::LL / LU / UL / UU, selecting 1-byte (L) or
//             2-byte (U) characters for str1 and str2 respectively.
//
// Also overwrites str1, str2, cnt1, cnt2, rscratch1 and rscratch2.
// If the shorter length is at least STUB_THRESHOLD characters, the work is
// handed to a compare_long_string_* stub.
void MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  const int STUB_THRESHOLD = 64 + 8;
  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  // Characters compared per 8-byte step: wordSize for LL, wordSize/2 in
  // every other case (the comparison is done on 2-byte characters).
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  // vtmpZ is zeroed and interleaved (zip1) with 1-byte characters to widen
  // them to 2 bytes.
  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  // 'result' keeps this difference on every path that finds no differing
  // character.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      // Same address: the contents are identical, result already holds
      // the length difference.
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, STUB_THRESHOLD);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      // Point both strings at their last full word and turn cnt2 into a
      // negative byte offset from there.
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      // str1 has 1-byte characters: load 4 of them and widen to 2 bytes
      // each so they can be compared against 8 bytes of str2.
      ldrs(vtmp, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, STUB_THRESHOLD);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      // cnt1 becomes the negative byte offset into str1, cnt2 into str2.
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      // Mirror image of the LU case: str2 is the 1-byte string.
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldrs(vtmp, Address(str2));
      cmp(cnt2, STUB_THRESHOLD);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFFERENCE);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFFERENCE);
    bind(TAIL);
    // Check the words loaded by the last loop iteration before moving on
    // to the final word.
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFFERENCE);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFFERENCE);
    // rscratch2 holds tmp1 ^ tmp2.  Byte-reverse and count leading zeros
    // to get the bit position of the lowest differing bit, round it down
    // to a character boundary, then shift that character to the bottom of
    // each word.
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
    RuntimeAddress stub = NULL;
    switch(ae) {
      case StrIntrinsicNode::LL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
        break;
      case StrIntrinsicNode::UU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
        break;
      case StrIntrinsicNode::LU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
        break;
      case StrIntrinsicNode::UL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
        break;
      default:
        ShouldNotReachHere();
     }
    assert(stub.target() != NULL, "compare_long_string stub has not been generated");
    trampoline_call(stub);
    b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // arrange code to do most branches while loading and loading next characters
  // while comparing previous
  // The loop is unrolled twice: one character pair lives in (tmp1, cnt1),
  // the other in (tmp2, rscratch1).  cnt1 is free to be reused here
  // because the length difference is already saved in 'result'.
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}

// This method checks if provided byte array contains byte with highest bit set.
//
// ary1:   address of the first byte.
// len:    number of bytes.
// result: set from the final flags with cset(result, NE), or produced by
//         one of the has_negatives stubs.
//
// ary1, len, rscratch1 and rscratch2 are overwritten.
void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
    // Simple and most common case of aligned small array which is not at the
    // end of memory page is placed here. All other cases are in stub.
    Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
    const uint64_t UPPER_BIT_MASK=0x8080808080808080;
    assert_different_registers(ary1, len, result);

    cmpw(len, 0);
    br(LE, SET_RESULT);
    cmpw(len, 4 * wordSize);
    br(GE, STUB_LONG); // len >= 4 * wordSize: go to the long-array stub

    // Move the in-page offset bits of ary1 to the top of the register; if
    // adding 4 * wordSize (scaled the same way) carries out, the 4-word
    // window starting at ary1 reaches the end of the page.
    int shift = 64 - exact_log2(os::vm_page_size());
    lsl(rscratch1, ary1, shift);
    mov(rscratch2, (size_t)(4 * wordSize) << shift);
    adds(rscratch2, rscratch1, rscratch2);  // At end of page?
    br(CS, STUB); // at the end of page then go to stub
    subs(len, len, wordSize);
    br(LT, END);

  BIND(LOOP);
    ldr(rscratch1, Address(post(ary1, wordSize)));
    tst(rscratch1, UPPER_BIT_MASK);
    br(NE, SET_RESULT);
    subs(len, len, wordSize);
    br(GE, LOOP);
    // len == -wordSize means the array was consumed exactly; the EQ flags
    // from this compare make SET_RESULT produce false.
    cmpw(len, -wordSize);
    br(EQ, SET_RESULT);

  BIND(END);
    // Fewer than wordSize bytes remain (len is negative here).  Load a full
    // word anyway and shift out the bytes that lie beyond the array.
    // NOTE(review): discarding the high-order bytes presumes a
    // little-endian load - confirm.
    ldr(result, Address(ary1));
    sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
    lslv(result, result, len);
    tst(result, UPPER_BIT_MASK);
    b(SET_RESULT);

  BIND(STUB);
    RuntimeAddress has_neg =  RuntimeAddress(StubRoutines::aarch64::has_negatives());
    assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
    trampoline_call(has_neg);
    b(DONE);

  BIND(STUB_LONG);
    RuntimeAddress has_neg_long =  RuntimeAddress(
            StubRoutines::aarch64::has_negatives_long());
    assert(has_neg_long.target() != NULL, "has_negatives stub has not been generated");
    trampoline_call(has_neg_long);
    b(DONE);

  BIND(SET_RESULT);
    cset(result, NE); // set true or false

  BIND(DONE);
}

// Emit code that compares two arrays of 1- or 2-byte elements for equality.
//
// a1, a2:    the two array references; the code loads each length at
//            length_offset and the elements from base_offset, and handles
//            null references (result = false).
// tmp3, tmp4, tmp5: temporaries.
// result:    set to true if the arrays are the same reference or have
//            equal length and contents, false otherwise.
// cnt1:      temporary that receives the array length.
// elem_size: element size in bytes, 1 or 2 (asserted).
//
// rscratch1 and rscratch2 are also used.  Two code shapes are generated:
// a compact one when UseSimpleArrayEquals is set, otherwise a software-
// pipelined one that hands large arrays to the large_array_equals stub.
void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
                                   Register tmp4, Register tmp5, Register result,
                                   Register cnt1, int elem_size) {
  Label DONE, SAME;
  Register tmp1 = rscratch1;
  Register tmp2 = rscratch2;
  Register cnt2 = tmp2;  // cnt2 only used in array length compare
  int elem_per_word = wordSize/elem_size;
  int log_elem_size = exact_log2(elem_size);
  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset
    = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
  int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);

  assert(elem_size == 1 || elem_size == 2, "must be char or byte");
  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);

#ifndef PRODUCT
  {
    const char kind = (elem_size == 2) ? 'U' : 'L';
    char comment[64];
    snprintf(comment, sizeof comment, "array_equals%c{", kind);
    BLOCK_COMMENT(comment);
  }
#endif

  // if (a1 == a2)
  //     return true;
  cmpoop(a1, a2); // May have read barriers for a1 and a2.
  br(EQ, SAME);

  if (UseSimpleArrayEquals) {
    Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
    // if (a1 == null || a2 == null)
    //     return false;
    // a1 & a2 == 0 means (some-pointer is null) or
    // (very-rare-or-even-probably-impossible-pointer-values)
    // so, we can save one branch in most cases
    tst(a1, a2);
    mov(result, false);
    br(EQ, A_MIGHT_BE_NULL);
    // if (a1.length != a2.length)
    //      return false;
    bind(A_IS_NOT_NULL);
    ldrw(cnt1, Address(a1, length_offset));
    ldrw(cnt2, Address(a2, length_offset));
    eorw(tmp5, cnt1, cnt2);
    cbnzw(tmp5, DONE);
    lea(a1, Address(a1, base_offset));
    lea(a2, Address(a2, base_offset));
    // Check for short strings, i.e. smaller than wordSize.
    subs(cnt1, cnt1, elem_per_word);
    br(Assembler::LT, SHORT);
    // Main 8 byte comparison loop.
    bind(NEXT_WORD); {
      ldr(tmp1, Address(post(a1, wordSize)));
      ldr(tmp2, Address(post(a2, wordSize)));
      subs(cnt1, cnt1, elem_per_word);
      eor(tmp5, tmp1, tmp2);
      cbnz(tmp5, DONE);
    } br(GT, NEXT_WORD);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
    // length == 4.
    // cnt1 is in elements; scale it to a (non-positive) byte offset.
    if (log_elem_size > 0)
      lsl(cnt1, cnt1, log_elem_size);
    ldr(tmp3, Address(a1, cnt1));
    ldr(tmp4, Address(a2, cnt1));
    eor(tmp5, tmp3, tmp4);
    cbnz(tmp5, DONE);
    b(SAME);
    bind(A_MIGHT_BE_NULL);
    // in case both a1 and a2 are not-null, proceed with loads
    cbz(a1, DONE);
    cbz(a2, DONE);
    b(A_IS_NOT_NULL);
    bind(SHORT);

    // The low bits of cnt1 encode how many elements remain; test the bit
    // that corresponds to 4, then 2, then 1 remaining byte(s).
    tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
    {
      ldrw(tmp1, Address(post(a1, 4)));
      ldrw(tmp2, Address(post(a2, 4)));
      eorw(tmp5, tmp1, tmp2);
      cbnzw(tmp5, DONE);
    }
    bind(TAIL03);
    tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
    {
      ldrh(tmp3, Address(post(a1, 2)));
      ldrh(tmp4, Address(post(a2, 2)));
      eorw(tmp5, tmp3, tmp4);
      cbnzw(tmp5, DONE);
    }
    bind(TAIL01);
    if (elem_size == 1) { // Only needed when comparing byte arrays.
      tbz(cnt1, 0, SAME); // 0-1 bytes left.
      {
        ldrb(tmp1, a1);
        ldrb(tmp2, a2);
        eorw(tmp5, tmp1, tmp2);
        cbnzw(tmp5, DONE);
      }
    }
  } else {
    Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
        CSET_EQ, LAST_CHECK;
    mov(result, false);
    cbz(a1, DONE);
    ldrw(cnt1, Address(a1, length_offset));
    cbz(a2, DONE);
    ldrw(cnt2, Address(a2, length_offset));
    // on most CPUs a2 is still "locked"(surprisingly) in ldrw and it's
    // faster to perform another branch before comparing a1 and a2
    cmp(cnt1, elem_per_word);
    br(LE, SHORT); // short or same
    ldr(tmp3, Address(pre(a1, base_offset)));
    cmp(cnt1, stubBytesThreshold);
    br(GE, STUB);
    ldr(tmp4, Address(pre(a2, base_offset)));
    // tmp5 = -(length in bits); used later as a shift amount to discard
    // the bytes of the last word that lie beyond the array.
    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
    cmp(cnt2, cnt1);
    br(NE, DONE);

    // Main 16 byte comparison loop with 2 exits
    bind(NEXT_DWORD); {
      ldr(tmp1, Address(pre(a1, wordSize)));
      ldr(tmp2, Address(pre(a2, wordSize)));
      subs(cnt1, cnt1, 2 * elem_per_word);
      br(LE, TAIL);
      eor(tmp4, tmp3, tmp4);
      cbnz(tmp4, DONE);
      ldr(tmp3, Address(pre(a1, wordSize)));
      ldr(tmp4, Address(pre(a2, wordSize)));
      cmp(cnt1, elem_per_word);
      br(LE, TAIL2);
      cmp(tmp1, tmp2);
    } br(EQ, NEXT_DWORD);
    b(DONE);

    bind(TAIL);
    // (tmp3, tmp4) is a full word pair, (tmp1, tmp2) the final, possibly
    // partial, one: mask the latter with the tmp5 shift and combine.
    eor(tmp4, tmp3, tmp4);
    eor(tmp2, tmp1, tmp2);
    lslv(tmp2, tmp2, tmp5);
    orr(tmp5, tmp4, tmp2);
    cmp(tmp5, zr);
    b(CSET_EQ);

    bind(TAIL2);
    eor(tmp2, tmp1, tmp2);
    cbnz(tmp2, DONE);
    b(LAST_CHECK);

    bind(STUB);
    ldr(tmp4, Address(pre(a2, base_offset)));
    cmp(cnt2, cnt1);
    br(NE, DONE);
    if (elem_size == 2) { // convert to byte counter
      lsl(cnt1, cnt1, 1);
    }
    eor(tmp5, tmp3, tmp4);
    cbnz(tmp5, DONE);
    RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
    assert(stub.target() != NULL, "array_equals_long stub has not been generated");
    trampoline_call(stub);
    b(DONE);

    // NOTE(review): no branch in this function targets EARLY_OUT, so the
    // two instructions below appear to be unreachable - confirm.
    bind(EARLY_OUT);
    // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
    // so, if a2 == null => return false(0), else return true, so we can return a2
    mov(result, a2);
    b(DONE);
    bind(SHORT);
    cmp(cnt2, cnt1);
    br(NE, DONE);
    cbz(cnt1, SAME);
    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
    ldr(tmp3, Address(a1, base_offset));
    ldr(tmp4, Address(a2, base_offset));
    bind(LAST_CHECK);
    eor(tmp4, tmp3, tmp4);
    lslv(tmp5, tmp4, tmp5);
    cmp(tmp5, zr);
    bind(CSET_EQ);
    cset(result, EQ);
    b(DONE);
  }

  bind(SAME);
  mov(result, true);
  // That's it.
  bind(DONE);

  BLOCK_COMMENT("} array_equals");
}

// Compare Strings

// For Strings we're passed the address of the first characters in a1
// and a2 and the length in cnt1.
// elem_size is the element size in bytes: either 1 or 2.
// There are two implementations.
For arrays >= 8 bytes, all 5279 // comparisons (including the final one, which may overlap) are 5280 // performed 8 bytes at a time. For strings < 8 bytes, we compare a 5281 // halfword, then a short, and then a byte. 5282 5283 void MacroAssembler::string_equals(Register a1, Register a2, 5284 Register result, Register cnt1, int elem_size) 5285 { 5286 Label SAME, DONE, SHORT, NEXT_WORD; 5287 Register tmp1 = rscratch1; 5288 Register tmp2 = rscratch2; 5289 Register cnt2 = tmp2; // cnt2 only used in array length compare 5290 5291 assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte"); 5292 assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2); 5293 5294 #ifndef PRODUCT 5295 { 5296 const char kind = (elem_size == 2) ? 'U' : 'L'; 5297 char comment[64]; 5298 snprintf(comment, sizeof comment, "{string_equals%c", kind); 5299 BLOCK_COMMENT(comment); 5300 } 5301 #endif 5302 5303 mov(result, false); 5304 5305 // Check for short strings, i.e. smaller than wordSize. 5306 subs(cnt1, cnt1, wordSize); 5307 br(Assembler::LT, SHORT); 5308 // Main 8 byte comparison loop. 5309 bind(NEXT_WORD); { 5310 ldr(tmp1, Address(post(a1, wordSize))); 5311 ldr(tmp2, Address(post(a2, wordSize))); 5312 subs(cnt1, cnt1, wordSize); 5313 eor(tmp1, tmp1, tmp2); 5314 cbnz(tmp1, DONE); 5315 } br(GT, NEXT_WORD); 5316 // Last longword. In the case where length == 4 we compare the 5317 // same longword twice, but that's still faster than another 5318 // conditional branch. 5319 // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when 5320 // length == 4. 5321 ldr(tmp1, Address(a1, cnt1)); 5322 ldr(tmp2, Address(a2, cnt1)); 5323 eor(tmp2, tmp1, tmp2); 5324 cbnz(tmp2, DONE); 5325 b(SAME); 5326 5327 bind(SHORT); 5328 Label TAIL03, TAIL01; 5329 5330 tbz(cnt1, 2, TAIL03); // 0-7 bytes left. 
5331 { 5332 ldrw(tmp1, Address(post(a1, 4))); 5333 ldrw(tmp2, Address(post(a2, 4))); 5334 eorw(tmp1, tmp1, tmp2); 5335 cbnzw(tmp1, DONE); 5336 } 5337 bind(TAIL03); 5338 tbz(cnt1, 1, TAIL01); // 0-3 bytes left. 5339 { 5340 ldrh(tmp1, Address(post(a1, 2))); 5341 ldrh(tmp2, Address(post(a2, 2))); 5342 eorw(tmp1, tmp1, tmp2); 5343 cbnzw(tmp1, DONE); 5344 } 5345 bind(TAIL01); 5346 if (elem_size == 1) { // Only needed when comparing 1-byte elements 5347 tbz(cnt1, 0, SAME); // 0-1 bytes left. 5348 { 5349 ldrb(tmp1, a1); 5350 ldrb(tmp2, a2); 5351 eorw(tmp1, tmp1, tmp2); 5352 cbnzw(tmp1, DONE); 5353 } 5354 } 5355 // Arrays are equal. 5356 bind(SAME); 5357 mov(result, true); 5358 5359 // That's it. 5360 bind(DONE); 5361 BLOCK_COMMENT("} string_equals"); 5362 } 5363 5364 5365 // The size of the blocks erased by the zero_blocks stub. We must 5366 // handle anything smaller than this ourselves in zero_words(). 5367 const int MacroAssembler::zero_words_block_size = 8; 5368 5369 // zero_words() is used by C2 ClearArray patterns. It is as small as 5370 // possible, handling small word counts locally and delegating 5371 // anything larger to the zero_blocks stub. It is expanded many times 5372 // in compiled code, so it is important to keep it short. 5373 5374 // ptr: Address of a buffer to be zeroed. 5375 // cnt: Count in HeapWords. 5376 // 5377 // ptr, cnt, rscratch1, and rscratch2 are clobbered. 
5378 void MacroAssembler::zero_words(Register ptr, Register cnt) 5379 { 5380 assert(is_power_of_2(zero_words_block_size), "adjust this"); 5381 assert(ptr == r10 && cnt == r11, "mismatch in register usage"); 5382 5383 BLOCK_COMMENT("zero_words {"); 5384 cmp(cnt, zero_words_block_size); 5385 Label around, done, done16; 5386 br(LO, around); 5387 { 5388 RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks()); 5389 assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated"); 5390 if (StubRoutines::aarch64::complete()) { 5391 trampoline_call(zero_blocks); 5392 } else { 5393 bl(zero_blocks); 5394 } 5395 } 5396 bind(around); 5397 for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) { 5398 Label l; 5399 tbz(cnt, exact_log2(i), l); 5400 for (int j = 0; j < i; j += 2) { 5401 stp(zr, zr, post(ptr, 16)); 5402 } 5403 bind(l); 5404 } 5405 { 5406 Label l; 5407 tbz(cnt, 0, l); 5408 str(zr, Address(ptr)); 5409 bind(l); 5410 } 5411 BLOCK_COMMENT("} zero_words"); 5412 } 5413 5414 // base: Address of a buffer to be zeroed, 8 bytes aligned. 5415 // cnt: Immediate count in HeapWords. 
// Largest size, in bytes, for which zero_words() below emits a fully
// unrolled store sequence instead of a loop.
#define SmallArraySize (18 * BytesPerLong)
// Emit code that zeroes a compile-time-known number of words (cnt) starting
// at base.  base itself is not modified; the looping variant uses rscratch1
// and rscratch2.
void MacroAssembler::zero_words(Register base, u_int64_t cnt)
{
  BLOCK_COMMENT("zero_words {");
  int i = cnt & 1;  // store any odd word to start
  if (i) str(zr, Address(base));

  if (cnt <= SmallArraySize / BytesPerLong) {
    // Small count: one pair store per remaining two words, fully unrolled.
    for (; i < (int)cnt; i += 2)
      stp(zr, zr, Address(base, i * wordSize));
  } else {
    const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
    // Words that do not fill a whole unrolled iteration are stored up front.
    int remainder = cnt % (2 * unroll);
    for (; i < remainder; i += 2)
      stp(zr, zr, Address(base, i * wordSize));

    Label loop;
    Register cnt_reg = rscratch1;
    Register loop_base = rscratch2;
    cnt = cnt - remainder;
    mov(cnt_reg, cnt);
    // adjust base and prebias by -2 * wordSize so we can pre-increment
    add(loop_base, base, (remainder - 2) * wordSize);
    bind(loop);
    sub(cnt_reg, cnt_reg, 2 * unroll);
    for (i = 1; i < unroll; i++)
      stp(zr, zr, Address(loop_base, 2 * i * wordSize));
    // The last store of the iteration also advances loop_base.
    stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
    cbnz(cnt_reg, loop);
  }
  BLOCK_COMMENT("} zero_words");
}

// Zero blocks of memory by using DC ZVA.
//
// Aligns the base address first sufficiently for DC ZVA, then uses
// DC ZVA repeatedly for every full block.  cnt is the size to be
// zeroed in HeapWords.  Returns the count of words left to be zeroed
// in cnt.
//
// NOTE: This is intended to be used in the zero_blocks() stub.  If
// you want to use it elsewhere, note that cnt must be >= 2*zva_length.
void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
  Register tmp = rscratch1;
  Register tmp2 = rscratch2;
  int zva_length = VM_Version::zva_length();
  Label initial_table_end, loop_zva;
  Label fini;

  // Base must be 16 byte aligned. If not just return and let caller handle it
  tst(base, 0x0f);
  br(Assembler::NE, fini);
  // Align base with ZVA length.
  neg(tmp, base);
  andr(tmp, tmp, zva_length - 1);

  // tmp: the number of bytes to be filled to align the base with ZVA length.
  add(base, base, tmp);
  sub(cnt, cnt, tmp, Assembler::ASR, 3);
  // Branch into the table of pair stores below so that exactly tmp bytes
  // are zeroed before the (already advanced) base: each stp covers 16
  // bytes, hence the tmp / 4 scaling of the branch distance.
  // NOTE(review): the scaling presumes each stp occupies 4 bytes of code -
  // confirm.
  adr(tmp2, initial_table_end);
  sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
  br(tmp2);

  for (int i = -zva_length + 16; i < 0; i += 16)
    stp(zr, zr, Address(base, i));
  bind(initial_table_end);

  // Pre-decrement by one block so the loop can test with GE.
  sub(cnt, cnt, zva_length >> 3);
  bind(loop_zva);
  dc(Assembler::ZVA, base);
  subs(cnt, cnt, zva_length >> 3);
  add(base, base, zva_length);
  br(Assembler::GE, loop_zva);
  add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
  bind(fini);
}

// base:  Address of a buffer to be filled, 8 bytes aligned.
// cnt:   Count in 8-byte unit.
// value: Value to be filled with.
// base will point to the end of the buffer after filling.
void MacroAssembler::fill_words(Register base, Register cnt, Register value)
{
  //  Algorithm:
  //
  //    scratch1 = cnt & 7;
  //    cnt -= scratch1;
  //    p += scratch1;
  //    switch (scratch1) {
  //      do {
  //        cnt -= 8;
  //          p[-8] = v;
  //        case 7:
  //          p[-7] = v;
  //        case 6:
  //          p[-6] = v;
  //          // ...
  //        case 1:
  //          p[-1] = v;
  //        case 0:
  //          p += 8;
  //      } while (cnt);
  //    }
  //
  // NOTE(review): the sketch above is schematic.  The emitted code stores
  // pairs of words with stp, masks cnt with (unroll-1)*2 and handles a
  // single leading / trailing word separately.

  assert_different_registers(base, cnt, value, rscratch1, rscratch2);

  Label fini, skip, entry, loop;
  const int unroll = 8; // Number of stp instructions we'll unroll

  cbz(cnt, fini);
  // If bit 3 of base is set, store one word first so that the pair stores
  // below start on a 16-byte boundary.
  tbz(base, 3, skip);
  str(value, Address(post(base, 8)));
  sub(cnt, cnt, 1);
  bind(skip);

  // rscratch1 = even number of words handled by the partial first pass.
  andr(rscratch1, cnt, (unroll-1) * 2);
  sub(cnt, cnt, rscratch1);
  add(base, base, rscratch1, Assembler::LSL, 3);
  // Branch backwards from 'entry' into the unrolled stores so that exactly
  // rscratch1 words are written on the first pass.
  // NOTE(review): the LSL 1 scaling presumes one 4-byte stp per two words -
  // confirm.
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
  br(rscratch2);

  bind(loop);
  add(base, base, unroll * 16);
  for (int i = -unroll; i < 0; i++)
    stp(value, value, Address(base, i * 16));
  bind(entry);
  subs(cnt, cnt, unroll * 2);
  br(Assembler::GE, loop);

  // Bit 0 of cnt survives the subtractions above: store the odd last word.
  tbz(cnt, 0, fini);
  str(value, Address(post(base, 8)));
  bind(fini);
}

// Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
// java/lang/StringUTF16.compress.
// Emit code that narrows 16-bit characters at src to bytes at dst, stopping
// at the first character whose high byte is non-zero.
//
// src:    address of the 16-bit characters.
// dst:    address of the byte output.
// len:    number of characters; on exit, the number left unprocessed.
// result: on exit, the number of characters processed (the initial len if
//         every character was narrowed).
// Vtmp1..Vtmp4: SIMD temporaries.
//
// src, dst, rscratch1 and rscratch2 are overwritten.  The 32-character
// paths additionally write v4 and v5 directly.
void MacroAssembler::encode_iso_array(Register src, Register dst,
                      Register len, Register result,
                      FloatRegister Vtmp1, FloatRegister Vtmp2,
                      FloatRegister Vtmp3, FloatRegister Vtmp4)
{
    Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
        NEXT_32_START, NEXT_32_PRFM_START;
    Register tmp1 = rscratch1, tmp2 = rscratch2;

      mov(result, len); // Save initial len

      cmp(len, 8); // handle shortest strings first
      br(LT, LOOP_1);
      cmp(len, 32);
      br(LT, NEXT_8);
      // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
      // to convert chars to bytes
      if (SoftwarePrefetchHintDistance >= 0) {
        ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        cmp(len, SoftwarePrefetchHintDistance/2 + 16);
        br(LE, NEXT_32_START);
        b(NEXT_32_PRFM_START);
        // 32-characters-per-iteration loop used while enough input remains
        // (the len comparison against SoftwarePrefetchHintDistance/2 + 16).
        BIND(NEXT_32_PRFM);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_PRFM_START);
          prfm(Address(src, SoftwarePrefetchHintDistance));
          // OR the inputs together so one uzp2 gathers every high byte;
          // any non-zero high byte means a character does not fit a byte.
          orr(v4, T16B, Vtmp1, Vtmp2);
          orr(v5, T16B, Vtmp3, Vtmp4);
          uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
          uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
          uzp2(v5, T16B, v4, v5); // high bytes
          umov(tmp2, v5, D, 1);
          fmovd(tmp1, v5);
          orr(tmp1, tmp1, tmp2);
          // Fall back to the 8-at-a-time loop to find where it stops.
          cbnz(tmp1, LOOP_8);
          stpq(Vtmp1, Vtmp3, dst);
          sub(len, len, 32);
          add(dst, dst, 32);
          add(src, src, 64);
          cmp(len, SoftwarePrefetchHintDistance/2 + 16);
          br(GE, NEXT_32_PRFM);
          cmp(len, 32);
          br(LT, LOOP_8);
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_START);
      } else {
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
      }
      // Same 32-character step as above, with the narrowed bytes built in
      // v4 / v5 and the high-byte check done in Vtmp1.
      prfm(Address(src, SoftwarePrefetchHintDistance));
      uzp1(v4, T16B, Vtmp1, Vtmp2);
      uzp1(v5, T16B, Vtmp3, Vtmp4);
      orr(Vtmp1, T16B, Vtmp1, Vtmp2);
      orr(Vtmp3, T16B, Vtmp3, Vtmp4);
      uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
      umov(tmp2, Vtmp1, D, 1);
      fmovd(tmp1, Vtmp1);
      orr(tmp1, tmp1, tmp2);
      cbnz(tmp1, LOOP_8);
      stpq(v4, v5, dst);
      sub(len, len, 32);
      add(dst, dst, 32);
      add(src, src, 64);
      cmp(len, 32);
      br(GE, NEXT_32);
      cbz(len, DONE);

    BIND(LOOP_8);
      cmp(len, 8);
      br(LT, LOOP_1);
    BIND(NEXT_8);
      ld1(Vtmp1, T8H, src);
      uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
      uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
      fmovd(tmp1, Vtmp3);
      // A non-zero high byte among these 8: locate it one character at a
      // time.
      cbnz(tmp1, NEXT_1);
      strd(Vtmp2, dst);

      sub(len, len, 8);
      add(dst, dst, 8);
      add(src, src, 16);
      cmp(len, 8);
      br(GE, NEXT_8);

    BIND(LOOP_1);

    cbz(len, DONE);
    BIND(NEXT_1);
      ldrh(tmp1, Address(post(src, 2)));
      tst(tmp1, 0xff00);
      br(NE, SET_RESULT);
      strb(tmp1, Address(post(dst, 1)));
      subs(len, len, 1);
      br(GT, NEXT_1);

    BIND(SET_RESULT);
      sub(result, result, len); // Return index where we stopped
                                // Return len == 0 if we processed all
                                // characters
    BIND(DONE);
}


// Inflate byte[] array to char[].
//
// src:  address of the input bytes.
// dst:  address of the 16-bit output.
// len:  number of bytes to inflate.
// vtmp1, vtmp2, vtmp3: SIMD temporaries; vtmp1 is zeroed and interleaved
//       with the input bytes to widen them.
// tmp4: general-purpose temporary.
//
// src, dst, len and tmp4 are overwritten.  When
// SoftwarePrefetchHintDistance >= 0, inputs of at least
// large_loop_threshold 8-byte groups are first handed to the
// large_byte_array_inflate stub.
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
                                        FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                        Register tmp4) {
  Label big, done, after_init, to_stub;

  assert_different_registers(src, dst, len, tmp4, rscratch1);

  fmovd(vtmp1, zr);
  // tmp4 = number of whole 8-byte groups.
  lsrw(tmp4, len, 3);
  bind(after_init);
  cbnzw(tmp4, big);
  // Short string: less than 8 bytes.
  {
    Label loop, tiny;

    cmpw(len, 4);
    br(LT, tiny);
    // Use SIMD to do 4 bytes.
    ldrs(vtmp2, post(src, 4));
    zip1(vtmp3, T8B, vtmp2, vtmp1);
    subw(len, len, 4);
    strd(vtmp3, post(dst, 8));

    cbzw(len, done);

    // Do the remaining bytes by steam.
    bind(loop);
    ldrb(tmp4, post(src, 1));
    strh(tmp4, post(dst, 2));
    subw(len, len, 1);

    bind(tiny);
    cbnz(len, loop);

    b(done);
  }

  if (SoftwarePrefetchHintDistance >= 0) {
    bind(to_stub);
      RuntimeAddress stub =  RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
      assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
      trampoline_call(stub);
      // Re-enter the dispatch above with whatever the stub left to do.
      b(after_init);
  }

  // Unpack the bytes 8 at a time.
  bind(big);
  {
    Label loop, around, loop_last, loop_start;

    if (SoftwarePrefetchHintDistance >= 0) {
      const int large_loop_threshold = (64 + 16)/8;
      ldrd(vtmp2, post(src, 8));
      // len now holds only the tail (len mod 8); tmp4 counts the groups.
      andw(len, len, 7);
      cmp(tmp4, large_loop_threshold);
      br(GE, to_stub);
      b(loop_start);

      // Two 8-byte groups per iteration, with the second load issued
      // between the first group's widen and store.
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      bind(loop_start);
      subs(tmp4, tmp4, 1);
      br(EQ, loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      ldrd(vtmp3, post(src, 8));
      st1(vtmp2, T8H, post(dst, 16));
      subs(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp3, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      br(NE, loop);
      b(around);
      bind(loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      st1(vtmp2, T8H, post(dst, 16));
      bind(around);
      cbz(len, done);
    } else {
      andw(len, len, 7);
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      sub(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp2, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      cbnz(tmp4, loop);
    }
  }

  // Do the tail of up to 8 bytes.
  // Re-read the 8 bytes that end exactly at the end of the input and store
  // the 16 bytes that end exactly at the end of the output; this overlaps
  // data already written by the loop above.
  add(src, src, len);
  ldrd(vtmp3, Address(src, -8));
  add(dst, dst, len, ext::uxtw, 1);
  zip1(vtmp3, T16B, vtmp3, vtmp1);
  strq(vtmp3, Address(dst, -16));

  bind(done);
}

// Compress char[] array to byte[].
5756 void MacroAssembler::char_array_compress(Register src, Register dst, Register len, 5757 FloatRegister tmp1Reg, FloatRegister tmp2Reg, 5758 FloatRegister tmp3Reg, FloatRegister tmp4Reg, 5759 Register result) { 5760 encode_iso_array(src, dst, len, result, 5761 tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg); 5762 cmp(len, zr); 5763 csel(result, result, zr, EQ); 5764 } 5765 5766 // get_thread() can be called anywhere inside generated code so we 5767 // need to save whatever non-callee save context might get clobbered 5768 // by the call to JavaThread::aarch64_get_thread_helper() or, indeed, 5769 // the call setup code. 5770 // 5771 // aarch64_get_thread_helper() clobbers only r0, r1, and flags. 5772 // 5773 void MacroAssembler::get_thread(Register dst) { 5774 RegSet saved_regs = RegSet::range(r0, r1) + lr - dst; 5775 push(saved_regs, sp); 5776 5777 mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper)); 5778 blr(lr); 5779 if (dst != c_rarg0) { 5780 mov(dst, c_rarg0); 5781 } 5782 5783 pop(saved_regs, sp); 5784 }