/*
 * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "interpreter/interpreter.hpp"

#include "compiler/disassembler.hpp"
#include "gc/shared/collectedHeap.hpp"
#include "gc/shenandoah/brooksPointer.hpp"
#include "gc/shenandoah/shenandoahHeap.hpp"
#include "gc/shenandoah/shenandoahHeap.inline.hpp"
#include "gc/shenandoah/shenandoahHeapRegion.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/klass.inline.hpp"
#include "oops/oop.inline.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/node.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/thread.hpp"

#if INCLUDE_ALL_GCS
#include "gc/g1/g1CollectedHeap.inline.hpp"
#include "gc/g1/g1SATBCardTableModRefBS.hpp"
#include "gc/g1/heapRegion.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
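//
// All of the branch-immediate forms below encode a word-scaled offset,
// so the patcher computes offset = (target - branch) >> 2 and writes it
// into the instruction's immediate field.  Illustrative example (assumed
// values): a B to a target 1 MB (0x100000 bytes) ahead stores
// 0x100000 >> 2 == 0x40000 in its imm26 field.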
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target-branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      u_int64_t dest = (u_int64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
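      //
      // Illustrative example (assumed values): with branch == 0x4000_0010
      // and target == 0x4000_3123, adr_page - pc_page == 3 (adrp counts
      // 4 KB pages) and offset_lo == 0x123 is the byte offset within the
      // page patched into the dependent ldr/str or add.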
      //
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
          Instruction_aarch64::extract(insn, 4, 0) ==
          Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
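  //
  // Sketch of the two patchable sequences this code expects:
  //   narrow: movz Rd, #(n >> 16), lsl #16 ; movk Rd, #(n & 0xffff)
  //   wide:   movz Rd, #lo16 ; movk Rd, #mid16, lsl #16 ; movk Rd, #hi16, lsl #32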
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = oopDesc::encode_heap_oop((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
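      // (This is the decoding counterpart of pd_patch_instruction_size
      // above: the same four adrp-based sequences are recognised, and
      // the Rx register match distinguishes types 1-3 from type 4.)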
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
          Instruction_aarch64::extract(insn, 4, 0) ==
          Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
            Instruction_aarch64::extract(insn, 4, 0) ==
            Instruction_aarch64::extract(insn2, 4, 0)) {
          target_page = (target_page & 0xffffffff) |
                        ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    u_int32_t *insns = (u_int32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    return 0;
  } else {
    ShouldNotReachHere();
  }
  return address(((uint64_t)insn_addr + (offset << 2)));
}

void MacroAssembler::serialize_memory(Register thread, Register tmp) {
  dsb(Assembler::SY);
}


void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp & esp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
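//
// (The anchor fields live in the JavaFrameAnchor embedded in the
// JavaThread; the stack walker uses last_Java_sp/fp/pc to resume
// traversal at the last Java frame while the thread is in native or
// VM code.)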
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
    str(last_java_pc, Address(rthread,
                              JavaThread::frame_anchor_offset()
                              + JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address last_java_pc,
                                         Register scratch) {
  if (last_java_pc != NULL) {
    adr(scratch, last_java_pc);
  } else {
    // FIXME: This is almost never correct.  We should delete all
    // cases of set_last_Java_frame with last_java_pc=NULL and use the
    // correct return address instead.
    adr(scratch, pc());
  }

  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch);
  }
}

void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    blr(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}

void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    br(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}

void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;

  ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
  cmp(sp, rscratch1);
  br(Assembler::LO, no_reserved_zone_enabling);

  enter();   // LR and FP are live.
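  // Reserved-zone case (JEP 270): call into the runtime to re-enable
  // the reserved guard zone, then throw the delayed StackOverflowError
  // from its stub once our own frame has been removed.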
  lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
  mov(c_rarg0, rthread);
  blr(rscratch1);
  leave();

  // We have already removed our own frame.
  // throw_delayed_StackOverflowError will think that it's been
  // called by our caller.
  lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
  br(rscratch1);
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  shenandoah_store_addr_check(obj_reg);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(tmp_reg, markOopDesc::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
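  // (Assumed bit layout, cf. markOop.hpp on 64-bit: a biased mark word is
  //  [JavaThread*:54 | epoch:2 | unused:1 | age:4 | biased_lock:1 | lock:2],
  //  so the low three bits masked throughout this code are biased_lock
  //  plus the two lock bits.)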
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1, rscratch2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  shenandoah_store_addr_check(obj_reg); // Access mark word
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(temp_reg, markOopDesc::biased_lock_pattern);
  br(Assembler::EQ, done);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg ) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg ) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg ) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg ) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline.  If the code cache is small
// trampolines won't be emitted.

address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  unsigned int start_offset = offset();
  if (far_branches() && !Compile::current()->in_scratch_emit_size()) {
    address stub = emit_trampoline_stub(start_offset, entry.target());
    if (stub == NULL) {
      return NULL; // CodeCache is full
    }
  }

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
  if (!far_branches()) {
    bl(entry.target());
  } else {
    bl(pc());
  }
  // just need to return a non-null address
  return pc();
}


// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)

address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  address stub = start_a_stub(Compile::MAX_stubs_size/2);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
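  //
  // The stub emitted below has the following layout (cf.
  // NativeCallTrampolineStub; the data_offset assert below checks it):
  //   +0:  ldr  rscratch1, +8   // load the 64-bit destination
  //   +4:  br   rscratch1
  //   +8:  <8-byte destination address>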
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call
  // - call
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub;
}

address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}


void MacroAssembler:: notify(int type) {
  if (type == bytecode_start) {
    // set_last_Java_frame(esp, rfp, (address)NULL);
    Assembler:: notify(type);
    // reset_last_Java_frame(true);
  }
  else
    Assembler:: notify(type);
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface) {
  assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step = itableOffsetEntry::size() * wordSize;
  int vte_size = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);

  // Adjust recv_klass by scaled itable_index, so we can free itable_index.
  assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
  // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
  lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
  if (itentry_off)
    add(recv_klass, recv_klass, itentry_off);

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  ldr(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
  ldr(method_result, Address(recv_klass, scan_temp));
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result, Address(recv_klass, vtable_offset_in_bytes));
  }
}

void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);
  cmp(super_klass, rscratch1); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    cmp(super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// scans count pointer sized words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

// scans count 4 byte words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
                                 Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldrw(scratch, post(addr, wordSize));
  cmpw(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  BLOCK_COMMENT("check_klass_subtype_slow_path");

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
  assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)

  // Get super_klass value into r0 (even if it was in r5 or r2).
  RegSet pushed_registers;
  if (!IS_A_TEMP(r2))    pushed_registers += r2;
  if (!IS_A_TEMP(r5))    pushed_registers += r5;

  if (super_klass != r0 || UseCompressedOops) {
    if (!IS_A_TEMP(r0))   pushed_registers += r0;
  }

  push(pushed_registers, sp);

#ifndef PRODUCT
  mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
  Address pst_counter_addr(rscratch2);
  ldr(rscratch1, pst_counter_addr);
  add(rscratch1, rscratch1, 1);
  str(rscratch1, pst_counter_addr);
#endif //PRODUCT

  // We will consult the secondary-super array.
  ldr(r5, secondary_supers_addr);
  // Load the array length.
  ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  add(r5, r5, Array<Klass*>::base_offset_in_bytes());

  cmp(sp, zr); // Clear Z flag; SP is never zero
  // Scan R2 words at [R5] for an occurrence of R0.
  // Set NZ/Z based on last compare.
  repne_scan(r5, r0, r2, rscratch1);

  // Unspill the temp. registers:
  pop(pushed_registers, sp);

  br(Assembler::NE, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  str(super_klass, super_cache_addr);

  if (L_success != &L_fallthrough) {
    b(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}


void MacroAssembler::verify_oop(Register reg, const char* s) {
  if (!VerifyOops) return;

  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s", reg->name(), s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  mov(r0, reg);
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop");
}

void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops) return;

  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s", s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop_addr {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  // addr may contain sp so we will have to adjust it based on the
  // pushes that we just did.
  if (addr.uses(sp)) {
    lea(r0, addr);
    ldr(r0, Address(r0, 4 * wordSize));
  } else {
    ldr(r0, addr);
  }
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop_addr");
}

Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
                                         int extra_slot_offset) {
  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
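  //
  // Illustrative example (assumed 8-byte stack elements): with
  // extra_slot_offset == 0, a constant arg_slot of 2 resolves to
  // Address(esp, 2 * 8 + Interpreter::expr_offset_in_bytes(0)).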
1309 int stackElementSize = Interpreter::stackElementSize; 1310 int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0); 1311 #ifdef ASSERT 1312 int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1); 1313 assert(offset1 - offset == stackElementSize, "correct arithmetic"); 1314 #endif 1315 if (arg_slot.is_constant()) { 1316 return Address(esp, arg_slot.as_constant() * stackElementSize 1317 + offset); 1318 } else { 1319 add(rscratch1, esp, arg_slot.as_register(), 1320 ext::uxtx, exact_log2(stackElementSize)); 1321 return Address(rscratch1, offset); 1322 } 1323 } 1324 1325 void MacroAssembler::call_VM_leaf_base(address entry_point, 1326 int number_of_arguments, 1327 Label *retaddr) { 1328 call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr); 1329 } 1330 1331 void MacroAssembler::call_VM_leaf_base1(address entry_point, 1332 int number_of_gp_arguments, 1333 int number_of_fp_arguments, 1334 ret_type type, 1335 Label *retaddr) { 1336 Label E, L; 1337 1338 stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize))); 1339 1340 // We add 1 to number_of_arguments because the thread in arg0 is 1341 // not counted 1342 mov(rscratch1, entry_point); 1343 blrt(rscratch1, number_of_gp_arguments + 1, number_of_fp_arguments, type); 1344 if (retaddr) 1345 bind(*retaddr); 1346 1347 ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize))); 1348 maybe_isb(); 1349 } 1350 1351 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) { 1352 call_VM_leaf_base(entry_point, number_of_arguments); 1353 } 1354 1355 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) { 1356 pass_arg0(this, arg_0); 1357 call_VM_leaf_base(entry_point, 1); 1358 } 1359 1360 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) { 1361 pass_arg0(this, arg_0); 1362 pass_arg1(this, arg_1); 1363 call_VM_leaf_base(entry_point, 2); 1364 } 1365 1366 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, 1367 Register arg_1, Register arg_2) { 1368 pass_arg0(this, arg_0); 1369 pass_arg1(this, arg_1); 1370 pass_arg2(this, arg_2); 1371 call_VM_leaf_base(entry_point, 3); 1372 } 1373 1374 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) { 1375 pass_arg0(this, arg_0); 1376 MacroAssembler::call_VM_leaf_base(entry_point, 1); 1377 } 1378 1379 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) { 1380 1381 assert(arg_0 != c_rarg1, "smashed arg"); 1382 pass_arg1(this, arg_1); 1383 pass_arg0(this, arg_0); 1384 MacroAssembler::call_VM_leaf_base(entry_point, 2); 1385 } 1386 1387 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) { 1388 assert(arg_0 != c_rarg2, "smashed arg"); 1389 assert(arg_1 != c_rarg2, "smashed arg"); 1390 pass_arg2(this, arg_2); 1391 assert(arg_0 != c_rarg1, "smashed arg"); 1392 pass_arg1(this, arg_1); 1393 pass_arg0(this, arg_0); 1394 MacroAssembler::call_VM_leaf_base(entry_point, 3); 1395 } 1396 1397 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) { 1398 assert(arg_0 != c_rarg3, "smashed arg"); 1399 assert(arg_1 != c_rarg3, "smashed arg"); 1400 assert(arg_2 != c_rarg3, "smashed arg"); 1401 pass_arg3(this, arg_3); 1402 assert(arg_0 != c_rarg2, "smashed arg"); 1403 assert(arg_1 != c_rarg2, "smashed arg"); 1404 pass_arg2(this, arg_2); 1405 assert(arg_0 != c_rarg1, "smashed arg"); 1406 pass_arg1(this, 
arg_1); 1407 pass_arg0(this, arg_0); 1408 MacroAssembler::call_VM_leaf_base(entry_point, 4); 1409 } 1410 1411 void MacroAssembler::null_check(Register reg, int offset) { 1412 if (needs_explicit_null_check(offset)) { 1413 // provoke OS NULL exception if reg = NULL by 1414 // accessing M[reg] w/o changing any registers 1415 // NOTE: this is plenty to provoke a segv 1416 ldr(zr, Address(reg)); 1417 } else { 1418 // nothing to do, (later) access of M[reg + offset] 1419 // will provoke OS NULL exception if reg = NULL 1420 } 1421 } 1422 1423 // MacroAssembler protected routines needed to implement 1424 // public methods 1425 1426 void MacroAssembler::mov(Register r, Address dest) { 1427 code_section()->relocate(pc(), dest.rspec()); 1428 u_int64_t imm64 = (u_int64_t)dest.target(); 1429 movptr(r, imm64); 1430 } 1431 1432 // Move a constant pointer into r. In AArch64 mode the virtual 1433 // address space is 48 bits in size, so we only need three 1434 // instructions to create a patchable instruction sequence that can 1435 // reach anywhere. 1436 void MacroAssembler::movptr(Register r, uintptr_t imm64) { 1437 #ifndef PRODUCT 1438 { 1439 char buffer[64]; 1440 snprintf(buffer, sizeof(buffer), "0x%"PRIX64, imm64); 1441 block_comment(buffer); 1442 } 1443 #endif 1444 assert(imm64 < (1ul << 48), "48-bit overflow in address constant"); 1445 movz(r, imm64 & 0xffff); 1446 imm64 >>= 16; 1447 movk(r, imm64 & 0xffff, 16); 1448 imm64 >>= 16; 1449 movk(r, imm64 & 0xffff, 32); 1450 } 1451 1452 // Macro to mov replicated immediate to vector register. 1453 // Vd will get the following values for different arrangements in T 1454 // imm32 == hex 000000gh T8B: Vd = ghghghghghghghgh 1455 // imm32 == hex 000000gh T16B: Vd = ghghghghghghghghghghghghghghghgh 1456 // imm32 == hex 0000efgh T4H: Vd = efghefghefghefgh 1457 // imm32 == hex 0000efgh T8H: Vd = efghefghefghefghefghefghefghefgh 1458 // imm32 == hex abcdefgh T2S: Vd = abcdefghabcdefgh 1459 // imm32 == hex abcdefgh T4S: Vd = abcdefghabcdefghabcdefghabcdefgh 1460 // T1D/T2D: invalid 1461 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) { 1462 assert(T != T1D && T != T2D, "invalid arrangement"); 1463 if (T == T8B || T == T16B) { 1464 assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)"); 1465 movi(Vd, T, imm32 & 0xff, 0); 1466 return; 1467 } 1468 u_int32_t nimm32 = ~imm32; 1469 if (T == T4H || T == T8H) { 1470 assert((imm32 & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)"); 1471 imm32 &= 0xffff; 1472 nimm32 &= 0xffff; 1473 } 1474 u_int32_t x = imm32; 1475 int movi_cnt = 0; 1476 int movn_cnt = 0; 1477 while (x) { if (x & 0xff) movi_cnt++; x >>= 8; } 1478 x = nimm32; 1479 while (x) { if (x & 0xff) movn_cnt++; x >>= 8; } 1480 if (movn_cnt < movi_cnt) imm32 = nimm32; 1481 unsigned lsl = 0; 1482 while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; } 1483 if (movn_cnt < movi_cnt) 1484 mvni(Vd, T, imm32 & 0xff, lsl); 1485 else 1486 movi(Vd, T, imm32 & 0xff, lsl); 1487 imm32 >>= 8; lsl += 8; 1488 while (imm32) { 1489 while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; } 1490 if (movn_cnt < movi_cnt) 1491 bici(Vd, T, imm32 & 0xff, lsl); 1492 else 1493 orri(Vd, T, imm32 & 0xff, lsl); 1494 lsl += 8; imm32 >>= 8; 1495 } 1496 } 1497 1498 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64) 1499 { 1500 #ifndef PRODUCT 1501 { 1502 char buffer[64]; 1503 snprintf(buffer, sizeof(buffer), "0x%"PRIX64, imm64); 1504 block_comment(buffer); 1505 } 1506 #endif 1507 if 

// Macro to mov replicated immediate to vector register.
//  Vd will get the following values for different arrangements in T
//   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
//   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
//   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
//   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
//   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
//   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
//   T1D/T2D: invalid
void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
  assert(T != T1D && T != T2D, "invalid arrangement");
  if (T == T8B || T == T16B) {
    assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
    movi(Vd, T, imm32 & 0xff, 0);
    return;
  }
  u_int32_t nimm32 = ~imm32;
  if (T == T4H || T == T8H) {
    assert((imm32 & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
    imm32 &= 0xffff;
    nimm32 &= 0xffff;
  }
  u_int32_t x = imm32;
  int movi_cnt = 0;
  int movn_cnt = 0;
  while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
  x = nimm32;
  while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
  if (movn_cnt < movi_cnt) imm32 = nimm32;
  unsigned lsl = 0;
  while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
  if (movn_cnt < movi_cnt)
    mvni(Vd, T, imm32 & 0xff, lsl);
  else
    movi(Vd, T, imm32 & 0xff, lsl);
  imm32 >>= 8; lsl += 8;
  while (imm32) {
    while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
    if (movn_cnt < movi_cnt)
      bici(Vd, T, imm32 & 0xff, lsl);
    else
      orri(Vd, T, imm32 & 0xff, lsl);
    lsl += 8; imm32 >>= 8;
  }
}

void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
{
#ifndef PRODUCT
  {
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%"PRIX64, imm64);
    block_comment(buffer);
  }
#endif
  if (operand_valid_for_logical_immediate(false, imm64)) {
    orr(dst, zr, imm64);
  } else {
    // we can use a combination of MOVZ or MOVN with
    // MOVK to build up the constant
    u_int64_t imm_h[4];
    int zero_count = 0;
    int neg_count = 0;
    int i;
    for (i = 0; i < 4; i++) {
      imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
      if (imm_h[i] == 0) {
        zero_count++;
      } else if (imm_h[i] == 0xffffL) {
        neg_count++;
      }
    }
    if (zero_count == 4) {
      // one MOVZ will do
      movz(dst, 0);
    } else if (neg_count == 4) {
      // one MOVN will do
      movn(dst, 0);
    } else if (zero_count == 3) {
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0L) {
          movz(dst, (u_int32_t)imm_h[i], (i << 4));
          break;
        }
      }
    } else if (neg_count == 3) {
      // one MOVN will do
      for (int i = 0; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
          break;
        }
      }
    } else if (zero_count == 2) {
      // one MOVZ and one MOVK will do
      for (i = 0; i < 3; i++) {
        if (imm_h[i] != 0L) {
          movz(dst, (u_int32_t)imm_h[i], (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0L) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else if (neg_count == 2) {
      // one MOVN and one MOVK will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else if (zero_count == 1) {
      // one MOVZ and two MOVKs will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0L) {
          movz(dst, (u_int32_t)imm_h[i], (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0x0L) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else if (neg_count == 1) {
      // one MOVN and two MOVKs will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else {
      // use a MOVZ and 3 MOVKs (makes it easier to debug)
      movz(dst, (u_int32_t)imm_h[0], 0);
      for (i = 1; i < 4; i++) {
        movk(dst, (u_int32_t)imm_h[i], (i << 4));
      }
    }
  }
}

void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
{
#ifndef PRODUCT
  {
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%"PRIX32, imm32);
    block_comment(buffer);
  }
#endif
  if (operand_valid_for_logical_immediate(true, imm32)) {
    orrw(dst, zr, imm32);
  } else {
    // we can use MOVZ, MOVN or two calls to MOVK to build up the
    // constant
    u_int32_t imm_h[2];
    imm_h[0] = imm32 & 0xffff;
    imm_h[1] = ((imm32 >> 16) & 0xffff);
    if (imm_h[0] == 0) {
      movzw(dst, imm_h[1], 16);
    } else if (imm_h[0] == 0xffff) {
      movnw(dst, imm_h[1] ^ 0xffff, 16);
    } else if (imm_h[1] == 0) {
      movzw(dst, imm_h[0], 0);
    } else if (imm_h[1] == 0xffff) {
      movnw(dst, imm_h[0] ^ 0xffff, 0);
    } else {
      // use a MOVZ and MOVK (makes it easier to debug)
      movzw(dst, imm_h[0], 0);
      movkw(dst, imm_h[1], 16);
    }
  }
}
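
// Worked example for mov_immediate64 above (a sketch, not original
// commentary): for imm64 == 0x0000_ffff_ffff_1234 the halfwords are
// {0x1234, 0xffff, 0xffff, 0x0000}, so zero_count == 1 and
// neg_count == 2, and the neg_count == 2 arm emits
//   movn(dst, 0x1234 ^ 0xffff, 0);  // dst = 0xffff_ffff_ffff_1234
//   movk(dst, 0x0000, 48);          // dst = 0x0000_ffff_ffff_1234
// i.e. one MOVN plus one MOVK instead of a four-instruction build.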

// Form an address from base + offset in Rd.  Rd may or may
// not actually be used: you must use the Address that is returned.
// It is up to you to ensure that the shift provided matches the size
// of your data.
Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
  if (Address::offset_ok_for_immed(byte_offset, shift))
    // It fits; no need for any heroics
    return Address(base, byte_offset);

  // Don't do anything clever with negative or misaligned offsets
  unsigned mask = (1 << shift) - 1;
  if (byte_offset < 0 || byte_offset & mask) {
    mov(Rd, byte_offset);
    add(Rd, base, Rd);
    return Address(Rd);
  }

  // See if we can do this with two 12-bit offsets
  {
    unsigned long word_offset = byte_offset >> shift;
    unsigned long masked_offset = word_offset & 0xfff000;
    if (Address::offset_ok_for_immed(word_offset - masked_offset)
        && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
      add(Rd, base, masked_offset << shift);
      word_offset -= masked_offset;
      return Address(Rd, word_offset << shift);
    }
  }

  // Do it the hard way
  mov(Rd, byte_offset);
  add(Rd, base, Rd);
  return Address(Rd);
}

void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
  if (UseLSE) {
    mov(tmp, 1);
    ldadd(Assembler::word, tmp, zr, counter_addr);
    return;
  }
  Label retry_load;
  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
    prfm(Address(counter_addr), PSTL1STRM);
  bind(retry_load);
  // flush and load exclusive from the memory location
  ldxrw(tmp, counter_addr);
  addw(tmp, tmp, 1);
  // if we store+flush with no intervening write tmp2 will be zero
  stxrw(tmp2, tmp, counter_addr);
  cbnzw(tmp2, retry_load);
}

int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
                                    bool want_remainder, Register scratch)
{
  // Full implementation of Java idiv and irem.  The function
  // returns the (pc) offset of the div instruction - may be needed
  // for implicit exceptions.
  //
  // constraint : ra/rb =/= scratch
  //         normal case
  //
  // input : ra: dividend
  //         rb: divisor
  //
  // result: either
  //         quotient  (= ra idiv rb)
  //         remainder (= ra irem rb)

  assert(ra != scratch && rb != scratch, "reg cannot be scratch");

  int idivl_offset = offset();
  if (! want_remainder) {
    sdivw(result, ra, rb);
  } else {
    sdivw(scratch, ra, rb);
    Assembler::msubw(result, scratch, rb, ra);
  }

  return idivl_offset;
}
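
// Worked example (illustrative, not original commentary): Java
// division truncates toward zero and the remainder takes the sign of
// the dividend, which is exactly what sdivw + msubw produce.
// For ra = -7, rb = 2:
//   sdivw  scratch, ra, rb          // scratch = -3 (quotient)
//   msubw  result, scratch, rb, ra  // result = -7 - (-3 * 2) = -1
// The 64-bit variant below is identical with sdiv/msub.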

int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
                                    bool want_remainder, Register scratch)
{
  // Full implementation of Java ldiv and lrem.  The function
  // returns the (pc) offset of the div instruction - may be needed
  // for implicit exceptions.
  //
  // constraint : ra/rb =/= scratch
  //         normal case
  //
  // input : ra: dividend
  //         rb: divisor
  //
  // result: either
  //         quotient  (= ra idiv rb)
  //         remainder (= ra irem rb)

  assert(ra != scratch && rb != scratch, "reg cannot be scratch");

  int idivq_offset = offset();
  if (! want_remainder) {
    sdiv(result, ra, rb);
  } else {
    sdiv(scratch, ra, rb);
    Assembler::msub(result, scratch, rb, ra);
  }

  return idivq_offset;
}

void MacroAssembler::membar(Membar_mask_bits order_constraint) {
  address prev = pc() - NativeMembar::instruction_size;
  if (prev == code()->last_membar()) {
    NativeMembar *bar = NativeMembar_at(prev);
    // We are merging two memory barrier instructions.  On AArch64 we
    // can do this simply by ORing them together.
    bar->set_kind(bar->get_kind() | order_constraint);
    BLOCK_COMMENT("merged membar");
  } else {
    code()->set_last_membar(pc());
    dmb(Assembler::barrier(order_constraint));
  }
}
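
// Illustrative note (a sketch, not original commentary): because the
// last emitted membar is remembered, two back-to-back requests such as
//   membar(StoreStore);
//   membar(LoadStore);
// collapse into the single dmb already sitting at last_membar(), with
// its kind widened to (StoreStore | LoadStore), rather than emitting a
// second barrier instruction.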

// MacroAssembler routines found actually to be needed

void MacroAssembler::push(Register src)
{
  str(src, Address(pre(esp, -1 * wordSize)));
}

void MacroAssembler::pop(Register dst)
{
  ldr(dst, Address(post(esp, 1 * wordSize)));
}

// Note: load_unsigned_short used to be called load_unsigned_word.
int MacroAssembler::load_unsigned_short(Register dst, Address src) {
  int off = offset();
  ldrh(dst, src);
  return off;
}

int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
  int off = offset();
  ldrb(dst, src);
  return off;
}

int MacroAssembler::load_signed_short(Register dst, Address src) {
  int off = offset();
  ldrsh(dst, src);
  return off;
}

int MacroAssembler::load_signed_byte(Register dst, Address src) {
  int off = offset();
  ldrsb(dst, src);
  return off;
}

int MacroAssembler::load_signed_short32(Register dst, Address src) {
  int off = offset();
  ldrshw(dst, src);
  return off;
}

int MacroAssembler::load_signed_byte32(Register dst, Address src) {
  int off = offset();
  ldrsbw(dst, src);
  return off;
}

void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
  switch (size_in_bytes) {
  case 8:  ldr(dst, src); break;
  case 4:  ldrw(dst, src); break;
  case 2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
  case 1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
  default: ShouldNotReachHere();
  }
}

void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
  switch (size_in_bytes) {
  case 8:  str(src, dst); break;
  case 4:  strw(src, dst); break;
  case 2:  strh(src, dst); break;
  case 1:  strb(src, dst); break;
  default: ShouldNotReachHere();
  }
}

void MacroAssembler::decrementw(Register reg, int value)
{
  if (value < 0)  { incrementw(reg, -value);      return; }
  if (value == 0) {                               return; }
  if (value < (1 << 12)) { subw(reg, reg, value); return; }
  /* else */ {
    guarantee(reg != rscratch2, "invalid dst for register decrement");
    movw(rscratch2, (unsigned)value);
    subw(reg, reg, rscratch2);
  }
}

void MacroAssembler::decrement(Register reg, int value)
{
  if (value < 0)  { increment(reg, -value);      return; }
  if (value == 0) {                              return; }
  if (value < (1 << 12)) { sub(reg, reg, value); return; }
  /* else */ {
    assert(reg != rscratch2, "invalid dst for register decrement");
    mov(rscratch2, (unsigned long)value);
    sub(reg, reg, rscratch2);
  }
}

void MacroAssembler::decrementw(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid dst for address decrement");
  ldrw(rscratch1, dst);
  decrementw(rscratch1, value);
  strw(rscratch1, dst);
}

void MacroAssembler::decrement(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid address for decrement");
  ldr(rscratch1, dst);
  decrement(rscratch1, value);
  str(rscratch1, dst);
}

void MacroAssembler::incrementw(Register reg, int value)
{
  if (value < 0)  { decrementw(reg, -value);      return; }
  if (value == 0) {                               return; }
  if (value < (1 << 12)) { addw(reg, reg, value); return; }
  /* else */ {
    assert(reg != rscratch2, "invalid dst for register increment");
    movw(rscratch2, (unsigned)value);
    addw(reg, reg, rscratch2);
  }
}

void MacroAssembler::increment(Register reg, int value)
{
  if (value < 0)  { decrement(reg, -value);      return; }
  if (value == 0) {                              return; }
  if (value < (1 << 12)) { add(reg, reg, value); return; }
  /* else */ {
    assert(reg != rscratch2, "invalid dst for register increment");
    movw(rscratch2, (unsigned)value);
    add(reg, reg, rscratch2);
  }
}

void MacroAssembler::incrementw(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid dst for address increment");
  ldrw(rscratch1, dst);
  incrementw(rscratch1, value);
  strw(rscratch1, dst);
}

void MacroAssembler::increment(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid dst for address increment");
  ldr(rscratch1, dst);
  increment(rscratch1, value);
  str(rscratch1, dst);
}
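
// Illustrative sketch (not original commentary): the 12-bit immediate
// cutoff decides between a single instruction and a scratch-register
// build, e.g.
//   increment(r0, 4095);  // add r0, r0, #4095 - fits in imm12
//   increment(r0, 5000);  // movw rscratch2, #5000; add r0, r0, rscratch2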

void MacroAssembler::pusha() {
  push(0x7fffffff, sp);
}

void MacroAssembler::popa() {
  pop(0x7fffffff, sp);
}

// Push lots of registers in the bit set supplied.  Don't push sp.
// Return the number of words pushed
int MacroAssembler::push(unsigned int bitset, Register stack) {
  int words_pushed = 0;

  // Scan bitset to accumulate register pairs
  unsigned char regs[32];
  int count = 0;
  for (int reg = 0; reg <= 30; reg++) {
    if (1 & bitset)
      regs[count++] = reg;
    bitset >>= 1;
  }
  regs[count++] = zr->encoding_nocheck();
  count &= ~1;  // Only push an even number of regs

  if (count) {
    stp(as_Register(regs[0]), as_Register(regs[1]),
        Address(pre(stack, -count * wordSize)));
    words_pushed += 2;
  }
  for (int i = 2; i < count; i += 2) {
    stp(as_Register(regs[i]), as_Register(regs[i+1]),
        Address(stack, i * wordSize));
    words_pushed += 2;
  }

  assert(words_pushed == count, "oops, pushed != count");

  return count;
}

int MacroAssembler::pop(unsigned int bitset, Register stack) {
  int words_popped = 0;

  // Scan bitset to accumulate register pairs
  unsigned char regs[32];
  int count = 0;
  for (int reg = 0; reg <= 30; reg++) {
    if (1 & bitset)
      regs[count++] = reg;
    bitset >>= 1;
  }
  regs[count++] = zr->encoding_nocheck();
  count &= ~1;

  for (int i = 2; i < count; i += 2) {
    ldp(as_Register(regs[i]), as_Register(regs[i+1]),
        Address(stack, i * wordSize));
    words_popped += 2;
  }
  if (count) {
    ldp(as_Register(regs[0]), as_Register(regs[1]),
        Address(post(stack, count * wordSize)));
    words_popped += 2;
  }

  assert(words_popped == count, "oops, popped != count");

  return count;
}

#ifdef ASSERT
void MacroAssembler::verify_heapbase(const char* msg) {
#if 0
  assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  if (CheckCompressedOops) {
    Label ok;
    push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
    cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
    br(Assembler::EQ, ok);
    stop(msg);
    bind(ok);
    pop(1 << rscratch1->encoding(), sp);
  }
#endif
}
#endif

void MacroAssembler::stop(const char* msg) {
  address ip = pc();
  pusha();
  mov(c_rarg0, (address)msg);
  mov(c_rarg1, (address)ip);
  mov(c_rarg2, sp);
  mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
  // call(c_rarg3);
  blrt(c_rarg3, 3, 0, 1);
  hlt(0);
}

// If a constant does not fit in an immediate field, generate some
// number of MOV instructions and then perform the operation.
void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
                                           add_sub_imm_insn insn1,
                                           add_sub_reg_insn insn2) {
  assert(Rd != zr, "Rd = zr and not setting flags?");
  if (operand_valid_for_add_sub_immediate((int)imm)) {
    (this->*insn1)(Rd, Rn, imm);
  } else {
    if (uabs(imm) < (1 << 24)) {
      (this->*insn1)(Rd, Rn, imm & -(1 << 12));
      (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
    } else {
      assert_different_registers(Rd, Rn);
      mov(Rd, (uint64_t)imm);
      (this->*insn2)(Rd, Rn, Rd, LSL, 0);
    }
  }
}
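
// Worked example (illustrative, not original commentary): an add of
// 0x123456 does not fit in a 12-bit immediate, but uabs(imm) < 2^24,
// so it is split across the two shifted imm12 fields:
//   add(Rd, Rn, 0x123000);  // imm & -(1 << 12), uses the LSL #12 form
//   add(Rd, Rd, 0x456);     // imm & 0xfff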

// Separate version which sets the flags. Optimisations are more restricted
// because we must set the flags correctly.
void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
                                             add_sub_imm_insn insn1,
                                             add_sub_reg_insn insn2) {
  if (operand_valid_for_add_sub_immediate((int)imm)) {
    (this->*insn1)(Rd, Rn, imm);
  } else {
    assert_different_registers(Rd, Rn);
    assert(Rd != zr, "overflow in immediate operand");
    mov(Rd, (uint64_t)imm);
    (this->*insn2)(Rd, Rn, Rd, LSL, 0);
  }
}


void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
  if (increment.is_register()) {
    add(Rd, Rn, increment.as_register());
  } else {
    add(Rd, Rn, increment.as_constant());
  }
}

void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
  if (increment.is_register()) {
    addw(Rd, Rn, increment.as_register());
  } else {
    addw(Rd, Rn, increment.as_constant());
  }
}

void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
  if (decrement.is_register()) {
    sub(Rd, Rn, decrement.as_register());
  } else {
    sub(Rd, Rn, decrement.as_constant());
  }
}

void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
  if (decrement.is_register()) {
    subw(Rd, Rn, decrement.as_register());
  } else {
    subw(Rd, Rn, decrement.as_constant());
  }
}

void MacroAssembler::reinit_heapbase()
{
  if (UseCompressedOops) {
    if (Universe::is_fully_initialized()) {
      mov(rheapbase, Universe::narrow_ptrs_base());
    } else {
      lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
      ldr(rheapbase, Address(rheapbase));
    }
  }
}

// this simulates the behaviour of the x86 cmpxchg instruction using a
// load linked/store conditional pair. we use the acquire/release
// versions of these instructions so that we flush pending writes as
// per Java semantics.

// n.b. the x86 version assumes the old value to be compared against is
// in rax and updates rax with the value located in memory if the
// cmpxchg fails. we supply a register for the old value explicitly

// the aarch64 load linked/store conditional instructions do not
// accept an offset. so, unlike x86, we must provide a plain register
// to identify the memory word to be compared/exchanged rather than a
// register+offset Address.

void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
                                Label &succeed, Label *fail) {
  // oldv holds comparison value
  // newv holds value to write in exchange
  // addr identifies memory word to compare against/update
  if (UseLSE) {
    mov(tmp, oldv);
    casal(Assembler::xword, oldv, newv, addr);
    cmp(tmp, oldv);
    br(Assembler::EQ, succeed);
    membar(AnyAny);
  } else {
    Label retry_load, nope;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    // flush and load exclusive from the memory location
    // and fail if it is not what we expect
    ldaxr(tmp, addr);
    cmp(tmp, oldv);
    br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
    stlxr(tmp, newv, addr);
    cbzw(tmp, succeed);
    // retry so we only ever return after a load fails to compare
    // ensures we don't return a stale value after a failed write.
    b(retry_load);
    // if the memory word differs we return it in oldv and signal a fail
    bind(nope);
    membar(AnyAny);
    mov(oldv, tmp);
  }
  if (fail)
    b(*fail);
}

void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
                              Label &succeed, Label *fail) {
  // oldv holds comparison value
  // newv holds value to write in exchange
  // addr identifies memory word to compare against/update
  // tmp returns 0/1 for success/failure
  if (UseLSE) {
    mov(tmp, oldv);
    casal(Assembler::word, oldv, newv, addr);
    cmp(tmp, oldv);
    br(Assembler::EQ, succeed);
    membar(AnyAny);
  } else {
    Label retry_load, nope;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    // flush and load exclusive from the memory location
    // and fail if it is not what we expect
    ldaxrw(tmp, addr);
    cmp(tmp, oldv);
    br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
    stlxrw(tmp, newv, addr);
    cbzw(tmp, succeed);
    // retry so we only ever return after a load fails to compare
    // ensures we don't return a stale value after a failed write.
    b(retry_load);
    // if the memory word differs we return it in oldv and signal a fail
    bind(nope);
    membar(AnyAny);
    mov(oldv, tmp);
  }
  if (fail)
    b(*fail);
}

// A generic CAS; success or failure is in the EQ flag.  A weak CAS
// doesn't retry and may fail spuriously.  If the oldval is wanted,
// pass a register for the result, otherwise pass noreg.

// Clobbers rscratch1
void MacroAssembler::cmpxchg(Register addr, Register expected,
                             Register new_val,
                             enum operand_size size,
                             bool acquire, bool release,
                             bool weak,
                             Register result) {
  if (result == noreg)  result = rscratch1;
  if (UseLSE) {
    mov(result, expected);
    lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
    cmp(result, expected);
  } else {
    BLOCK_COMMENT("cmpxchg {");
    Label retry_load, done;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    load_exclusive(result, addr, size, acquire);
    if (size == xword)
      cmp(result, expected);
    else
      cmpw(result, expected);
    br(Assembler::NE, done);
    store_exclusive(rscratch1, new_val, addr, size, release);
    if (weak) {
      cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
    } else {
      cbnzw(rscratch1, retry_load);
    }
    bind(done);
    BLOCK_COMMENT("} cmpxchg");
  }
}
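
// Usage sketch (illustrative, not original commentary): callers test
// the EQ flag after the CAS, e.g.
//   cmpxchg(addr, expected, new_val, xword,
//           /*acquire*/ true, /*release*/ true, /*weak*/ false, noreg);
//   br(Assembler::EQ, swapped);  // taken iff the swap succeeded
// With weak == true a spurious store-exclusive failure also reports
// NE, so the caller must be prepared to loop.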

void MacroAssembler::cmpxchg_oop_shenandoah(Register addr, Register expected,
                                            Register new_val,
                                            enum operand_size size,
                                            bool acquire, bool release,
                                            bool weak,
                                            Register result, Register tmp2) {
  assert(UseShenandoahGC, "only for shenandoah");
  bool is_cae = (result != noreg);
  bool is_narrow = (size == word);

  if (! is_cae) result = rscratch1;

  assert_different_registers(addr, expected, new_val, result, tmp2);

  if (ShenandoahStoreCheck) {
    if (is_narrow) {
      decode_heap_oop(tmp2, new_val);
      shenandoah_store_check(addr, tmp2);
    } else {
      shenandoah_store_check(addr, new_val);
    }
  }
  Label retry, done, fail;

  // CAS, using LL/SC pair.
  bind(retry);
  load_exclusive(result, addr, size, acquire);
  if (is_narrow) {
    cmpw(result, expected);
  } else {
    cmp(result, expected);
  }
  br(Assembler::NE, fail);
  store_exclusive(tmp2, new_val, addr, size, release);
  if (weak) {
    cmpw(tmp2, 0u);  // If the store fails, return NE to our caller
  } else {
    cbnzw(tmp2, retry);
  }
  b(done);

  bind(fail);
  // Check if rb(expected) == rb(result)
  // Shuffle registers so that we have memory value ready for next expected.
  mov(tmp2, expected);
  mov(expected, result);
  if (is_narrow) {
    decode_heap_oop(result, result);
    decode_heap_oop(tmp2, tmp2);
  }
  oopDesc::bs()->interpreter_read_barrier(this, result);
  oopDesc::bs()->interpreter_read_barrier(this, tmp2);
  cmp(result, tmp2);
  // Retry with expected now being the value we just loaded from addr.
  br(Assembler::EQ, retry);
  if (is_narrow && is_cae) {
    // For cmp-and-exchange and narrow oops, we need to restore
    // the compressed old-value. We moved it to 'expected' a few lines up.
    mov(result, expected);
  }
  bind(done);
}

static bool different(Register a, RegisterOrConstant b, Register c) {
  if (b.is_constant())
    return a != c;
  else
    return a != b.as_register() && a != c && b.as_register() != c;
}

#define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
  if (UseLSE) {                                                         \
    prev = prev->is_valid() ? prev : zr;                                \
    if (incr.is_register()) {                                           \
      AOP(sz, incr.as_register(), prev, addr);                          \
    } else {                                                            \
      mov(rscratch2, incr.as_constant());                               \
      AOP(sz, rscratch2, prev, addr);                                   \
    }                                                                   \
    return;                                                             \
  }                                                                     \
  Register result = rscratch2;                                          \
  if (prev->is_valid())                                                 \
    result = different(prev, incr, addr) ? prev : rscratch2;            \
                                                                        \
  Label retry_load;                                                     \
  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
    prfm(Address(addr), PSTL1STRM);                                     \
  bind(retry_load);                                                     \
  LDXR(result, addr);                                                   \
  OP(rscratch1, result, incr);                                          \
  STXR(rscratch2, rscratch1, addr);                                     \
  cbnzw(rscratch2, retry_load);                                         \
  if (prev->is_valid() && prev != result) {                             \
    IOP(prev, rscratch1, incr);                                         \
  }                                                                     \
}

ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)

#undef ATOMIC_OP
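
// Illustrative expansion (a sketch, not original commentary):
// ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword) defines
// atomic_add(prev, incr, addr), an LL/SC loop of the shape
//   retry: ldxr  result, [addr]
//          add   rscratch1, result, incr
//          stxr  rscratch2, rscratch1, [addr]
//          cbnzw rscratch2, retry
// where prev (if valid) ends up holding the value before the add; the
// IOP step recomputes it as rscratch1 - incr when prev could not serve
// as the load destination because it overlapped incr or addr.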

#define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
  if (UseLSE) {                                                         \
    prev = prev->is_valid() ? prev : zr;                                \
    AOP(sz, newv, prev, addr);                                          \
    return;                                                             \
  }                                                                     \
  Register result = rscratch2;                                          \
  if (prev->is_valid())                                                 \
    result = different(prev, newv, addr) ? prev : rscratch2;            \
                                                                        \
  Label retry_load;                                                     \
  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
    prfm(Address(addr), PSTL1STRM);                                     \
  bind(retry_load);                                                     \
  LDXR(result, addr);                                                   \
  STXR(rscratch1, newv, addr);                                          \
  cbnzw(rscratch1, retry_load);                                         \
  if (prev->is_valid() && prev != result)                               \
    mov(prev, result);                                                  \
}

ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)

#undef ATOMIC_XCHG

void MacroAssembler::incr_allocated_bytes(Register thread,
                                          Register var_size_in_bytes,
                                          int con_size_in_bytes,
                                          Register t1) {
  if (!thread->is_valid()) {
    thread = rthread;
  }
  assert(t1->is_valid(), "need temp reg");

  ldr(t1, Address(thread, in_bytes(JavaThread::allocated_bytes_offset())));
  if (var_size_in_bytes->is_valid()) {
    add(t1, t1, var_size_in_bytes);
  } else {
    add(t1, t1, con_size_in_bytes);
  }
  str(t1, Address(thread, in_bytes(JavaThread::allocated_bytes_offset())));
}

#ifndef PRODUCT
extern "C" void findpc(intptr_t x);
#endif

void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
{
  // In order to get locks to work, we need to fake an in_VM state
  if (ShowMessageBoxOnError) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
#ifndef PRODUCT
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
#endif
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      ttyLocker ttyl;
      tty->print_cr(" pc = 0x%016lx", pc);
#ifndef PRODUCT
      tty->cr();
      findpc(pc);
      tty->cr();
#endif
      tty->print_cr(" r0 = 0x%016lx", regs[0]);
      tty->print_cr(" r1 = 0x%016lx", regs[1]);
      tty->print_cr(" r2 = 0x%016lx", regs[2]);
      tty->print_cr(" r3 = 0x%016lx", regs[3]);
      tty->print_cr(" r4 = 0x%016lx", regs[4]);
      tty->print_cr(" r5 = 0x%016lx", regs[5]);
      tty->print_cr(" r6 = 0x%016lx", regs[6]);
      tty->print_cr(" r7 = 0x%016lx", regs[7]);
      tty->print_cr(" r8 = 0x%016lx", regs[8]);
      tty->print_cr(" r9 = 0x%016lx", regs[9]);
      tty->print_cr("r10 = 0x%016lx", regs[10]);
      tty->print_cr("r11 = 0x%016lx", regs[11]);
      tty->print_cr("r12 = 0x%016lx", regs[12]);
      tty->print_cr("r13 = 0x%016lx", regs[13]);
      tty->print_cr("r14 = 0x%016lx", regs[14]);
      tty->print_cr("r15 = 0x%016lx", regs[15]);
      tty->print_cr("r16 = 0x%016lx", regs[16]);
      tty->print_cr("r17 = 0x%016lx", regs[17]);
      tty->print_cr("r18 = 0x%016lx", regs[18]);
      tty->print_cr("r19 = 0x%016lx", regs[19]);
      tty->print_cr("r20 = 0x%016lx", regs[20]);
      tty->print_cr("r21 = 0x%016lx", regs[21]);
      tty->print_cr("r22 = 0x%016lx", regs[22]);
      tty->print_cr("r23 = 0x%016lx", regs[23]);
      tty->print_cr("r24 = 0x%016lx", regs[24]);
      tty->print_cr("r25 = 0x%016lx", regs[25]);
      tty->print_cr("r26 = 0x%016lx", regs[26]);
      tty->print_cr("r27 = 0x%016lx", regs[27]);
      tty->print_cr("r28 = 0x%016lx", regs[28]);
      tty->print_cr("r30 = 0x%016lx", regs[30]);
      tty->print_cr("r31 = 0x%016lx", regs[31]);
      BREAKPOINT;
    }
    ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
  } else {
    ttyLocker ttyl;
    ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
                    msg);
    assert(false, "DEBUG MESSAGE: %s", msg);
  }
}

#ifdef BUILTIN_SIM
// routine to generate an x86 prolog for a stub function which
// bootstraps into the generated ARM code which directly follows the
// stub
//
// the argument encodes the number of general and fp registers
// passed by the caller and the calling convention (currently just
// the number of general registers and assumes C argument passing)

extern "C" {
  int aarch64_stub_prolog_size();
  void aarch64_stub_prolog();
  void aarch64_prolog();
}

void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type,
                                   address *prolog_ptr)
{
  int calltype = (((ret_type & 0x3) << 8) |
                  ((fp_arg_count & 0xf) << 4) |
                  (gp_arg_count & 0xf));

  // the addresses for the x86 to ARM entry code we need to use
  address start = pc();
  // printf("start = %lx\n", start);
  int byteCount = aarch64_stub_prolog_size();
  // printf("byteCount = %x\n", byteCount);
  int instructionCount = (byteCount + 3) / 4;
  // printf("instructionCount = %x\n", instructionCount);
  for (int i = 0; i < instructionCount; i++) {
    nop();
  }

  memcpy(start, (void*)aarch64_stub_prolog, byteCount);

  // write the address of the setup routine and the call format at the
  // end of the copied code
  u_int64_t *patch_end = (u_int64_t *)(start + byteCount);
  if (prolog_ptr)
    patch_end[-2] = (u_int64_t)prolog_ptr;
  patch_end[-1] = calltype;
}
#endif

void MacroAssembler::push_call_clobbered_fp_registers() {
  // Push v0-v7, v16-v31.
  for (int i = 30; i >= 0; i -= 2) {
    if (i <= v7->encoding() || i >= v16->encoding()) {
      stpd(as_FloatRegister(i), as_FloatRegister(i+1),
           Address(pre(sp, -2 * wordSize)));
    }
  }
}

void MacroAssembler::pop_call_clobbered_fp_registers() {
  for (int i = 0; i < 32; i += 2) {
    if (i <= v7->encoding() || i >= v16->encoding()) {
      ldpd(as_FloatRegister(i), as_FloatRegister(i+1),
           Address(post(sp, 2 * wordSize)));
    }
  }
}

void MacroAssembler::push_call_clobbered_registers() {
  push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);

  push_call_clobbered_fp_registers();
}

void MacroAssembler::pop_call_clobbered_registers() {
  pop_call_clobbered_fp_registers();

  pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
}

void MacroAssembler::push_CPU_state(bool save_vectors) {
  push(0x3fffffff, sp);         // integer registers except lr & sp

  if (!save_vectors) {
    for (int i = 30; i >= 0; i -= 2)
      stpd(as_FloatRegister(i), as_FloatRegister(i+1),
           Address(pre(sp, -2 * wordSize)));
  } else {
    for (int i = 30; i >= 0; i -= 2)
      stpq(as_FloatRegister(i), as_FloatRegister(i+1),
           Address(pre(sp, -4 * wordSize)));
  }
}

void MacroAssembler::pop_CPU_state(bool restore_vectors) {
  if (!restore_vectors) {
    for (int i = 0; i < 32; i += 2)
      ldpd(as_FloatRegister(i), as_FloatRegister(i+1),
           Address(post(sp, 2 * wordSize)));
  } else {
    for (int i = 0; i < 32; i += 2)
      ldpq(as_FloatRegister(i), as_FloatRegister(i+1),
           Address(post(sp, 4 * wordSize)));
  }

  pop(0x3fffffff, sp);          // integer registers except lr & sp
}

/**
 * Helpers for multiply_to_len().
 */
void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
                                     Register src1, Register src2) {
  adds(dest_lo, dest_lo, src1);
  adc(dest_hi, dest_hi, zr);
  adds(dest_lo, dest_lo, src2);
  adc(final_dest_hi, dest_hi, zr);
}

// Generate an address from (r + r1 extend offset).  "size" is the
// size of the operand.  The result may be in rscratch2.
Address MacroAssembler::offsetted_address(Register r, Register r1,
                                          Address::extend ext, int offset, int size) {
  if (offset || (ext.shift() % size != 0)) {
    lea(rscratch2, Address(r, r1, ext));
    return Address(rscratch2, offset);
  } else {
    return Address(r, r1, ext);
  }
}

Address MacroAssembler::spill_address(int size, int offset, Register tmp)
{
  assert(offset >= 0, "spill to negative address?");
  // Offset reachable ?
  //   Not aligned - 9 bits signed offset
  //   Aligned - 12 bits unsigned offset shifted
  Register base = sp;
  if ((offset & (size-1)) && offset >= (1<<8)) {
    add(tmp, base, offset & ((1<<12)-1));
    base = tmp;
    offset &= -1<<12;
  }

  if (offset >= (1<<12) * size) {
    add(tmp, base, offset & (((1<<12)-1)<<12));
    base = tmp;
    offset &= ~(((1<<12)-1)<<12);
  }

  return Address(base, offset);
}
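
// Worked example (illustrative, not original commentary): for an
// 8-byte spill at offset 0x12345 the offset is misaligned and too
// large for either addressing form, so the slot is reached in stages:
//   add(tmp, sp, 0x345);     // peel off the low 12 bits
//   add(tmp, tmp, 0x12000);  // then the next 12 bits
// leaving Address(tmp, 0), which trivially fits the immediate forms.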

/**
 * Multiply 64 bit by 64 bit first loop.
 */
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
                                           Register y, Register y_idx, Register z,
                                           Register carry, Register product,
                                           Register idx, Register kdx) {
  //
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;
  //

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_one_x);

  lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
  ldr(x_xstart, Address(rscratch1));
  ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian

  bind(L_first_loop);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_first_loop_exit);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_one_y);
  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(y_idx, Address(rscratch1));
  ror(y_idx, y_idx, 32); // convert big-endian to little-endian
  bind(L_multiply);

  // AArch64 has a multiply-accumulate instruction that we can't use
  // here because it has no way to process carries, so we have to use
  // separate add and adc instructions.  Bah.
  umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
  mul(product, x_xstart, y_idx);
  adds(product, product, carry);
  adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product

  subw(kdx, kdx, 2);
  ror(product, product, 32); // back to big-endian
  str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));

  b(L_first_loop);

  bind(L_one_y);
  ldrw(y_idx, Address(y, 0));
  b(L_multiply);

  bind(L_one_x);
  ldrw(x_xstart, Address(x, 0));
  b(L_first_loop);

  bind(L_first_loop_exit);
}

/**
 * Multiply 128 bit by 128 bit. Unrolled inner loop.
 *
 */
void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
                                             Register carry, Register carry2,
                                             Register idx, Register jdx,
                                             Register yz_idx1, Register yz_idx2,
                                             Register tmp, Register tmp3, Register tmp4,
                                             Register tmp6, Register product_hi) {

  //   jlong carry, x[], y[], z[];
  //   int kdx = ystart+1;
  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
  //     jlong carry2 = (jlong)(tmp3 >>> 64);
  //     huge_128 tmp4 = (y[idx] * product_hi) + z[kdx+idx] + carry2;
  //     carry = (jlong)(tmp4 >>> 64);
  //     z[kdx+idx+1] = (jlong)tmp3;
  //     z[kdx+idx] = (jlong)tmp4;
  //   }
  //   idx += 2;
  //   if (idx > 0) {
  //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
  //     z[kdx+idx] = (jlong)yz_idx1;
  //     carry = (jlong)(yz_idx1 >>> 64);
  //   }
  //

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;

  lsrw(jdx, idx, 2);

  bind(L_third_loop);

  subsw(jdx, jdx, 1);
  br(Assembler::MI, L_third_loop_exit);
  subw(idx, idx, 4);

  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));

  ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));

  lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
  ror(yz_idx2, yz_idx2, 32);

  ldp(rscratch2, rscratch1, Address(tmp6, 0));

  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
  umulh(tmp4, product_hi, yz_idx1);

  ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
  ror(rscratch2, rscratch2, 32);

  mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
  umulh(carry2, product_hi, yz_idx2);

  // propagate sum of both multiplications into carry:tmp4:tmp3
  adds(tmp3, tmp3, carry);
  adc(tmp4, tmp4, zr);
  adds(tmp3, tmp3, rscratch1);
  adcs(tmp4, tmp4, tmp);
  adc(carry, carry2, zr);
  adds(tmp4, tmp4, rscratch2);
  adc(carry, carry, zr);

  ror(tmp3, tmp3, 32); // convert little-endian to big-endian
  ror(tmp4, tmp4, 32);
  stp(tmp4, tmp3, Address(tmp6, 0));

  b(L_third_loop);
  bind(L_third_loop_exit);

  andw(idx, idx, 0x3);
  cbz(idx, L_post_third_loop_done);

  Label L_check_1;
  subsw(idx, idx, 2);
  br(Assembler::MI, L_check_1);

  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(yz_idx1, Address(rscratch1, 0));
  ror(yz_idx1, yz_idx1, 32);
  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
  umulh(tmp4, product_hi, yz_idx1);
  lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
  ldr(yz_idx2, Address(rscratch1, 0));
  ror(yz_idx2, yz_idx2, 32);

  add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);

  ror(tmp3, tmp3, 32);
  str(tmp3, Address(rscratch1, 0));

  bind(L_check_1);

  andw(idx, idx, 0x1);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_post_third_loop_done);
  ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
  umulh(carry2, tmp4, product_hi);
  ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  add2_with_carry(carry2, tmp3, tmp4, carry);

  strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
  extr(carry, carry2, tmp3, 32);

  bind(L_post_third_loop_done);
}

/**
 * Code for BigInteger::multiplyToLen() intrinsic.
 *
 * r0: x
 * r1: xlen
 * r2: y
 * r3: ylen
 * r4: z
 * r5: zlen
 * r10: tmp1
 * r11: tmp2
 * r12: tmp3
 * r13: tmp4
 * r14: tmp5
 * r15: tmp6
 * r16: tmp7
 *
 */
void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6, Register product_hi) {

  assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product = xlen;
  const Register x_xstart = zlen;  // reuse register

  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;
  //

  movw(idx, ylen);      // idx = ylen;
  movw(kdx, zlen);      // kdx = xlen+ylen;
  mov(carry, zr);       // carry = 0;

  Label L_done;

  movw(xstart, xlen);
  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_done);

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);

  Label L_second_loop;
  cbzw(kdx, L_second_loop);

  Label L_carry;
  subw(kdx, kdx, 1);
  cbzw(kdx, L_carry);

  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
  lsr(carry, carry, 32);
  subw(kdx, kdx, 1);

  bind(L_carry);
  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));

  // Second and third (nested) loops.
  //
  // for (int i = xstart-1; i >= 0; i--) { // Second loop
  //   carry = 0;
  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                    (z[k] & LONG_MASK) + carry;
  //     z[k] = (int)product;
  //     carry = product >>> 32;
  //   }
  //   z[i] = (int)carry;
  // }
  //
  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi

  const Register jdx = tmp1;

  bind(L_second_loop);
  mov(carry, zr);   // carry = 0;
  movw(jdx, ylen);  // j = ystart+1

  subsw(xstart, xstart, 1); // i = xstart-1;
  br(Assembler::MI, L_done);

  str(z, Address(pre(sp, -4 * wordSize)));

  Label L_last_x;
  lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
  subsw(xstart, xstart, 1); // i = xstart-1;
  br(Assembler::MI, L_last_x);

  lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
  ldr(product_hi, Address(rscratch1));
  ror(product_hi, product_hi, 32);  // convert big-endian to little-endian

  Label L_third_loop_prologue;
  bind(L_third_loop_prologue);

  str(ylen, Address(sp, wordSize));
  stp(x, xstart, Address(sp, 2 * wordSize));
  multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
                          tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
  ldp(z, ylen, Address(post(sp, 2 * wordSize)));
  ldp(x, xlen, Address(post(sp, 2 * wordSize)));  // copy old xstart -> xlen

  addw(tmp3, xlen, 1);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  subsw(tmp3, tmp3, 1);
  br(Assembler::MI, L_done);

  lsr(carry, carry, 32);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  b(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  ldrw(product_hi, Address(x, 0));
  b(L_third_loop_prologue);

  bind(L_done);
}

/**
 * Emits code to update CRC-32 with a byte value according to constants in table
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]val       Register containing the byte to fold into the CRC.
 * @param [in]table     Register containing the table of crc constants.
 *
 * uint32_t crc;
 *   val = crc_table[(val ^ crc) & 0xFF];
 *   crc = val ^ (crc >> 8);
 *
 */
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  eor(val, val, crc);
  andr(val, val, 0xff);
  ldrw(val, Address(table, val, Address::lsl(2)));
  eor(crc, val, crc, Assembler::LSR, 8);
}
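
// Note (a sketch, not original commentary): this is one step of the
// classic table-driven (Sarwate) CRC: the eor/andr pair forms the
// table index (val ^ crc) & 0xff, ldrw fetches crc_table[index], and
// the final eor folds it with crc >> 8 using the shifted-operand form,
// so the whole update is four instructions with no branches.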

/**
 * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]v         Register containing the 32-bit to fold into the CRC.
 * @param [in]table0    Register containing table 0 of crc constants.
 * @param [in]table1    Register containing table 1 of crc constants.
 * @param [in]table2    Register containing table 2 of crc constants.
 * @param [in]table3    Register containing table 3 of crc constants.
 *
 * uint32_t crc;
 *   v = crc ^ v
 *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
 *
 */
void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
        Register table0, Register table1, Register table2, Register table3,
        bool upper) {
  eor(v, crc, v, upper ? LSR : LSL, upper ? 32 : 0);
  uxtb(tmp, v);
  ldrw(crc, Address(table3, tmp, Address::lsl(2)));
  ubfx(tmp, v, 8, 8);
  ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
  ubfx(tmp, v, 16, 8);
  ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
  ubfx(tmp, v, 24, 8);
  ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
}
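
// Note (a sketch, not original commentary): this is the "slicing-by-4"
// refinement of the table method: four bytes are folded at once, with
// tableN holding the CRC of a byte followed by N zero bytes, so the
// four lookups can simply be XORed together instead of chaining four
// dependent byte updates. With upper == true the same routine folds
// the high word of a 64-bit load (v >> 32).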

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register that will contain address of CRC table
 * @param tmp   scratch register
 */
void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
        Register table0, Register table1, Register table2, Register table3,
        Register tmp, Register tmp2, Register tmp3) {
  Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
  unsigned long offset;

  ornw(crc, zr, crc);

  if (UseCRC32) {
    Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop;

    subs(len, len, 64);
    br(Assembler::GE, CRC_by64_loop);
    adds(len, len, 64-4);
    br(Assembler::GE, CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, CRC_by1_loop);
    b(L_exit);

  BIND(CRC_by4_loop);
    ldrw(tmp, Address(post(buf, 4)));
    subs(len, len, 4);
    crc32w(crc, crc, tmp);
    br(Assembler::GE, CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::LE, L_exit);
  BIND(CRC_by1_loop);
    ldrb(tmp, Address(post(buf, 1)));
    subs(len, len, 1);
    crc32b(crc, crc, tmp);
    br(Assembler::GT, CRC_by1_loop);
    b(L_exit);

    align(CodeEntryAlignment);
  BIND(CRC_by64_loop);
    subs(len, len, 64);
    ldp(tmp, tmp3, Address(post(buf, 16)));
    crc32x(crc, crc, tmp);
    crc32x(crc, crc, tmp3);
    ldp(tmp, tmp3, Address(post(buf, 16)));
    crc32x(crc, crc, tmp);
    crc32x(crc, crc, tmp3);
    ldp(tmp, tmp3, Address(post(buf, 16)));
    crc32x(crc, crc, tmp);
    crc32x(crc, crc, tmp3);
    ldp(tmp, tmp3, Address(post(buf, 16)));
    crc32x(crc, crc, tmp);
    crc32x(crc, crc, tmp3);
    br(Assembler::GE, CRC_by64_loop);
    adds(len, len, 64-4);
    br(Assembler::GE, CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, CRC_by1_loop);
  BIND(L_exit);
    ornw(crc, zr, crc);
    return;
  }

  adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
  if (offset) add(table0, table0, offset);
  add(table1, table0, 1*256*sizeof(juint));
  add(table2, table0, 2*256*sizeof(juint));
  add(table3, table0, 3*256*sizeof(juint));

  if (UseNeon) {
    cmp(len, 64);
    br(Assembler::LT, L_by16);
    eor(v16, T16B, v16, v16);

    Label L_fold;

    add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants

    ld1(v0, v1, T2D, post(buf, 32));
    ld1r(v4, T2D, post(tmp, 8));
    ld1r(v5, T2D, post(tmp, 8));
    ld1r(v6, T2D, post(tmp, 8));
    ld1r(v7, T2D, post(tmp, 8));
    mov(v16, T4S, 0, crc);

    eor(v0, T16B, v0, v16);
    sub(len, len, 64);

  BIND(L_fold);
    pmull(v22, T8H, v0, v5, T8B);
    pmull(v20, T8H, v0, v7, T8B);
    pmull(v23, T8H, v0, v4, T8B);
    pmull(v21, T8H, v0, v6, T8B);

    pmull2(v18, T8H, v0, v5, T16B);
    pmull2(v16, T8H, v0, v7, T16B);
    pmull2(v19, T8H, v0, v4, T16B);
    pmull2(v17, T8H, v0, v6, T16B);

    uzp1(v24, v20, v22, T8H);
    uzp2(v25, v20, v22, T8H);
    eor(v20, T16B, v24, v25);

    uzp1(v26, v16, v18, T8H);
    uzp2(v27, v16, v18, T8H);
    eor(v16, T16B, v26, v27);

    ushll2(v22, T4S, v20, T8H, 8);
    ushll(v20, T4S, v20, T4H, 8);

    ushll2(v18, T4S, v16, T8H, 8);
    ushll(v16, T4S, v16, T4H, 8);

    eor(v22, T16B, v23, v22);
    eor(v18, T16B, v19, v18);
    eor(v20, T16B, v21, v20);
    eor(v16, T16B, v17, v16);

    uzp1(v17, v16, v20, T2D);
    uzp2(v21, v16, v20, T2D);
    eor(v17, T16B, v17, v21);

    ushll2(v20, T2D, v17, T4S, 16);
    ushll(v16, T2D, v17, T2S, 16);

    eor(v20, T16B, v20, v22);
    eor(v16, T16B, v16, v18);

    uzp1(v17, v20, v16, T2D);
    uzp2(v21, v20, v16, T2D);
    eor(v28, T16B, v17, v21);

    pmull(v22, T8H, v1, v5, T8B);
    pmull(v20, T8H, v1, v7, T8B);
    pmull(v23, T8H, v1, v4, T8B);
    pmull(v21, T8H, v1, v6, T8B);

    pmull2(v18, T8H, v1, v5, T16B);
    pmull2(v16, T8H, v1, v7, T16B);
    pmull2(v19, T8H, v1, v4, T16B);
    pmull2(v17, T8H, v1, v6, T16B);

    ld1(v0, v1, T2D, post(buf, 32));

    uzp1(v24, v20, v22, T8H);
    uzp2(v25, v20, v22, T8H);
    eor(v20, T16B, v24, v25);

    uzp1(v26, v16, v18, T8H);
    uzp2(v27, v16, v18, T8H);
    eor(v16, T16B, v26, v27);

    ushll2(v22, T4S, v20, T8H, 8);
    ushll(v20, T4S, v20, T4H, 8);

    ushll2(v18, T4S, v16, T8H, 8);
    ushll(v16, T4S, v16, T4H, 8);

    eor(v22, T16B, v23, v22);
    eor(v18, T16B, v19, v18);
    eor(v20, T16B, v21, v20);
    eor(v16, T16B, v17, v16);

    uzp1(v17, v16, v20, T2D);
    uzp2(v21, v16, v20, T2D);
    eor(v16, T16B, v17, v21);

    ushll2(v20, T2D, v16, T4S, 16);
    ushll(v16, T2D, v16, T2S, 16);

    eor(v20, T16B, v22, v20);
    eor(v16, T16B, v16, v18);

    uzp1(v17, v20, v16, T2D);
    uzp2(v21, v20, v16, T2D);
    eor(v20, T16B, v17, v21);

    shl(v16, T2D, v28, 1);
    shl(v17, T2D, v20, 1);

    eor(v0, T16B, v0, v16);
    eor(v1, T16B, v1, v17);

    subs(len, len, 32);
    br(Assembler::GE, L_fold);

    mov(crc, 0);
    mov(tmp, v0, T1D, 0);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
    mov(tmp, v0, T1D, 1);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
    mov(tmp, v1, T1D, 0);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
    mov(tmp, v1, T1D, 1);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);

    add(len, len, 32);
  }

  BIND(L_by16);
    subs(len, len, 16);
    br(Assembler::GE, L_by16_loop);
    adds(len, len, 16-4);
    br(Assembler::GE, L_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, L_by1_loop);
    b(L_exit);

  BIND(L_by4_loop);
    ldrw(tmp, Address(post(buf, 4)));
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
    subs(len, len, 4);
    br(Assembler::GE, L_by4_loop);
    adds(len, len, 4);
    br(Assembler::LE, L_exit);
  BIND(L_by1_loop);
    subs(len, len, 1);
    ldrb(tmp, Address(post(buf, 1)));
    update_byte_crc32(crc, tmp, table0);
    br(Assembler::GT, L_by1_loop);
    b(L_exit);

    align(CodeEntryAlignment);
  BIND(L_by16_loop);
    subs(len, len, 16);
    ldp(tmp, tmp3, Address(post(buf, 16)));
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
    update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
    br(Assembler::GE, L_by16_loop);
    adds(len, len, 16-4);
    br(Assembler::GE, L_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, L_by1_loop);
  BIND(L_exit);
    ornw(crc, zr, crc);
}
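
// Note (a sketch, not original commentary): kernel_crc32 computes the
// zlib CRC-32 (polynomial 0x04C11DB7, hence crc32b/w/x), while
// kernel_crc32c below computes CRC-32C (Castagnoli, 0x1EDC6F87) with
// the crc32cb/cw/cx forms. The ornw at entry and exit of kernel_crc32
// implements the standard pre/post inversion of the running CRC;
// kernel_crc32c omits it, leaving that conditioning to its callers.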

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register that will contain address of CRC table
 * @param tmp   scratch register
 */
void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
        Register table0, Register table1, Register table2, Register table3,
        Register tmp, Register tmp2, Register tmp3) {
  Label L_exit;
  Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop;

  subs(len, len, 64);
  br(Assembler::GE, CRC_by64_loop);
  adds(len, len, 64-4);
  br(Assembler::GE, CRC_by4_loop);
  adds(len, len, 4);
  br(Assembler::GT, CRC_by1_loop);
  b(L_exit);

BIND(CRC_by4_loop);
  ldrw(tmp, Address(post(buf, 4)));
  subs(len, len, 4);
  crc32cw(crc, crc, tmp);
  br(Assembler::GE, CRC_by4_loop);
  adds(len, len, 4);
  br(Assembler::LE, L_exit);
BIND(CRC_by1_loop);
  ldrb(tmp, Address(post(buf, 1)));
  subs(len, len, 1);
  crc32cb(crc, crc, tmp);
  br(Assembler::GT, CRC_by1_loop);
  b(L_exit);

  align(CodeEntryAlignment);
BIND(CRC_by64_loop);
  subs(len, len, 64);
  ldp(tmp, tmp3, Address(post(buf, 16)));
  crc32cx(crc, crc, tmp);
  crc32cx(crc, crc, tmp3);
  ldp(tmp, tmp3, Address(post(buf, 16)));
  crc32cx(crc, crc, tmp);
  crc32cx(crc, crc, tmp3);
  ldp(tmp, tmp3, Address(post(buf, 16)));
  crc32cx(crc, crc, tmp);
  crc32cx(crc, crc, tmp3);
  ldp(tmp, tmp3, Address(post(buf, 16)));
  crc32cx(crc, crc, tmp);
  crc32cx(crc, crc, tmp3);
  br(Assembler::GE, CRC_by64_loop);
  adds(len, len, 64-4);
  br(Assembler::GE, CRC_by4_loop);
  adds(len, len, 4);
  br(Assembler::GT, CRC_by1_loop);
BIND(L_exit);
  return;
}

SkipIfEqual::SkipIfEqual(
    MacroAssembler* masm, const bool* flag_addr, bool value) {
  _masm = masm;
  unsigned long offset;
  _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
  _masm->ldrb(rscratch1, Address(rscratch1, offset));
  _masm->cbzw(rscratch1, _label);
}

SkipIfEqual::~SkipIfEqual() {
  _masm->bind(_label);
}
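
// Usage sketch (illustrative, not original commentary): the guard is
// scoped, e.g.
//   { SkipIfEqual skip(masm, &DTraceMethodProbes, false);
//     /* probe code emitted here */
//   }
// The constructor loads the flag byte and emits a cbzw over the body;
// the destructor binds the skip label. Note that in this version the
// `value` argument is not consulted: the guarded code is always
// skipped when the flag byte is zero.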

void MacroAssembler::addptr(const Address &dst, int32_t src) {
  Address adr;
  switch(dst.getMode()) {
  case Address::base_plus_offset:
    // This is the expected mode, although we allow all the other
    // forms below.
    adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
    break;
  default:
    lea(rscratch2, dst);
    adr = Address(rscratch2);
    break;
  }
  ldr(rscratch1, adr);
  add(rscratch1, rscratch1, src);
  str(rscratch1, adr);
}

void MacroAssembler::cmpptr(Register src1, Address src2) {
  unsigned long offset;
  adrp(rscratch1, src2, offset);
  ldr(rscratch1, Address(rscratch1, offset));
  cmp(src1, rscratch1);
}

void MacroAssembler::store_check(Register obj, Address dst) {
  store_check(obj);
}

void MacroAssembler::store_check(Register obj) {
  // Does a store check for the oop in register obj. The content of
  // register obj is destroyed afterwards.

  BarrierSet* bs = Universe::heap()->barrier_set();
  assert(bs->kind() == BarrierSet::CardTableForRS ||
         bs->kind() == BarrierSet::CardTableExtension,
         "Wrong barrier set kind");

  CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

  lsr(obj, obj, CardTableModRefBS::card_shift);

  assert(CardTableModRefBS::dirty_card_val() == 0, "must be");

  load_byte_map_base(rscratch1);

  if (UseCondCardMark) {
    Label L_already_dirty;
    membar(StoreLoad);
    ldrb(rscratch2, Address(obj, rscratch1));
    cbz(rscratch2, L_already_dirty);
    strb(zr, Address(obj, rscratch1));
    bind(L_already_dirty);
  } else {
    if (UseConcMarkSweepGC && CMSPrecleaningEnabled) {
      membar(StoreStore);
    }
    strb(zr, Address(obj, rscratch1));
  }
}

void MacroAssembler::load_klass(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
    decode_klass_not_null(dst);
  } else {
    ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
  }
}

void MacroAssembler::load_mirror(Register dst, Register method) {
  const int mirror_offset = in_bytes(Klass::java_mirror_offset());
  ldr(dst, Address(rmethod, Method::const_offset()));
  ldr(dst, Address(dst, ConstMethod::constants_offset()));
  ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
  ldr(dst, Address(dst, mirror_offset));
}

void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
  if (UseCompressedClassPointers) {
    ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
    if (Universe::narrow_klass_base() == NULL) {
      cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift());
      return;
    } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
               && Universe::narrow_klass_shift() == 0) {
      // Only the bottom 32 bits matter
      cmpw(trial_klass, tmp);
      return;
    }
    decode_klass_not_null(tmp);
  } else {
    ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
  }
  cmp(trial_klass, tmp);
}

void MacroAssembler::load_prototype_header(Register dst, Register src) {
  load_klass(dst, src);
  ldr(dst, Address(dst, Klass::prototype_header_offset()));
}
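
// Illustrative sketch for store_check above (not original commentary):
// the card mark computes card = byte_map_base + (obj >> card_shift)
// and stores 0 (dirty_card_val). Assuming the usual 512-byte cards
// (card_shift == 9), an oop at 0x7f0012345678 dirties the byte at
// byte_map_base + 0x3f80091a2b.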
3382 if (UseCompressedClassPointers) { 3383 encode_klass_not_null(src); 3384 strw(src, Address(dst, oopDesc::klass_offset_in_bytes())); 3385 } else { 3386 str(src, Address(dst, oopDesc::klass_offset_in_bytes())); 3387 } 3388 } 3389 3390 void MacroAssembler::store_klass_gap(Register dst, Register src) { 3391 if (UseCompressedClassPointers) { 3392 // Store to klass gap in destination 3393 strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes())); 3394 } 3395 } 3396 3397 // Algorithm must match oop.inline.hpp encode_heap_oop. 3398 void MacroAssembler::encode_heap_oop(Register d, Register s) { 3399 #ifdef ASSERT 3400 verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?"); 3401 #endif 3402 verify_oop(s, "broken oop in encode_heap_oop"); 3403 if (Universe::narrow_oop_base() == NULL) { 3404 if (Universe::narrow_oop_shift() != 0) { 3405 assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3406 lsr(d, s, LogMinObjAlignmentInBytes); 3407 } else { 3408 mov(d, s); 3409 } 3410 } else { 3411 subs(d, s, rheapbase); 3412 csel(d, d, zr, Assembler::HS); 3413 lsr(d, d, LogMinObjAlignmentInBytes); 3414 3415 /* Old algorithm: is this any worse? 3416 Label nonnull; 3417 cbnz(r, nonnull); 3418 sub(r, r, rheapbase); 3419 bind(nonnull); 3420 lsr(r, r, LogMinObjAlignmentInBytes); 3421 */ 3422 } 3423 } 3424 3425 void MacroAssembler::encode_heap_oop_not_null(Register r) { 3426 #ifdef ASSERT 3427 verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?"); 3428 if (CheckCompressedOops) { 3429 Label ok; 3430 cbnz(r, ok); 3431 stop("null oop passed to encode_heap_oop_not_null"); 3432 bind(ok); 3433 } 3434 #endif 3435 verify_oop(r, "broken oop in encode_heap_oop_not_null"); 3436 if (Universe::narrow_oop_base() != NULL) { 3437 sub(r, r, rheapbase); 3438 } 3439 if (Universe::narrow_oop_shift() != 0) { 3440 assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3441 lsr(r, r, LogMinObjAlignmentInBytes); 3442 } 3443 } 3444 3445 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) { 3446 #ifdef ASSERT 3447 verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?"); 3448 if (CheckCompressedOops) { 3449 Label ok; 3450 cbnz(src, ok); 3451 stop("null oop passed to encode_heap_oop_not_null2"); 3452 bind(ok); 3453 } 3454 #endif 3455 verify_oop(src, "broken oop in encode_heap_oop_not_null2"); 3456 3457 Register data = src; 3458 if (Universe::narrow_oop_base() != NULL) { 3459 sub(dst, src, rheapbase); 3460 data = dst; 3461 } 3462 if (Universe::narrow_oop_shift() != 0) { 3463 assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3464 lsr(dst, data, LogMinObjAlignmentInBytes); 3465 data = dst; 3466 } 3467 if (data == src) 3468 mov(dst, src); 3469 } 3470 3471 void MacroAssembler::decode_heap_oop(Register d, Register s) { 3472 #ifdef ASSERT 3473 verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?"); 3474 #endif 3475 if (Universe::narrow_oop_base() == NULL) { 3476 if (Universe::narrow_oop_shift() != 0 || d != s) { 3477 lsl(d, s, Universe::narrow_oop_shift()); 3478 } 3479 } else { 3480 Label done; 3481 if (d != s) 3482 mov(d, s); 3483 cbz(s, done); 3484 add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes); 3485 bind(done); 3486 } 3487 verify_oop(d, "broken oop in decode_heap_oop"); 3488 } 3489 3490 void MacroAssembler::decode_heap_oop_not_null(Register r) { 3491 assert (UseCompressedOops, "should only be used for 
compressed headers"); 3492 assert (Universe::heap() != NULL, "java heap should be initialized"); 3493 // Cannot assert, unverified entry point counts instructions (see .ad file) 3494 // vtableStubs also counts instructions in pd_code_size_limit. 3495 // Also do not verify_oop as this is called by verify_oop. 3496 if (Universe::narrow_oop_shift() != 0) { 3497 assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3498 if (Universe::narrow_oop_base() != NULL) { 3499 add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes); 3500 } else { 3501 add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes); 3502 } 3503 } else { 3504 assert (Universe::narrow_oop_base() == NULL, "sanity"); 3505 } 3506 } 3507 3508 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) { 3509 assert (UseCompressedOops, "should only be used for compressed headers"); 3510 assert (Universe::heap() != NULL, "java heap should be initialized"); 3511 // Cannot assert, unverified entry point counts instructions (see .ad file) 3512 // vtableStubs also counts instructions in pd_code_size_limit. 3513 // Also do not verify_oop as this is called by verify_oop. 3514 if (Universe::narrow_oop_shift() != 0) { 3515 assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3516 if (Universe::narrow_oop_base() != NULL) { 3517 add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes); 3518 } else { 3519 add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes); 3520 } 3521 } else { 3522 assert (Universe::narrow_oop_base() == NULL, "sanity"); 3523 if (dst != src) { 3524 mov(dst, src); 3525 } 3526 } 3527 } 3528 3529 void MacroAssembler::encode_klass_not_null(Register dst, Register src) { 3530 if (Universe::narrow_klass_base() == NULL) { 3531 if (Universe::narrow_klass_shift() != 0) { 3532 assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); 3533 lsr(dst, src, LogKlassAlignmentInBytes); 3534 } else { 3535 if (dst != src) mov(dst, src); 3536 } 3537 return; 3538 } 3539 3540 if (use_XOR_for_compressed_class_base) { 3541 if (Universe::narrow_klass_shift() != 0) { 3542 eor(dst, src, (uint64_t)Universe::narrow_klass_base()); 3543 lsr(dst, dst, LogKlassAlignmentInBytes); 3544 } else { 3545 eor(dst, src, (uint64_t)Universe::narrow_klass_base()); 3546 } 3547 return; 3548 } 3549 3550 if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0 3551 && Universe::narrow_klass_shift() == 0) { 3552 movw(dst, src); 3553 return; 3554 } 3555 3556 #ifdef ASSERT 3557 verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?"); 3558 #endif 3559 3560 Register rbase = dst; 3561 if (dst == src) rbase = rheapbase; 3562 mov(rbase, (uint64_t)Universe::narrow_klass_base()); 3563 sub(dst, src, rbase); 3564 if (Universe::narrow_klass_shift() != 0) { 3565 assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); 3566 lsr(dst, dst, LogKlassAlignmentInBytes); 3567 } 3568 if (dst == src) reinit_heapbase(); 3569 } 3570 3571 void MacroAssembler::encode_klass_not_null(Register r) { 3572 encode_klass_not_null(r, r); 3573 } 3574 3575 void MacroAssembler::decode_klass_not_null(Register dst, Register src) { 3576 Register rbase = dst; 3577 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 3578 3579 if (Universe::narrow_klass_base() == NULL) { 3580 if (Universe::narrow_klass_shift() != 0) { 3581 assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), 
"decode alg wrong"); 3582 lsl(dst, src, LogKlassAlignmentInBytes); 3583 } else { 3584 if (dst != src) mov(dst, src); 3585 } 3586 return; 3587 } 3588 3589 if (use_XOR_for_compressed_class_base) { 3590 if (Universe::narrow_klass_shift() != 0) { 3591 lsl(dst, src, LogKlassAlignmentInBytes); 3592 eor(dst, dst, (uint64_t)Universe::narrow_klass_base()); 3593 } else { 3594 eor(dst, src, (uint64_t)Universe::narrow_klass_base()); 3595 } 3596 return; 3597 } 3598 3599 if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0 3600 && Universe::narrow_klass_shift() == 0) { 3601 if (dst != src) 3602 movw(dst, src); 3603 movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32); 3604 return; 3605 } 3606 3607 // Cannot assert, unverified entry point counts instructions (see .ad file) 3608 // vtableStubs also counts instructions in pd_code_size_limit. 3609 // Also do not verify_oop as this is called by verify_oop. 3610 if (dst == src) rbase = rheapbase; 3611 mov(rbase, (uint64_t)Universe::narrow_klass_base()); 3612 if (Universe::narrow_klass_shift() != 0) { 3613 assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); 3614 add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes); 3615 } else { 3616 add(dst, rbase, src); 3617 } 3618 if (dst == src) reinit_heapbase(); 3619 } 3620 3621 void MacroAssembler::decode_klass_not_null(Register r) { 3622 decode_klass_not_null(r, r); 3623 } 3624 3625 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) { 3626 assert (UseCompressedOops, "should only be used for compressed oops"); 3627 assert (Universe::heap() != NULL, "java heap should be initialized"); 3628 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 3629 3630 int oop_index = oop_recorder()->find_index(obj); 3631 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop"); 3632 3633 InstructionMark im(this); 3634 RelocationHolder rspec = oop_Relocation::spec(oop_index); 3635 code_section()->relocate(inst_mark(), rspec); 3636 movz(dst, 0xDEAD, 16); 3637 movk(dst, 0xBEEF); 3638 } 3639 3640 void MacroAssembler::set_narrow_klass(Register dst, Klass* k) { 3641 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 3642 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 3643 int index = oop_recorder()->find_index(k); 3644 assert(! Universe::heap()->is_in_reserved(k), "should not be an oop"); 3645 3646 InstructionMark im(this); 3647 RelocationHolder rspec = metadata_Relocation::spec(index); 3648 code_section()->relocate(inst_mark(), rspec); 3649 narrowKlass nk = Klass::encode_klass(k); 3650 movz(dst, (nk >> 16), 16); 3651 movk(dst, nk & 0xffff); 3652 } 3653 3654 void MacroAssembler::load_heap_oop(Register dst, Address src) 3655 { 3656 if (UseCompressedOops) { 3657 ldrw(dst, src); 3658 decode_heap_oop(dst); 3659 } else { 3660 ldr(dst, src); 3661 } 3662 } 3663 3664 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src) 3665 { 3666 if (UseCompressedOops) { 3667 ldrw(dst, src); 3668 decode_heap_oop_not_null(dst); 3669 } else { 3670 ldr(dst, src); 3671 } 3672 } 3673 3674 void MacroAssembler::store_heap_oop(Address dst, Register src) { 3675 if (UseCompressedOops) { 3676 assert(!dst.uses(src), "not enough registers"); 3677 encode_heap_oop(src); 3678 strw(src, dst); 3679 } else 3680 str(src, dst); 3681 } 3682 3683 // Used for storing NULLs. 
3684 void MacroAssembler::store_heap_oop_null(Address dst) {
3685 if (UseCompressedOops) {
3686 strw(zr, dst);
3687 } else
3688 str(zr, dst);
3689 }
3690
3691 #if INCLUDE_ALL_GCS
3692 void MacroAssembler::g1_write_barrier_pre(Register obj,
3693 Register pre_val,
3694 Register thread,
3695 Register tmp,
3696 bool tosca_live,
3697 bool expand_call) {
3698 // If expand_call is true then we expand the call_VM_leaf macro
3699 // directly to skip generating the check by
3700 // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp.
3701
3702 assert(thread == rthread, "must be");
3703
3704 Label done;
3705 Label runtime;
3706
3707 assert(pre_val != noreg, "check this code");
3708
3709 if (obj != noreg)
3710 assert_different_registers(obj, pre_val, tmp);
3711
3712 Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
3713 SATBMarkQueue::byte_offset_of_active()));
3714 Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
3715 SATBMarkQueue::byte_offset_of_index()));
3716 Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
3717 SATBMarkQueue::byte_offset_of_buf()));
3718
3719
3720 // Is marking active?
3721 if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) {
3722 ldrw(tmp, in_progress);
3723 } else {
3724 assert(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption");
3725 ldrb(tmp, in_progress);
3726 }
3727 cbzw(tmp, done);
3728
3729 // Do we need to load the previous value?
3730 if (obj != noreg) {
3731 load_heap_oop(pre_val, Address(obj, 0));
3732 }
3733
3734 // Is the previous value null?
3735 cbz(pre_val, done);
3736
3737 // Can we store original value in the thread's buffer?
3738 // Is index == 0?
3739 // (The index field is typed as size_t.)
3740
3741 ldr(tmp, index); // tmp := *index_adr
3742 cbz(tmp, runtime); // tmp == 0?
3743 // If yes, goto runtime
3744
3745 sub(tmp, tmp, wordSize); // tmp := tmp - wordSize
3746 str(tmp, index); // *index_adr := tmp
3747 ldr(rscratch1, buffer);
3748 add(tmp, tmp, rscratch1); // tmp := tmp + *buffer_adr
3749
3750 // Record the previous value
3751 str(pre_val, Address(tmp, 0));
3752 b(done);
3753
3754 bind(runtime);
3755 // save the live input values
3756 push(r0->bit(tosca_live) | obj->bit(obj != noreg) | pre_val->bit(true), sp);
3757
3758 // Calling the runtime using the regular call_VM_leaf mechanism generates
3759 // code (generated by InterpreterMacroAssembler::call_VM_leaf_base)
3760 // that checks that *(rfp+frame::interpreter_frame_last_sp) == NULL.
3761 //
3762 // If we are generating the pre-barrier without a frame (e.g. in the
3763 // intrinsified Reference.get() routine) then rfp might be pointing to
3764 // the caller frame and so this check will most likely fail at runtime.
3765 //
3766 // Expanding the call directly bypasses the generation of the check.
3767 // So when we do not have a full interpreter frame on the stack
3768 // expand_call should be passed true.
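// In rough C terms the enqueue attempted above is (illustrative only):
//   if (index == 0) goto runtime;        // SATB buffer is full
//   index -= wordSize;
//   *(oop*)(buf + index) = pre_val;
// and the two branches below differ only in how the runtime is entered.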
3769 3770 if (expand_call) { 3771 assert(pre_val != c_rarg1, "smashed arg"); 3772 pass_arg1(this, thread); 3773 pass_arg0(this, pre_val); 3774 MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), 2); 3775 } else { 3776 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread); 3777 } 3778 3779 pop(r0->bit(tosca_live) | obj->bit(obj != noreg) | pre_val->bit(true), sp); 3780 3781 bind(done); 3782 } 3783 3784 void MacroAssembler::g1_write_barrier_post(Register store_addr, 3785 Register new_val, 3786 Register thread, 3787 Register tmp, 3788 Register tmp2) { 3789 assert(thread == rthread, "must be"); 3790 3791 if (UseShenandoahGC) { 3792 // No need for this in Shenandoah. 3793 return; 3794 } 3795 3796 assert(UseG1GC, "expect G1 GC"); 3797 3798 Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() + 3799 DirtyCardQueue::byte_offset_of_index())); 3800 Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() + 3801 DirtyCardQueue::byte_offset_of_buf())); 3802 3803 BarrierSet* bs = Universe::heap()->barrier_set(); 3804 CardTableModRefBS* ct = (CardTableModRefBS*)bs; 3805 assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code"); 3806 3807 Label done; 3808 Label runtime; 3809 3810 // Does store cross heap regions? 3811 3812 eor(tmp, store_addr, new_val); 3813 lsr(tmp, tmp, HeapRegion::LogOfHRGrainBytes); 3814 cbz(tmp, done); 3815 3816 // crosses regions, storing NULL? 3817 3818 cbz(new_val, done); 3819 3820 // storing region crossing non-NULL, is card already dirty? 3821 3822 ExternalAddress cardtable((address) ct->byte_map_base); 3823 assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code"); 3824 const Register card_addr = tmp; 3825 3826 lsr(card_addr, store_addr, CardTableModRefBS::card_shift); 3827 3828 // get the address of the card 3829 load_byte_map_base(tmp2); 3830 add(card_addr, card_addr, tmp2); 3831 ldrb(tmp2, Address(card_addr)); 3832 cmpw(tmp2, (int)G1SATBCardTableModRefBS::g1_young_card_val()); 3833 br(Assembler::EQ, done); 3834 3835 assert((int)CardTableModRefBS::dirty_card_val() == 0, "must be 0"); 3836 3837 membar(Assembler::StoreLoad); 3838 3839 ldrb(tmp2, Address(card_addr)); 3840 cbzw(tmp2, done); 3841 3842 // storing a region crossing, non-NULL oop, card is clean. 3843 // dirty card and log. 3844 3845 strb(zr, Address(card_addr)); 3846 3847 ldr(rscratch1, queue_index); 3848 cbz(rscratch1, runtime); 3849 sub(rscratch1, rscratch1, wordSize); 3850 str(rscratch1, queue_index); 3851 3852 ldr(tmp2, buffer); 3853 str(card_addr, Address(tmp2, rscratch1)); 3854 b(done); 3855 3856 bind(runtime); 3857 // save the live input values 3858 push(store_addr->bit(true) | new_val->bit(true), sp); 3859 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread); 3860 pop(store_addr->bit(true) | new_val->bit(true), sp); 3861 3862 bind(done); 3863 } 3864 3865 void MacroAssembler::shenandoah_write_barrier(Register dst) { 3866 assert(UseShenandoahGC, "must only be called with Shenandoah GC active"); 3867 assert(dst != rscratch1, "need rscratch1"); 3868 assert(dst != rscratch2, "need rscratch2"); 3869 3870 Label done; 3871 3872 // Check for evacuation-in-progress 3873 Address evacuation_in_progress = Address(rthread, in_bytes(JavaThread::evacuation_in_progress_offset())); 3874 ldrb(rscratch1, evacuation_in_progress); 3875 membar(Assembler::LoadLoad); 3876 3877 // The read-barrier. 
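// The forwarding pointer sits in a word just before the object, so the
// load below is, in effect (sketch):
//   dst = *(oop*)((address)dst + BrooksPointer::byte_offset());
// replacing dst with its possibly forwarded copy.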
3878 ldr(dst, Address(dst, BrooksPointer::byte_offset()));
3879
3880 // Evac-check ...
3881 cbzw(rscratch1, done);
3882
3883 RegSet to_save = RegSet::of(r0);
3884 if (dst != r0) {
3885 push(to_save, sp);
3886 mov(r0, dst);
3887 }
3888
3889 assert(StubRoutines::aarch64::shenandoah_wb() != NULL, "need write barrier stub");
3890 far_call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::aarch64::shenandoah_wb())));
3891
3892 if (dst != r0) {
3893 mov(dst, r0);
3894 pop(to_save, sp);
3895 }
3896 block_comment("} Shenandoah write barrier");
3897
3898 bind(done);
3899 }
3900
3901 #endif // INCLUDE_ALL_GCS
3902
3903 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
3904 assert(oop_recorder() != NULL, "this assembler needs a Recorder");
3905 int index = oop_recorder()->allocate_metadata_index(obj);
3906 RelocationHolder rspec = metadata_Relocation::spec(index);
3907 return Address((address)obj, rspec);
3908 }
3909
3910 // Move an oop into a register. immediate is true if we want
3911 // immediate instructions, i.e. we are not going to patch this
3912 // instruction while the code is being executed by another thread. In
3913 // that case we can use move immediates rather than the constant pool.
3914 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
3915 int oop_index;
3916 if (obj == NULL) {
3917 oop_index = oop_recorder()->allocate_oop_index(obj);
3918 } else {
3919 oop_index = oop_recorder()->find_index(obj);
3920 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
3921 }
3922 RelocationHolder rspec = oop_Relocation::spec(oop_index);
3923 if (! immediate) {
3924 address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
3925 ldr_constant(dst, Address(dummy, rspec));
3926 } else
3927 mov(dst, Address((address)obj, rspec));
3928 }
3929
3930 // Move a metadata address into a register.
3931 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
3932 int oop_index;
3933 if (obj == NULL) {
3934 oop_index = oop_recorder()->allocate_metadata_index(obj);
3935 } else {
3936 oop_index = oop_recorder()->find_index(obj);
3937 }
3938 RelocationHolder rspec = metadata_Relocation::spec(oop_index);
3939 mov(dst, Address((address)obj, rspec));
3940 }
3941
3942 Address MacroAssembler::constant_oop_address(jobject obj) {
3943 assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
3944 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
3945 int oop_index = oop_recorder()->find_index(obj);
3946 return Address((address)obj, oop_Relocation::spec(oop_index));
3947 }
3948
3949 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
3950 void MacroAssembler::tlab_allocate(Register obj, 3951 Register var_size_in_bytes, 3952 int con_size_in_bytes, 3953 Register t1, 3954 Register t2, 3955 Label& slow_case) { 3956 assert_different_registers(obj, t2); 3957 assert_different_registers(obj, var_size_in_bytes); 3958 Register end = t2; 3959 3960 // verify_tlab(); 3961 3962 int oop_extra_words = Universe::heap()->oop_extra_words(); 3963 3964 ldr(obj, Address(rthread, JavaThread::tlab_top_offset())); 3965 if (var_size_in_bytes == noreg) { 3966 lea(end, Address(obj, con_size_in_bytes + oop_extra_words * HeapWordSize)); 3967 } else { 3968 if (oop_extra_words > 0) { 3969 add(var_size_in_bytes, var_size_in_bytes, oop_extra_words * HeapWordSize); 3970 } 3971 lea(end, Address(obj, var_size_in_bytes)); 3972 } 3973 ldr(rscratch1, Address(rthread, JavaThread::tlab_end_offset())); 3974 cmp(end, rscratch1); 3975 br(Assembler::HI, slow_case); 3976 3977 // update the tlab top pointer 3978 str(end, Address(rthread, JavaThread::tlab_top_offset())); 3979 3980 Universe::heap()->compile_prepare_oop(this, obj); 3981 3982 // recover var_size_in_bytes if necessary 3983 if (var_size_in_bytes == end) { 3984 sub(var_size_in_bytes, var_size_in_bytes, obj); 3985 } 3986 // verify_tlab(); 3987 } 3988 3989 // Preserves r19, and r3. 3990 Register MacroAssembler::tlab_refill(Label& retry, 3991 Label& try_eden, 3992 Label& slow_case) { 3993 Register top = r0; 3994 Register t1 = r2; 3995 Register t2 = r4; 3996 assert_different_registers(top, rthread, t1, t2, /* preserve: */ r19, r3); 3997 Label do_refill, discard_tlab; 3998 3999 if (!Universe::heap()->supports_inline_contig_alloc()) { 4000 // No allocation in the shared eden. 4001 b(slow_case); 4002 } 4003 4004 ldr(top, Address(rthread, in_bytes(JavaThread::tlab_top_offset()))); 4005 ldr(t1, Address(rthread, in_bytes(JavaThread::tlab_end_offset()))); 4006 4007 // calculate amount of free space 4008 sub(t1, t1, top); 4009 lsr(t1, t1, LogHeapWordSize); 4010 4011 // Retain tlab and allocate object in shared space if 4012 // the amount free in the tlab is too large to discard. 
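// The decision below is, roughly (sketch):
//   if (free <= refill_waste_limit) goto discard_tlab;  // refill the TLAB
//   refill_waste_limit += increment;  // keep the TLAB, note the wasted space
//   goto try_eden;                    // and allocate this object in eden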
4013 4014 ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset()))); 4015 cmp(t1, rscratch1); 4016 br(Assembler::LE, discard_tlab); 4017 4018 // Retain 4019 // ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset()))); 4020 mov(t2, (int32_t) ThreadLocalAllocBuffer::refill_waste_limit_increment()); 4021 add(rscratch1, rscratch1, t2); 4022 str(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset()))); 4023 4024 if (TLABStats) { 4025 // increment number of slow_allocations 4026 addmw(Address(rthread, in_bytes(JavaThread::tlab_slow_allocations_offset())), 4027 1, rscratch1); 4028 } 4029 b(try_eden); 4030 4031 bind(discard_tlab); 4032 if (TLABStats) { 4033 // increment number of refills 4034 addmw(Address(rthread, in_bytes(JavaThread::tlab_number_of_refills_offset())), 1, 4035 rscratch1); 4036 // accumulate wastage -- t1 is amount free in tlab 4037 addmw(Address(rthread, in_bytes(JavaThread::tlab_fast_refill_waste_offset())), t1, 4038 rscratch1); 4039 } 4040 4041 // if tlab is currently allocated (top or end != null) then 4042 // fill [top, end + alignment_reserve) with array object 4043 cbz(top, do_refill); 4044 4045 // set up the mark word 4046 mov(rscratch1, (intptr_t)markOopDesc::prototype()->copy_set_hash(0x2)); 4047 str(rscratch1, Address(top, oopDesc::mark_offset_in_bytes())); 4048 // set the length to the remaining space 4049 sub(t1, t1, typeArrayOopDesc::header_size(T_INT)); 4050 add(t1, t1, (int32_t)ThreadLocalAllocBuffer::alignment_reserve()); 4051 lsl(t1, t1, log2_intptr(HeapWordSize/sizeof(jint))); 4052 strw(t1, Address(top, arrayOopDesc::length_offset_in_bytes())); 4053 // set klass to intArrayKlass 4054 { 4055 unsigned long offset; 4056 // dubious reloc why not an oop reloc? 4057 adrp(rscratch1, ExternalAddress((address)Universe::intArrayKlassObj_addr()), 4058 offset); 4059 ldr(t1, Address(rscratch1, offset)); 4060 } 4061 // store klass last. concurrent gcs assumes klass length is valid if 4062 // klass field is not null. 4063 store_klass(top, t1); 4064 4065 mov(t1, top); 4066 ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset()))); 4067 sub(t1, t1, rscratch1); 4068 incr_allocated_bytes(rthread, t1, 0, rscratch1); 4069 4070 // refill the tlab with an eden allocation 4071 bind(do_refill); 4072 ldr(t1, Address(rthread, in_bytes(JavaThread::tlab_size_offset()))); 4073 lsl(t1, t1, LogHeapWordSize); 4074 // allocate new tlab, address returned in top 4075 eden_allocate(top, t1, 0, t2, slow_case); 4076 4077 // Check that t1 was preserved in eden_allocate. 4078 #ifdef ASSERT 4079 if (UseTLAB) { 4080 Label ok; 4081 Register tsize = r4; 4082 assert_different_registers(tsize, rthread, t1); 4083 str(tsize, Address(pre(sp, -16))); 4084 ldr(tsize, Address(rthread, in_bytes(JavaThread::tlab_size_offset()))); 4085 lsl(tsize, tsize, LogHeapWordSize); 4086 cmp(t1, tsize); 4087 br(Assembler::EQ, ok); 4088 STOP("assert(t1 != tlab size)"); 4089 should_not_reach_here(); 4090 4091 bind(ok); 4092 ldr(tsize, Address(post(sp, 16))); 4093 } 4094 #endif 4095 str(top, Address(rthread, in_bytes(JavaThread::tlab_start_offset()))); 4096 str(top, Address(rthread, in_bytes(JavaThread::tlab_top_offset()))); 4097 add(top, top, t1); 4098 sub(top, top, (int32_t)ThreadLocalAllocBuffer::alignment_reserve_in_bytes()); 4099 str(top, Address(rthread, in_bytes(JavaThread::tlab_end_offset()))); 4100 4101 if (ZeroTLAB) { 4102 // This is a fast TLAB refill, therefore the GC is not notified of it. 
4103 // So compiled code must fill the new TLAB with zeroes.
4104 ldr(top, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4105 zero_memory(top, t1, t2);
4106 }
4107
4108 verify_tlab();
4109 b(retry);
4110
4111 return rthread; // for use by caller
4112 }
4113
4114 // Zero words; len is in bytes
4115 // Destroys all registers except addr
4116 // len must be a nonzero multiple of wordSize
4117 void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
4118 assert_different_registers(addr, len, t1, rscratch1, rscratch2);
4119
4120 #ifdef ASSERT
4121 { Label L;
4122 tst(len, BytesPerWord - 1);
4123 br(Assembler::EQ, L);
4124 stop("len is not a multiple of BytesPerWord");
4125 bind(L);
4126 }
4127 #endif
4128
4129 #ifndef PRODUCT
4130 block_comment("zero memory");
4131 #endif
4132
4133 Label loop;
4134 Label entry;
4135
4136 // Algorithm:
4137 //
4138 // scratch1 = cnt & 7;
4139 // cnt -= scratch1;
4140 // p += scratch1;
4141 // switch (scratch1) {
4142 // do {
4143 // cnt -= 8;
4144 // p[-8] = 0;
4145 // case 7:
4146 // p[-7] = 0;
4147 // case 6:
4148 // p[-6] = 0;
4149 // // ...
4150 // case 1:
4151 // p[-1] = 0;
4152 // case 0:
4153 // p += 8;
4154 // } while (cnt);
4155 // }
4156
4157 const int unroll = 8; // Number of str(zr) instructions we'll unroll
4158
4159 lsr(len, len, LogBytesPerWord);
4160 andr(rscratch1, len, unroll - 1); // tmp1 = cnt % unroll
4161 sub(len, len, rscratch1); // cnt -= tmp1
4162 // t1 always points to the end of the region we're about to zero
4163 add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
4164 adr(rscratch2, entry);
4165 sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
4166 br(rscratch2);
4167 bind(loop);
4168 sub(len, len, unroll);
4169 for (int i = -unroll; i < 0; i++)
4170 str(zr, Address(t1, i * wordSize));
4171 bind(entry);
4172 add(t1, t1, unroll * wordSize);
4173 cbnz(len, loop);
4174 }
4175
4176 // Defines obj, preserves var_size_in_bytes
4177 void MacroAssembler::eden_allocate(Register obj,
4178 Register var_size_in_bytes,
4179 int con_size_in_bytes,
4180 Register t1,
4181 Label& slow_case) {
4182 assert_different_registers(obj, var_size_in_bytes, t1);
4183 if (!Universe::heap()->supports_inline_contig_alloc()) {
4184 b(slow_case);
4185 } else {
4186 Register end = t1;
4187 Register heap_end = rscratch2;
4188 Label retry;
4189 bind(retry);
4190 {
4191 unsigned long offset;
4192 adrp(rscratch1, ExternalAddress((address) Universe::heap()->end_addr()), offset);
4193 ldr(heap_end, Address(rscratch1, offset));
4194 }
4195
4196 ExternalAddress heap_top((address) Universe::heap()->top_addr());
4197
4198 // Get the current top of the heap
4199 {
4200 unsigned long offset;
4201 adrp(rscratch1, heap_top, offset);
4202 // Use add() here after ADRP, rather than lea().
4203 // lea() does not generate anything if its offset is zero.
4204 // However, relocs expect to find either an ADD or a load/store
4205 // insn after an ADRP. add() always generates an ADD insn, even
4206 // for add(Rn, Rn, 0).
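// The pair emitted here therefore always has the shape (sketch):
//   adrp rscratch1, <page of heap top address>
//   add  rscratch1, rscratch1, #<offset within page>  // possibly #0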
4207 add(rscratch1, rscratch1, offset);
4208 ldaxr(obj, rscratch1);
4209 }
4210
4211 // Adjust it by the size of our new object
4212 if (var_size_in_bytes == noreg) {
4213 lea(end, Address(obj, con_size_in_bytes));
4214 } else {
4215 lea(end, Address(obj, var_size_in_bytes));
4216 }
4217
4218 // if end < obj then we wrapped around high memory
4219 cmp(end, obj);
4220 br(Assembler::LO, slow_case);
4221
4222 cmp(end, heap_end);
4223 br(Assembler::HI, slow_case);
4224
4225 // If heap_top hasn't been changed by some other thread, update it.
4226 stlxr(rscratch2, end, rscratch1);
4227 cbnzw(rscratch2, retry);
4228 }
4229 }
4230
4231 void MacroAssembler::verify_tlab() {
4232 #ifdef ASSERT
4233 if (UseTLAB && VerifyOops) {
4234 Label next, ok;
4235
4236 stp(rscratch2, rscratch1, Address(pre(sp, -16)));
4237
4238 ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4239 ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4240 cmp(rscratch2, rscratch1);
4241 br(Assembler::HS, next);
4242 STOP("assert(top >= start)");
4243 should_not_reach_here();
4244
4245 bind(next);
4246 ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4247 ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4248 cmp(rscratch2, rscratch1);
4249 br(Assembler::HS, ok);
4250 STOP("assert(top <= end)");
4251 should_not_reach_here();
4252
4253 bind(ok);
4254 ldp(rscratch2, rscratch1, Address(post(sp, 16)));
4255 }
4256 #endif
4257 }
4258
4259 // Writes to successive stack pages until the offset is reached, to check
4260 // for stack overflow + shadow pages. This clobbers tmp.
4261 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4262 assert_different_registers(tmp, size, rscratch1);
4263 mov(tmp, sp);
4264 // Bang stack for total size given plus shadow page size.
4265 // Bang one page at a time because large size can bang beyond yellow and
4266 // red zones.
4267 Label loop;
4268 mov(rscratch1, os::vm_page_size());
4269 bind(loop);
4270 lea(tmp, Address(tmp, -os::vm_page_size()));
4271 subsw(size, size, rscratch1);
4272 str(size, Address(tmp));
4273 br(Assembler::GT, loop);
4274
4275 // Bang down shadow pages too.
4276 // At this point, (tmp-0) is the last address touched, so don't
4277 // touch it again. (It was touched as (tmp-pagesize) but then tmp
4278 // was post-decremented.) Skip this address by starting at i=1, and
4279 // touch a few more pages below. N.B. It is important to touch all
4280 // the way down to and including i=StackShadowPages.
4281 for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
4282 // this could be any sized move but it can be a debugging crumb
4283 // so the bigger the better.
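// Each iteration below is simply (sketch): tmp -= page_size; *tmp = size;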
4284 lea(tmp, Address(tmp, -os::vm_page_size())); 4285 str(size, Address(tmp)); 4286 } 4287 } 4288 4289 4290 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) { 4291 unsigned long off; 4292 adrp(r, Address(page, rtype), off); 4293 InstructionMark im(this); 4294 code_section()->relocate(inst_mark(), rtype); 4295 ldrw(zr, Address(r, off)); 4296 return inst_mark(); 4297 } 4298 4299 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) { 4300 InstructionMark im(this); 4301 code_section()->relocate(inst_mark(), rtype); 4302 ldrw(zr, Address(r, 0)); 4303 return inst_mark(); 4304 } 4305 4306 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) { 4307 relocInfo::relocType rtype = dest.rspec().reloc()->type(); 4308 unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12; 4309 unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12; 4310 unsigned long dest_page = (unsigned long)dest.target() >> 12; 4311 long offset_low = dest_page - low_page; 4312 long offset_high = dest_page - high_page; 4313 4314 assert(is_valid_AArch64_address(dest.target()), "bad address"); 4315 assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address"); 4316 4317 InstructionMark im(this); 4318 code_section()->relocate(inst_mark(), dest.rspec()); 4319 // 8143067: Ensure that the adrp can reach the dest from anywhere within 4320 // the code cache so that if it is relocated we know it will still reach 4321 if (offset_high >= -(1<<20) && offset_low < (1<<20)) { 4322 _adrp(reg1, dest.target()); 4323 } else { 4324 unsigned long target = (unsigned long)dest.target(); 4325 unsigned long adrp_target 4326 = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL); 4327 4328 _adrp(reg1, (address)adrp_target); 4329 movk(reg1, target >> 32, 32); 4330 } 4331 byte_offset = (unsigned long)dest.target() & 0xfff; 4332 } 4333 4334 void MacroAssembler::load_byte_map_base(Register reg) { 4335 jbyte *byte_map_base = 4336 ((CardTableModRefBS*)(Universe::heap()->barrier_set()))->byte_map_base; 4337 4338 if (is_valid_AArch64_address((address)byte_map_base)) { 4339 // Strictly speaking the byte_map_base isn't an address at all, 4340 // and it might even be negative. 4341 unsigned long offset; 4342 adrp(reg, ExternalAddress((address)byte_map_base), offset); 4343 // We expect offset to be zero with most collectors. 
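// When it is not, the add below folds the low 12 bits into reg so that
// callers can mark a card with a plain (sketch):
//   strb(zr, Address(card_index, reg));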
4344 if (offset != 0) {
4345 add(reg, reg, offset);
4346 }
4347 } else {
4348 mov(reg, (uint64_t)byte_map_base);
4349 }
4350 }
4351
4352 void MacroAssembler::build_frame(int framesize) {
4353 assert(framesize > 0, "framesize must be > 0");
4354 if (framesize < ((1 << 9) + 2 * wordSize)) {
4355 sub(sp, sp, framesize);
4356 stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4357 if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
4358 } else {
4359 stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4360 if (PreserveFramePointer) mov(rfp, sp);
4361 if (framesize < ((1 << 12) + 2 * wordSize))
4362 sub(sp, sp, framesize - 2 * wordSize);
4363 else {
4364 mov(rscratch1, framesize - 2 * wordSize);
4365 sub(sp, sp, rscratch1);
4366 }
4367 }
4368 }
4369
4370 void MacroAssembler::remove_frame(int framesize) {
4371 assert(framesize > 0, "framesize must be > 0");
4372 if (framesize < ((1 << 9) + 2 * wordSize)) {
4373 ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4374 add(sp, sp, framesize);
4375 } else {
4376 if (framesize < ((1 << 12) + 2 * wordSize))
4377 add(sp, sp, framesize - 2 * wordSize);
4378 else {
4379 mov(rscratch1, framesize - 2 * wordSize);
4380 add(sp, sp, rscratch1);
4381 }
4382 ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4383 }
4384 }
4385
4386 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4387
4388 // Search for str1 in str2 and return index or -1
4389 void MacroAssembler::string_indexof(Register str2, Register str1,
4390 Register cnt2, Register cnt1,
4391 Register tmp1, Register tmp2,
4392 Register tmp3, Register tmp4,
4393 int icnt1, Register result, int ae) {
4394 Label BM, LINEARSEARCH, DONE, NOMATCH, MATCH;
4395
4396 Register ch1 = rscratch1;
4397 Register ch2 = rscratch2;
4398 Register cnt1tmp = tmp1;
4399 Register cnt2tmp = tmp2;
4400 Register cnt1_neg = cnt1;
4401 Register cnt2_neg = cnt2;
4402 Register result_tmp = tmp4;
4403
4404 bool isL = ae == StrIntrinsicNode::LL;
4405
4406 bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
4407 bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
4408 int str1_chr_shift = str1_isL ? 0:1;
4409 int str2_chr_shift = str2_isL ? 0:1;
4410 int str1_chr_size = str1_isL ? 1:2;
4411 int str2_chr_size = str2_isL ? 1:2;
4412 chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4413 (chr_insn)&MacroAssembler::ldrh;
4414 chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4415 (chr_insn)&MacroAssembler::ldrh;
4416 chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
4417 chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
4418
4419 // Note, inline_string_indexOf() generates checks:
4420 // if (substr.count > string.count) return -1;
4421 // if (substr.count == 0) return 0;
4422
4423 // We have two strings, a source string in str2, cnt2 and a pattern string
4424 // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
4425
4426 // For larger pattern and source we use a simplified Boyer-Moore algorithm.
4427 // With a small pattern and source we use linear scan.
4428
4429 if (icnt1 == -1) {
4430 cmp(cnt1, 256); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
4431 ccmp(cnt1, 8, 0b0000, LO); // Can't handle skip >= 256 because we use
4432 br(LO, LINEARSEARCH); // a byte array.
4433 cmp(cnt1, cnt2, LSR, 2); // Source must be 4 * pattern for BM
4434 br(HS, LINEARSEARCH);
4435 }
4436
4437 // The Boyer-Moore algorithm is based on the description here:-
4438 //
4439 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
4440 //
4441 // This describes an algorithm with 2 shift rules: the 'Bad Character' rule
4442 // and the 'Good Suffix' rule.
4443 //
4444 // These rules are essentially heuristics for how far we can shift the
4445 // pattern along the search string.
4446 //
4447 // The implementation here uses the 'Bad Character' rule only because of the
4448 // complexity of initialisation for the 'Good Suffix' rule.
4449 //
4450 // This is also known as the Boyer-Moore-Horspool algorithm:-
4451 //
4452 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
4453 //
4454 // #define ASIZE 128
4455 //
4456 // int bm(unsigned char *x, int m, unsigned char *y, int n) {
4457 // int i, j;
4458 // unsigned c;
4459 // unsigned char bc[ASIZE];
4460 //
4461 // /* Preprocessing */
4462 // for (i = 0; i < ASIZE; ++i)
4463 // bc[i] = 0;
4464 // for (i = 0; i < m - 1; ) {
4465 // c = x[i];
4466 // ++i;
4467 // if (c < ASIZE) bc[c] = i;
4468 // }
4469 //
4470 // /* Searching */
4471 // j = 0;
4472 // while (j <= n - m) {
4473 // c = y[j+m-1];
4474 // if (x[m-1] == c)
4475 // for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
4476 // if (i < 0) return j;
4477 // if (c < ASIZE)
4478 // j = j - bc[y[j+m-1]] + m;
4479 // else
4480 // j += 1; // Advance by 1 only if char >= ASIZE
4481 // }
4482 // }
4483
4484 if (icnt1 == -1) {
4485 BIND(BM);
4486
4487 Label ZLOOP, BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP;
4488 Label BMADV, BMMATCH, BMCHECKEND;
4489
4490 Register cnt1end = tmp2;
4491 Register str2end = cnt2;
4492 Register skipch = tmp2;
4493
4494 // Restrict ASIZE to 128 to reduce stack space/initialisation.
4495 // The presence of chars >= ASIZE in the target string does not affect
4496 // performance, but we must be careful not to initialise them in the stack
4497 // array.
4498 // The presence of chars >= ASIZE in the source string may adversely affect
4499 // performance since we can only advance by one when we encounter one.
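// The stp sequence below allocates and clears that table on the stack,
// 16 bytes at a time; in C terms (sketch, matching the pseudocode above):
//   unsigned char bc[ASIZE] = {0};  // ASIZE == 128, sp-relative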
4500 4501 stp(zr, zr, pre(sp, -128)); 4502 for (int i = 1; i < 8; i++) 4503 stp(zr, zr, Address(sp, i*16)); 4504 4505 mov(cnt1tmp, 0); 4506 sub(cnt1end, cnt1, 1); 4507 BIND(BCLOOP); 4508 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 4509 cmp(ch1, 128); 4510 add(cnt1tmp, cnt1tmp, 1); 4511 br(HS, BCSKIP); 4512 strb(cnt1tmp, Address(sp, ch1)); 4513 BIND(BCSKIP); 4514 cmp(cnt1tmp, cnt1end); 4515 br(LT, BCLOOP); 4516 4517 mov(result_tmp, str2); 4518 4519 sub(cnt2, cnt2, cnt1); 4520 add(str2end, str2, cnt2, LSL, str2_chr_shift); 4521 BIND(BMLOOPSTR2); 4522 sub(cnt1tmp, cnt1, 1); 4523 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 4524 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 4525 cmp(ch1, skipch); 4526 br(NE, BMSKIP); 4527 subs(cnt1tmp, cnt1tmp, 1); 4528 br(LT, BMMATCH); 4529 BIND(BMLOOPSTR1); 4530 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 4531 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 4532 cmp(ch1, ch2); 4533 br(NE, BMSKIP); 4534 subs(cnt1tmp, cnt1tmp, 1); 4535 br(GE, BMLOOPSTR1); 4536 BIND(BMMATCH); 4537 sub(result, str2, result_tmp); 4538 if (!str2_isL) lsr(result, result, 1); 4539 add(sp, sp, 128); 4540 b(DONE); 4541 BIND(BMADV); 4542 add(str2, str2, str2_chr_size); 4543 b(BMCHECKEND); 4544 BIND(BMSKIP); 4545 cmp(skipch, 128); 4546 br(HS, BMADV); 4547 ldrb(ch2, Address(sp, skipch)); 4548 add(str2, str2, cnt1, LSL, str2_chr_shift); 4549 sub(str2, str2, ch2, LSL, str2_chr_shift); 4550 BIND(BMCHECKEND); 4551 cmp(str2, str2end); 4552 br(LE, BMLOOPSTR2); 4553 add(sp, sp, 128); 4554 b(NOMATCH); 4555 } 4556 4557 BIND(LINEARSEARCH); 4558 { 4559 Label DO1, DO2, DO3; 4560 4561 Register str2tmp = tmp2; 4562 Register first = tmp3; 4563 4564 if (icnt1 == -1) 4565 { 4566 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT; 4567 4568 cmp(cnt1, str1_isL == str2_isL ? 
4 : 2); 4569 br(LT, DOSHORT); 4570 4571 sub(cnt2, cnt2, cnt1); 4572 mov(result_tmp, cnt2); 4573 4574 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift))); 4575 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 4576 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift); 4577 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 4578 (this->*str1_load_1chr)(first, Address(str1, cnt1_neg)); 4579 4580 BIND(FIRST_LOOP); 4581 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 4582 cmp(first, ch2); 4583 br(EQ, STR1_LOOP); 4584 BIND(STR2_NEXT); 4585 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4586 br(LE, FIRST_LOOP); 4587 b(NOMATCH); 4588 4589 BIND(STR1_LOOP); 4590 adds(cnt1tmp, cnt1_neg, str1_chr_size); 4591 add(cnt2tmp, cnt2_neg, str2_chr_size); 4592 br(GE, MATCH); 4593 4594 BIND(STR1_NEXT); 4595 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp)); 4596 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 4597 cmp(ch1, ch2); 4598 br(NE, STR2_NEXT); 4599 adds(cnt1tmp, cnt1tmp, str1_chr_size); 4600 add(cnt2tmp, cnt2tmp, str2_chr_size); 4601 br(LT, STR1_NEXT); 4602 b(MATCH); 4603 4604 BIND(DOSHORT); 4605 if (str1_isL == str2_isL) { 4606 cmp(cnt1, 2); 4607 br(LT, DO1); 4608 br(GT, DO3); 4609 } 4610 } 4611 4612 if (icnt1 == 4) { 4613 Label CH1_LOOP; 4614 4615 (this->*load_4chr)(ch1, str1); 4616 sub(cnt2, cnt2, 4); 4617 mov(result_tmp, cnt2); 4618 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 4619 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 4620 4621 BIND(CH1_LOOP); 4622 (this->*load_4chr)(ch2, Address(str2, cnt2_neg)); 4623 cmp(ch1, ch2); 4624 br(EQ, MATCH); 4625 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4626 br(LE, CH1_LOOP); 4627 b(NOMATCH); 4628 } 4629 4630 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) { 4631 Label CH1_LOOP; 4632 4633 BIND(DO2); 4634 (this->*load_2chr)(ch1, str1); 4635 sub(cnt2, cnt2, 2); 4636 mov(result_tmp, cnt2); 4637 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 4638 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 4639 4640 BIND(CH1_LOOP); 4641 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 4642 cmp(ch1, ch2); 4643 br(EQ, MATCH); 4644 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4645 br(LE, CH1_LOOP); 4646 b(NOMATCH); 4647 } 4648 4649 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) { 4650 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP; 4651 4652 BIND(DO3); 4653 (this->*load_2chr)(first, str1); 4654 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size)); 4655 4656 sub(cnt2, cnt2, 3); 4657 mov(result_tmp, cnt2); 4658 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 4659 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 4660 4661 BIND(FIRST_LOOP); 4662 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 4663 cmpw(first, ch2); 4664 br(EQ, STR1_LOOP); 4665 BIND(STR2_NEXT); 4666 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4667 br(LE, FIRST_LOOP); 4668 b(NOMATCH); 4669 4670 BIND(STR1_LOOP); 4671 add(cnt2tmp, cnt2_neg, 2*str2_chr_size); 4672 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 4673 cmp(ch1, ch2); 4674 br(NE, STR2_NEXT); 4675 b(MATCH); 4676 } 4677 4678 if (icnt1 == -1 || icnt1 == 1) { 4679 Label CH1_LOOP, HAS_ZERO; 4680 Label DO1_SHORT, DO1_LOOP; 4681 4682 BIND(DO1); 4683 (this->*str1_load_1chr)(ch1, str1); 4684 cmp(cnt2, 8); 4685 br(LT, DO1_SHORT); 4686 4687 if (str2_isL) { 4688 if (!str1_isL) { 4689 tst(ch1, 0xff00); 4690 br(NE, NOMATCH); 4691 } 4692 orr(ch1, ch1, ch1, LSL, 8); 4693 } 4694 orr(ch1, ch1, ch1, LSL, 16); 4695 orr(ch1, ch1, ch1, LSL, 32); 4696 4697 sub(cnt2, cnt2, 8/str2_chr_size); 4698 
mov(result_tmp, cnt2); 4699 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 4700 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 4701 4702 mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 4703 BIND(CH1_LOOP); 4704 ldr(ch2, Address(str2, cnt2_neg)); 4705 eor(ch2, ch1, ch2); 4706 sub(tmp1, ch2, tmp3); 4707 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4708 bics(tmp1, tmp1, tmp2); 4709 br(NE, HAS_ZERO); 4710 adds(cnt2_neg, cnt2_neg, 8); 4711 br(LT, CH1_LOOP); 4712 4713 cmp(cnt2_neg, 8); 4714 mov(cnt2_neg, 0); 4715 br(LT, CH1_LOOP); 4716 b(NOMATCH); 4717 4718 BIND(HAS_ZERO); 4719 rev(tmp1, tmp1); 4720 clz(tmp1, tmp1); 4721 add(cnt2_neg, cnt2_neg, tmp1, LSR, 3); 4722 b(MATCH); 4723 4724 BIND(DO1_SHORT); 4725 mov(result_tmp, cnt2); 4726 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 4727 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 4728 BIND(DO1_LOOP); 4729 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 4730 cmpw(ch1, ch2); 4731 br(EQ, MATCH); 4732 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4733 br(LT, DO1_LOOP); 4734 } 4735 } 4736 BIND(NOMATCH); 4737 mov(result, -1); 4738 b(DONE); 4739 BIND(MATCH); 4740 add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift); 4741 BIND(DONE); 4742 } 4743 4744 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 4745 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn); 4746 4747 void MacroAssembler::string_indexof_char(Register str1, Register cnt1, 4748 Register ch, Register result, 4749 Register tmp1, Register tmp2, Register tmp3) 4750 { 4751 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 4752 Register cnt1_neg = cnt1; 4753 Register ch1 = rscratch1; 4754 Register result_tmp = rscratch2; 4755 4756 cmp(cnt1, 4); 4757 br(LT, DO1_SHORT); 4758 4759 orr(ch, ch, ch, LSL, 16); 4760 orr(ch, ch, ch, LSL, 32); 4761 4762 sub(cnt1, cnt1, 4); 4763 mov(result_tmp, cnt1); 4764 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 4765 sub(cnt1_neg, zr, cnt1, LSL, 1); 4766 4767 mov(tmp3, 0x0001000100010001); 4768 4769 BIND(CH1_LOOP); 4770 ldr(ch1, Address(str1, cnt1_neg)); 4771 eor(ch1, ch, ch1); 4772 sub(tmp1, ch1, tmp3); 4773 orr(tmp2, ch1, 0x7fff7fff7fff7fff); 4774 bics(tmp1, tmp1, tmp2); 4775 br(NE, HAS_ZERO); 4776 adds(cnt1_neg, cnt1_neg, 8); 4777 br(LT, CH1_LOOP); 4778 4779 cmp(cnt1_neg, 8); 4780 mov(cnt1_neg, 0); 4781 br(LT, CH1_LOOP); 4782 b(NOMATCH); 4783 4784 BIND(HAS_ZERO); 4785 rev(tmp1, tmp1); 4786 clz(tmp1, tmp1); 4787 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 4788 b(MATCH); 4789 4790 BIND(DO1_SHORT); 4791 mov(result_tmp, cnt1); 4792 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 4793 sub(cnt1_neg, zr, cnt1, LSL, 1); 4794 BIND(DO1_LOOP); 4795 ldrh(ch1, Address(str1, cnt1_neg)); 4796 cmpw(ch, ch1); 4797 br(EQ, MATCH); 4798 adds(cnt1_neg, cnt1_neg, 2); 4799 br(LT, DO1_LOOP); 4800 BIND(NOMATCH); 4801 mov(result, -1); 4802 b(DONE); 4803 BIND(MATCH); 4804 add(result, result_tmp, cnt1_neg, ASR, 1); 4805 BIND(DONE); 4806 } 4807 4808 // Compare strings. 
4809 void MacroAssembler::string_compare(Register str1, Register str2,
4810 Register cnt1, Register cnt2, Register result,
4811 Register tmp1,
4812 FloatRegister vtmp, FloatRegister vtmpZ, int ae) {
4813 Label LENGTH_DIFF, DONE, SHORT_LOOP, SHORT_STRING,
4814 NEXT_WORD, DIFFERENCE;
4815
4816 bool isLL = ae == StrIntrinsicNode::LL;
4817 bool isLU = ae == StrIntrinsicNode::LU;
4818 bool isUL = ae == StrIntrinsicNode::UL;
4819
4820 bool str1_isL = isLL || isLU;
4821 bool str2_isL = isLL || isUL;
4822
4823 int str1_chr_shift = str1_isL ? 0 : 1;
4824 int str2_chr_shift = str2_isL ? 0 : 1;
4825 int str1_chr_size = str1_isL ? 1 : 2;
4826 int str2_chr_size = str2_isL ? 1 : 2;
4827
4828 chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4829 (chr_insn)&MacroAssembler::ldrh;
4830 chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4831 (chr_insn)&MacroAssembler::ldrh;
4832 uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
4833 (uxt_insn)&MacroAssembler::uxthw;
4834
4835 BLOCK_COMMENT("string_compare {");
4836
4837 // Bizarrely, the counts are passed in bytes, regardless of whether they
4838 // are L or U strings; however, the result is always in characters.
4839 if (!str1_isL) asrw(cnt1, cnt1, 1);
4840 if (!str2_isL) asrw(cnt2, cnt2, 1);
4841
4842 // Compute the minimum of the string lengths and save the difference.
4843 subsw(tmp1, cnt1, cnt2);
4844 cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
4845
4846 // A very short string
4847 cmpw(cnt2, isLL ? 8:4);
4848 br(Assembler::LT, SHORT_STRING);
4849
4850 // Check if the strings start at the same location.
4851 cmp(str1, str2);
4852 br(Assembler::EQ, LENGTH_DIFF);
4853
4854 // Compare longwords
4855 {
4856 subw(cnt2, cnt2, isLL ? 8:4); // The last longword is a special case
4857
4858 // Move both string pointers to the last longword of their
4859 // strings, negate the remaining count, and convert it to bytes.
4860 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4861 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4862 if (isLU || isUL) {
4863 sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4864 eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4865 }
4866 sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4867
4868 // Loop, loading longwords and comparing them into rscratch2.
4869 bind(NEXT_WORD);
4870 if (isLU) {
4871 ldrs(vtmp, Address(str1, cnt1));
4872 zip1(vtmp, T8B, vtmp, vtmpZ);
4873 umov(result, vtmp, D, 0);
4874 } else {
4875 ldr(result, Address(str1, isUL ? cnt1:cnt2));
4876 }
4877 if (isUL) {
4878 ldrs(vtmp, Address(str2, cnt2));
4879 zip1(vtmp, T8B, vtmp, vtmpZ);
4880 umov(rscratch1, vtmp, D, 0);
4881 } else {
4882 ldr(rscratch1, Address(str2, cnt2));
4883 }
4884 adds(cnt2, cnt2, isUL ? 4:8);
4885 if (isLU || isUL) add(cnt1, cnt1, isLU ? 4:8);
4886 eor(rscratch2, result, rscratch1);
4887 cbnz(rscratch2, DIFFERENCE);
4888 br(Assembler::LT, NEXT_WORD);
4889
4890 // Last longword. In the case where length == 4 we compare the
4891 // same longword twice, but that's still faster than another
4892 // conditional branch.
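// Both pointers were biased to the string ends above, so offset 0 below
// names the final longword (sketch, ignoring the LU/UL inflation):
//   last1 = *(uint64_t*)(str1_end - 8); last2 = *(uint64_t*)(str2_end - 8);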
4893
4894 if (isLU) {
4895 ldrs(vtmp, Address(str1));
4896 zip1(vtmp, T8B, vtmp, vtmpZ);
4897 umov(result, vtmp, D, 0);
4898 } else {
4899 ldr(result, Address(str1));
4900 }
4901 if (isUL) {
4902 ldrs(vtmp, Address(str2));
4903 zip1(vtmp, T8B, vtmp, vtmpZ);
4904 umov(rscratch1, vtmp, D, 0);
4905 } else {
4906 ldr(rscratch1, Address(str2));
4907 }
4908 eor(rscratch2, result, rscratch1);
4909 cbz(rscratch2, LENGTH_DIFF);
4910
4911 // Find the first different characters in the longwords and
4912 // compute their difference.
4913 bind(DIFFERENCE);
4914 rev(rscratch2, rscratch2);
4915 clz(rscratch2, rscratch2);
4916 andr(rscratch2, rscratch2, isLL ? -8 : -16);
4917 lsrv(result, result, rscratch2);
4918 (this->*ext_chr)(result, result);
4919 lsrv(rscratch1, rscratch1, rscratch2);
4920 (this->*ext_chr)(rscratch1, rscratch1);
4921 subw(result, result, rscratch1);
4922 b(DONE);
4923 }
4924
4925 bind(SHORT_STRING);
4926 // Is the minimum length zero?
4927 cbz(cnt2, LENGTH_DIFF);
4928
4929 bind(SHORT_LOOP);
4930 (this->*str1_load_chr)(result, Address(post(str1, str1_chr_size)));
4931 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
4932 subw(result, result, cnt1);
4933 cbnz(result, DONE);
4934 sub(cnt2, cnt2, 1);
4935 cbnz(cnt2, SHORT_LOOP);
4936
4937 // Strings are equal up to min length. Return the length difference.
4938 bind(LENGTH_DIFF);
4939 mov(result, tmp1);
4940
4941 // That's it
4942 bind(DONE);
4943
4944 BLOCK_COMMENT("} string_compare");
4945 }
4946
4947 // Compare Strings or char/byte arrays.
4948
4949 // is_string is true iff this is a string comparison.
4950
4951 // For Strings we're passed the address of the first characters in a1
4952 // and a2 and the length in cnt1.
4953
4954 // For byte and char arrays we're passed the arrays themselves and we
4955 // have to extract length fields and do null checks here.
4956
4957 // elem_size is the element size in bytes: either 1 or 2.
4958
4959 // There are two implementations. For arrays >= 8 bytes, all
4960 // comparisons (including the final one, which may overlap) are
4961 // performed 8 bytes at a time. For arrays < 8 bytes, we compare a
4962 // word, then a halfword, and then a byte.
4963
4964 void MacroAssembler::arrays_equals(Register a1, Register a2,
4965 Register result, Register cnt1,
4966 int elem_size, bool is_string)
4967 {
4968 Label SAME, DONE, SHORT, NEXT_WORD, ONE;
4969 Register tmp1 = rscratch1;
4970 Register tmp2 = rscratch2;
4971 Register cnt2 = tmp2; // cnt2 only used in array length compare
4972 int elem_per_word = wordSize/elem_size;
4973 int log_elem_size = exact_log2(elem_size);
4974 int length_offset = arrayOopDesc::length_offset_in_bytes();
4975 int base_offset
4976 = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
4977
4978 assert(elem_size == 1 || elem_size == 2, "must be char or byte");
4979 assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
4980
4981 #ifndef PRODUCT
4982 {
4983 const char kind = (elem_size == 2) ? 'U' : 'L';
4984 char comment[64];
4985 snprintf(comment, sizeof comment, "%s%c%s {",
4986 is_string ?
"string_equals" : "array_equals", 4987 kind, "{"); 4988 BLOCK_COMMENT(comment); 4989 } 4990 #endif 4991 4992 mov(result, false); 4993 4994 if (!is_string) { 4995 // if (a==a2) 4996 // return true; 4997 cmp(a1, a2); 4998 oopDesc::bs()->asm_acmp_barrier(this, a1, a2); 4999 br(Assembler::EQ, SAME); 5000 // if (a==null || a2==null) 5001 // return false; 5002 cbz(a1, DONE); 5003 cbz(a2, DONE); 5004 // if (a1.length != a2.length) 5005 // return false; 5006 ldrw(cnt1, Address(a1, length_offset)); 5007 ldrw(cnt2, Address(a2, length_offset)); 5008 eorw(tmp1, cnt1, cnt2); 5009 cbnzw(tmp1, DONE); 5010 5011 lea(a1, Address(a1, base_offset)); 5012 lea(a2, Address(a2, base_offset)); 5013 } 5014 5015 // Check for short strings, i.e. smaller than wordSize. 5016 subs(cnt1, cnt1, elem_per_word); 5017 br(Assembler::LT, SHORT); 5018 // Main 8 byte comparison loop. 5019 bind(NEXT_WORD); { 5020 ldr(tmp1, Address(post(a1, wordSize))); 5021 ldr(tmp2, Address(post(a2, wordSize))); 5022 subs(cnt1, cnt1, elem_per_word); 5023 eor(tmp1, tmp1, tmp2); 5024 cbnz(tmp1, DONE); 5025 } br(GT, NEXT_WORD); 5026 // Last longword. In the case where length == 4 we compare the 5027 // same longword twice, but that's still faster than another 5028 // conditional branch. 5029 // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when 5030 // length == 4. 5031 if (log_elem_size > 0) 5032 lsl(cnt1, cnt1, log_elem_size); 5033 ldr(tmp1, Address(a1, cnt1)); 5034 ldr(tmp2, Address(a2, cnt1)); 5035 eor(tmp1, tmp1, tmp2); 5036 cbnz(tmp1, DONE); 5037 b(SAME); 5038 5039 bind(SHORT); 5040 Label TAIL03, TAIL01; 5041 5042 tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left. 5043 { 5044 ldrw(tmp1, Address(post(a1, 4))); 5045 ldrw(tmp2, Address(post(a2, 4))); 5046 eorw(tmp1, tmp1, tmp2); 5047 cbnzw(tmp1, DONE); 5048 } 5049 bind(TAIL03); 5050 tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left. 5051 { 5052 ldrh(tmp1, Address(post(a1, 2))); 5053 ldrh(tmp2, Address(post(a2, 2))); 5054 eorw(tmp1, tmp1, tmp2); 5055 cbnzw(tmp1, DONE); 5056 } 5057 bind(TAIL01); 5058 if (elem_size == 1) { // Only needed when comparing byte arrays. 5059 tbz(cnt1, 0, SAME); // 0-1 bytes left. 5060 { 5061 ldrb(tmp1, a1); 5062 ldrb(tmp2, a2); 5063 eorw(tmp1, tmp1, tmp2); 5064 cbnzw(tmp1, DONE); 5065 } 5066 } 5067 // Arrays are equal. 5068 bind(SAME); 5069 mov(result, true); 5070 5071 // That's it. 5072 bind(DONE); 5073 BLOCK_COMMENT(is_string ? "} string_equals" : "} array_equals"); 5074 } 5075 5076 5077 // base: Address of a buffer to be zeroed, 8 bytes aligned. 5078 // cnt: Count in HeapWords. 5079 // is_large: True when 'cnt' is known to be >= BlockZeroingLowLimit. 5080 void MacroAssembler::zero_words(Register base, Register cnt) 5081 { 5082 if (UseBlockZeroing) { 5083 block_zero(base, cnt); 5084 } else { 5085 fill_words(base, cnt, zr); 5086 } 5087 } 5088 5089 // r10 = base: Address of a buffer to be zeroed, 8 bytes aligned. 5090 // cnt: Immediate count in HeapWords. 

// base:  Address of a buffer to be zeroed, 8 bytes aligned.
// cnt:   Count in HeapWords.
void MacroAssembler::zero_words(Register base, Register cnt)
{
  if (UseBlockZeroing) {
    block_zero(base, cnt);
  } else {
    fill_words(base, cnt, zr);
  }
}

// r10 = base:  Address of a buffer to be zeroed, 8 bytes aligned.
// cnt:         Immediate count in HeapWords.
// r11 = tmp:   For use as cnt if we need to call out
#define ShortArraySize (18 * BytesPerLong)
void MacroAssembler::zero_words(Register base, u_int64_t cnt)
{
  Register tmp = r11;
  int i = cnt & 1;  // store any odd word to start
  if (i) str(zr, Address(base));

  if (cnt <= ShortArraySize / BytesPerLong) {
    for (; i < (int)cnt; i += 2)
      stp(zr, zr, Address(base, i * wordSize));
  } else if (UseBlockZeroing && cnt >= (u_int64_t)(BlockZeroingLowLimit >> LogBytesPerWord)) {
    mov(tmp, cnt);
    block_zero(base, tmp, true);
  } else {
    const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
    int remainder = cnt % (2 * unroll);
    for (; i < remainder; i += 2)
      stp(zr, zr, Address(base, i * wordSize));

    Label loop;
    Register cnt_reg = rscratch1;
    Register loop_base = rscratch2;
    cnt = cnt - remainder;
    mov(cnt_reg, cnt);
    // Adjust base and prebias by -2 * wordSize so we can pre-increment.
    add(loop_base, base, (remainder - 2) * wordSize);
    bind(loop);
    sub(cnt_reg, cnt_reg, 2 * unroll);
    for (i = 1; i < unroll; i++)
      stp(zr, zr, Address(loop_base, 2 * i * wordSize));
    stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
    cbnz(cnt_reg, loop);
  }
}

// base:   Address of a buffer to be filled, 8 bytes aligned.
// cnt:    Count in 8-byte units.
// value:  Value to be filled with.
// base will point to the end of the buffer after filling.
void MacroAssembler::fill_words(Register base, Register cnt, Register value)
{
//  Algorithm:
//
//    scratch1 = cnt & 7;
//    cnt -= scratch1;
//    p += scratch1;
//    switch (scratch1) {
//      do {
//        cnt -= 8;
//          p[-8] = v;
//        case 7:
//          p[-7] = v;
//        case 6:
//          p[-6] = v;
//          // ...
//        case 1:
//          p[-1] = v;
//        case 0:
//          p += 8;
//      } while (cnt);
//    }

  assert_different_registers(base, cnt, value, rscratch1, rscratch2);

  Label fini, skip, entry, loop;
  const int unroll = 8; // Number of stp instructions we'll unroll

  cbz(cnt, fini);
  tbz(base, 3, skip);
  str(value, Address(post(base, 8)));
  sub(cnt, cnt, 1);
  bind(skip);

  andr(rscratch1, cnt, (unroll-1) * 2);
  sub(cnt, cnt, rscratch1);
  add(base, base, rscratch1, Assembler::LSL, 3);
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
  br(rscratch2);

  bind(loop);
  add(base, base, unroll * 16);
  for (int i = -unroll; i < 0; i++)
    stp(value, value, Address(base, i * 16));
  bind(entry);
  subs(cnt, cnt, unroll * 2);
  br(Assembler::GE, loop);

  tbz(cnt, 0, fini);
  str(value, Address(post(base, 8)));
  bind(fini);
}
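
// Note on the computed branch above (the same dispatch is used again
// in block_zero's small path below): every stp covers two words and
// encodes in four bytes, so branching to (entry - 4 * p) executes
// exactly p trailing stps on the first pass, storing the 2*p words
// not covered by full 16-word iterations.  Worked example, for
// illustration: with cnt = 23 after alignment, rscratch1 = 23 & 14 = 6,
// so the br lands three stps before 'entry' (3 pairs = 6 words), one
// full loop iteration stores 16 more, and the final tbz/str picks up
// the odd word: 6 + 16 + 1 = 23.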

// Use DC ZVA to do fast zeroing.
// base:   Address of a buffer to be zeroed, 8 bytes aligned.
// cnt:    Count in HeapWords.
// is_large: True when 'cnt' is known to be >= BlockZeroingLowLimit.
void MacroAssembler::block_zero(Register base, Register cnt, bool is_large)
{
  Label small;
  Label store_pair, loop_store_pair, done;
  Label base_aligned;

  assert_different_registers(base, cnt, rscratch1);
  guarantee(base == r10 && cnt == r11, "fix register usage");

  Register tmp = rscratch1;
  Register tmp2 = rscratch2;
  int zva_length = VM_Version::zva_length();

  // Ensure the ZVA length is a multiple of 16. This is required by
  // the subsequent operations.
  assert (zva_length % 16 == 0, "Unexpected ZVA Length");

  if (!is_large) cbz(cnt, done);
  tbz(base, 3, base_aligned);
  str(zr, Address(post(base, 8)));
  sub(cnt, cnt, 1);
  bind(base_aligned);

  // Ensure count >= zva_length * 2 so that it still deserves a zva after
  // alignment.
  if (!is_large || !(BlockZeroingLowLimit >= zva_length * 2)) {
    int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
    subs(tmp, cnt, low_limit >> 3);
    br(Assembler::LT, small);
  }

  far_call(StubRoutines::aarch64::get_zero_longs());

  bind(small);

  const int unroll = 8; // Number of stp instructions we'll unroll
  Label small_loop, small_table_end;

  andr(tmp, cnt, (unroll-1) * 2);
  sub(cnt, cnt, tmp);
  add(base, base, tmp, Assembler::LSL, 3);
  adr(tmp2, small_table_end);
  sub(tmp2, tmp2, tmp, Assembler::LSL, 1);
  br(tmp2);

  bind(small_loop);
  add(base, base, unroll * 16);
  for (int i = -unroll; i < 0; i++)
    stp(zr, zr, Address(base, i * 16));
  bind(small_table_end);
  subs(cnt, cnt, unroll * 2);
  br(Assembler::GE, small_loop);

  tbz(cnt, 0, done);
  str(zr, Address(post(base, 8)));

  bind(done);
}

// Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
// java/lang/StringUTF16.compress.
void MacroAssembler::encode_iso_array(Register src, Register dst,
                      Register len, Register result,
                      FloatRegister Vtmp1, FloatRegister Vtmp2,
                      FloatRegister Vtmp3, FloatRegister Vtmp4)
{
    Label DONE, NEXT_32, LOOP_8, NEXT_8, LOOP_1, NEXT_1;
    Register tmp1 = rscratch1;

    mov(result, len); // Save initial len

#ifndef BUILTIN_SIM
    subs(len, len, 32);
    br(LT, LOOP_8);

    // The following code uses the SIMD 'uqxtn' and 'uqxtn2' instructions
    // to convert chars to bytes.  These set the 'QC' bit in the FPSR if
    // any char could not fit in a byte, so clear the FPSR so we can test it.
    clear_fpsr();

    BIND(NEXT_32);
      ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
      uqxtn(Vtmp1, T8B, Vtmp1, T8H);  // uqxtn  - write bottom half
      uqxtn(Vtmp1, T16B, Vtmp2, T8H); // uqxtn2 - write top half
      uqxtn(Vtmp2, T8B, Vtmp3, T8H);  // uqxtn
      uqxtn(Vtmp2, T16B, Vtmp4, T8H); // uqxtn2
      get_fpsr(tmp1);
      cbnzw(tmp1, LOOP_8);
      st1(Vtmp1, Vtmp2, T16B, post(dst, 32));
      subs(len, len, 32);
      add(src, src, 64);
      br(GE, NEXT_32);

    BIND(LOOP_8);
      adds(len, len, 32-8);
      br(LT, LOOP_1);
      clear_fpsr(); // QC may be set from loop above, clear again
    BIND(NEXT_8);
      ld1(Vtmp1, T8H, src);
      uqxtn(Vtmp1, T8B, Vtmp1, T8H);
      get_fpsr(tmp1);
      cbnzw(tmp1, LOOP_1);
      st1(Vtmp1, T8B, post(dst, 8));
      subs(len, len, 8);
      add(src, src, 16);
      br(GE, NEXT_8);

    BIND(LOOP_1);
      adds(len, len, 8);
      br(LE, DONE);
#else
    cbz(len, DONE);
#endif
    BIND(NEXT_1);
      ldrh(tmp1, Address(post(src, 2)));
      tst(tmp1, 0xff00);
      br(NE, DONE);
      strb(tmp1, Address(post(dst, 1)));
      subs(len, len, 1);
      br(GT, NEXT_1);

    BIND(DONE);
      sub(result, result, len); // Return index where we stopped;
                                // len == 0 if we processed all
                                // characters.
}
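
// A rough C model of encode_iso_array's contract, illustrative only
// (the SIMD paths above merely accelerate this loop and fall back to
// it when the FPSR QC bit reports a char that won't fit in a byte):
//
//   int encode_iso(const jchar* src, jbyte* dst, int len) {
//     int i;
//     for (i = 0; i < len && src[i] <= 0xff; i++)
//       dst[i] = (jbyte)src[i];
//     return i;  // == len iff every char was Latin-1
//   }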

// Inflate byte[] array to char[].
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
                                        FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                        Register tmp4) {
  Label big, done;

  assert_different_registers(src, dst, len, tmp4, rscratch1);

  fmovd(vtmp1, zr);
  lsrw(rscratch1, len, 3);

  cbnzw(rscratch1, big);

  // Short string: less than 8 bytes.
  {
    Label loop, around, tiny;

    subsw(len, len, 4);
    andw(len, len, 3);
    br(LO, tiny);

    // Use SIMD to do 4 bytes.
    ldrs(vtmp2, post(src, 4));
    zip1(vtmp3, T8B, vtmp2, vtmp1);
    strd(vtmp3, post(dst, 8));

    cbzw(len, done);

    // Do the remaining bytes by steam.
    bind(loop);
    ldrb(tmp4, post(src, 1));
    strh(tmp4, post(dst, 2));
    subw(len, len, 1);

    bind(tiny);
    cbnz(len, loop);

    bind(around);
    b(done);
  }

  // Unpack the bytes 8 at a time.
  bind(big);
  andw(len, len, 7);

  {
    Label loop, around;

    bind(loop);
    ldrd(vtmp2, post(src, 8));
    sub(rscratch1, rscratch1, 1);
    zip1(vtmp3, T16B, vtmp2, vtmp1);
    st1(vtmp3, T8H, post(dst, 16));
    cbnz(rscratch1, loop);

    bind(around);
  }

  // Do the tail of up to 8 bytes; the load and store may overlap the
  // last full 8-byte block already processed.
  sub(src, src, 8);
  add(src, src, len, ext::uxtw, 0);
  ldrd(vtmp2, Address(src));
  sub(dst, dst, 16);
  add(dst, dst, len, ext::uxtw, 1);
  zip1(vtmp3, T16B, vtmp2, vtmp1);
  st1(vtmp3, T8H, Address(dst));

  bind(done);
}
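
// The zip1-with-zero trick above is the whole inflation step: zipping
// the source bytes with a zeroed register interleaves each byte with
// 0x00, which on a little-endian machine is exactly zero-extension to
// 16-bit chars.  A rough C model, illustrative only:
//
//   void inflate8(const uint8_t src[8], uint16_t dst[8]) {
//     for (int i = 0; i < 8; i++)
//       dst[i] = src[i];  // what zip1(src, zeros) produces in memory
//   }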

// Compress char[] array to byte[].
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
                                         FloatRegister tmp1Reg, FloatRegister tmp2Reg,
                                         FloatRegister tmp3Reg, FloatRegister tmp4Reg,
                                         Register result) {
  encode_iso_array(src, dst, len, result,
                   tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
  // encode_iso_array leaves len == 0 iff every char fit in a byte;
  // return the length on success and 0 on failure.
  cmp(len, zr);
  csel(result, result, zr, EQ);
}

// get_thread() can be called anywhere inside generated code so we
// need to save whatever non-callee-saved context might get clobbered
// by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
// the call setup code.
//
// aarch64_get_thread_helper() clobbers only r0, r1, and flags.
//
void MacroAssembler::get_thread(Register dst) {
  RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
  push(saved_regs, sp);

  mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
  blrt(lr, 1, 0, 1);
  if (dst != c_rarg0) {
    mov(dst, c_rarg0);
  }

  pop(saved_regs, sp);
}

// Shenandoah requires that all objects are evacuated before being
// written to, and that fromspace pointers are not written into
// objects during concurrent marking.  These methods check for that.

void MacroAssembler::in_heap_check(Register r, Register tmp, Label &nope) {
  ShenandoahHeap* h = (ShenandoahHeap*) Universe::heap();

  HeapWord* first_region_bottom = h->first_region_bottom();
  HeapWord* last_region_end = first_region_bottom
    + (ShenandoahHeapRegion::region_size_bytes() / HeapWordSize) * h->max_regions();

  mov(tmp, (uintptr_t)first_region_bottom);
  cmp(r, tmp);
  br(Assembler::LO, nope);
  mov(tmp, (uintptr_t)last_region_end);
  cmp(r, tmp);
  br(Assembler::HS, nope);
}

void MacroAssembler::shenandoah_cset_check(Register obj, Register tmp1, Register tmp2, Label& done) {

  // Test whether obj is in the collection set; branch to done if not.
  lsr(tmp1, obj, ShenandoahHeapRegion::region_size_shift_jint());
  assert(ShenandoahHeap::in_cset_fast_test_addr() != 0, "sanity");
  mov(tmp2, ShenandoahHeap::in_cset_fast_test_addr());
  ldrb(tmp2, Address(tmp2, tmp1));
  tbz(tmp2, 0, done);

  // A cancelled GC also passes the check.
  assert(ShenandoahHeap::cancelled_concgc_addr() != 0, "sanity");
  mov(tmp2, ShenandoahHeap::cancelled_concgc_addr());
  ldrb(tmp2, Address(tmp2));
  cbnz(tmp2, done);
}
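
// The fast test above is a byte-map lookup indexed by heap-region
// number.  A rough C model, illustrative only ('cset_map' stands for
// the byte array at ShenandoahHeap::in_cset_fast_test_addr() and is
// not a name used in this file):
//
//   bool in_collection_set(uintptr_t obj) {
//     size_t region = obj >> region_size_shift;  // region containing obj
//     return (cset_map[region] & 1) != 0;        // one byte per region
//   }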

void MacroAssembler::_shenandoah_store_check(Address addr, Register value, const char* msg, const char* file, int line) {
  _shenandoah_store_check(addr.base(), value, msg, file, line);
}

void MacroAssembler::_shenandoah_store_check(Register addr, Register value, const char* msg, const char* file, int line) {

  if (! UseShenandoahGC || ! ShenandoahStoreCheck) return;
  if (addr == r31_sp || addr == sp) return; // Stack-based target

  Register raddr = r8;
  Register rval = r9;
  Register tmp1 = r10;
  Register tmp2 = r11;

  RegSet to_save = RegSet::of(raddr, rval, tmp1, tmp2);

  // Push tmp regs and flags.
  push(to_save, sp);
  get_nzcv(tmp1);
  push(RegSet::of(tmp1), sp);

  mov(rval, value);
  mov(raddr, addr);

  Label done;

  // If not an in-heap target, skip the check.
  in_heap_check(raddr, tmp1, done);

  // Pass unless the target oop is in the collection set.
  shenandoah_cset_check(raddr, tmp1, tmp2, done);

  // Do the value check only when concurrent mark is in progress.
  mov(tmp1, ShenandoahHeap::concurrent_mark_in_progress_addr());
  ldrw(tmp1, Address(tmp1));
  cbzw(tmp1, done);

  // Null-check value.
  cbz(rval, done);

  // Pass unless the value oop is in the collection set.
  shenandoah_cset_check(rval, tmp1, tmp2, done);

  // Failure.
  // Pop tmp regs and flags.
  pop(RegSet::of(tmp1), sp);
  set_nzcv(tmp1);
  pop(to_save, sp);
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("shenandoah_store_check: %s in file: %s line: %i", msg, file, line);
    b = code_string(ss.as_string());
  }
  stop(b);

  bind(done);
  // Pop tmp regs and flags.
  pop(RegSet::of(tmp1), sp);
  set_nzcv(tmp1);
  pop(to_save, sp);
}

void MacroAssembler::_shenandoah_store_addr_check(Address addr, const char* msg, const char* file, int line) {
  _shenandoah_store_addr_check(addr.base(), msg, file, line);
}

void MacroAssembler::_shenandoah_store_addr_check(Register dst, const char* msg, const char* file, int line) {

  if (! UseShenandoahGC || ! ShenandoahStoreCheck) return;
  if (dst == r31_sp || dst == sp) return; // Stack-based target

  Register addr = r8;
  Register tmp1 = r9;
  Register tmp2 = r10;

  Label done;
  RegSet to_save = RegSet::of(addr, tmp1, tmp2);

  // Push tmp regs and flags.
  push(to_save, sp);
  get_nzcv(tmp1);
  push(RegSet::of(tmp1), sp);

  orr(addr, zr, dst); // i.e. mov(addr, dst)

  // Check null.
  cbz(addr, done);

  in_heap_check(addr, tmp1, done);

  shenandoah_cset_check(addr, tmp1, tmp2, done);

  // Failure.
  // Pop tmp regs and flags.
  pop(RegSet::of(tmp1), sp);
  set_nzcv(tmp1);
  pop(to_save, sp);
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("shenandoah_store_addr_check: %s in file: %s line: %i", msg, file, line);
    b = code_string(ss.as_string());
  }
  stop(b);

  bind(done);
  // Pop tmp regs and flags.
  pop(RegSet::of(tmp1), sp);
  set_nzcv(tmp1);
  pop(to_save, sp);
}
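
// A rough outline of the check the addr variant emits, illustrative
// only (the value variant above additionally requires concurrent mark
// to be in progress and applies the same collection-set test to the
// stored value):
//
//   if (addr != NULL && in_heap(addr)
//       && in_collection_set(addr) && !gc_cancelled())
//     fail(msg, file, line);   // reported via stop()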