/*
 * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "jvm.h"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/cardTableBarrierSet.hpp"
#include "interpreter/interpreter.hpp"
#include "compiler/disassembler.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/accessDecorators.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/jniHandles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/signature_cc.hpp"
#include "runtime/thread.hpp"
#ifdef COMPILER1
#include "c1/c1_LIRAssembler.hpp"
#endif
#ifdef COMPILER2
#include "oops/oop.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/node.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
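//
// (Illustrative, not from the original sources: patching an unconditional
// branch emitted at 0x1000 to target 0x2000 computes
// offset = (0x2000 - 0x1000) >> 2 = 0x400, which is sign-patched into
// bits 25:0 of the branch instruction below.)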
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target-branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      u_int64_t dest = (u_int64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
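      //
      // (Illustrative page arithmetic, not from the original comment: with
      // branch == 0x12340 and target == 0x98765 we get pc_page == 0x12 and
      // adr_page == 0x98, so the adrp is patched with the page delta 0x86
      // while the low 12 bits of the target, offset_lo == 0x765, are
      // re-encoded into the following ldr/str or add instruction.)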
      //
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
          Instruction_aarch64::extract(insn, 4, 0) ==
          Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
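  //
  // (Illustrative, not from the original comment: for a narrow oop value
  // 0x12345678 the first instruction's imm16 field receives the upper half
  // 0x1234 and the trailing movk receives the lower half 0x5678, matching
  // the n >> 16 / n & 0xffff split below.)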
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = CompressedOops::encode((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
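      //
      // (Illustrative, type 3 only: the movk supplies bits 47:32, so the
      // address returned below keeps the low 32 bits of the adrp page
      // result and replaces the upper bits with the movk's imm16 field,
      // bits 20:5 of insn2.)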
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
          Instruction_aarch64::extract(insn, 4, 0) ==
          Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
            Instruction_aarch64::extract(insn, 4, 0) ==
            Instruction_aarch64::extract(insn2, 4, 0)) {
          target_page = (target_page & 0xffffffff) |
                        ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    u_int32_t *insns = (u_int32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    return 0;
  } else {
    ShouldNotReachHere();
  }
  return address(((uint64_t)insn_addr + (offset << 2)));
}

void MacroAssembler::safepoint_poll(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    unsigned long offset;
    adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
    ldrw(rscratch1, Address(rscratch1, offset));
    assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
    cbnz(rscratch1, slow_path);
  }
}

// Just like safepoint_poll, but use an acquiring load for thread-
// local polling.
//
// We need an acquire here to ensure that any subsequent load of the
// global SafepointSynchronize::_state flag is ordered after this load
// of the local Thread::_polling_page.  We don't want this poll to
// return false (i.e. not safepointing) and a later poll of the global
// SafepointSynchronize::_state spuriously to return true.
//
// This is to avoid a race when we're in a native->Java transition
// racing the code which wakes up from a safepoint.
//
void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
    ldar(rscratch1, rscratch1);
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    safepoint_poll(slow_path);
  }
}

void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp & sp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
    str(last_java_pc, Address(rthread,
                              JavaThread::frame_anchor_offset()
                              + JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  assert(last_java_pc != NULL, "must provide a valid PC");

  adr(scratch, last_java_pc);
  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, scratch);
  }
}

void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb.
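    // The adrp materializes the 4K-aligned page address of 'entry' and
    // returns the remaining position within that page in 'offset'; the
    // add below folds that page offset back in before the indirect call.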
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    blr(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}

void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    br(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}

void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;

  ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
  cmp(sp, rscratch1);
  br(Assembler::LO, no_reserved_zone_enabling);

  enter();   // LR and FP are live.
  lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
  mov(c_rarg0, rthread);
  blr(rscratch1);
  leave();

  // We have already removed our own frame.
  // throw_delayed_StackOverflowError will think that it's been
  // called by our caller.
  lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
  br(rscratch1);
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markWord::age_shift == markWord::lock_bits + markWord::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markWord::biased_lock_mask_in_place);
  cmp(tmp_reg, (u1)markWord::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
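  // The next three instructions compute
  //   tmp_reg = (prototype_header | rthread) ^ mark_word
  // with the age bits then masked off: the result is zero exactly when
  // the header is biased to the current thread in the current epoch.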
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  andr(tmp_reg, tmp_reg, ~((int) markWord::age_mask_in_place));
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markWord::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markWord::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markWord::biased_lock_mask_in_place | markWord::age_mask_in_place | markWord::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1, rscratch2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markWord::biased_lock_mask_in_place);
  cmp(temp_reg, (u1)markWord::biased_lock_pattern);
  br(Assembler::EQ, done);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result,   "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline.  If the code cache is small
// trampolines won't be emitted.
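//
// For orientation, the stub emitted by emit_trampoline_stub() below has
// this shape:
//
//   ldr  rscratch1, <target>   // pc-relative load of the 64-bit destination
//   br   rscratch1
// <target>:
//   .quad dest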

address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  // We need a trampoline if branches are far.
  if (far_branches()) {
    bool in_scratch_emit_size = false;
#ifdef COMPILER2
    // We don't want to emit a trampoline if C2 is generating dummy
    // code during its branch shortening phase.
    CompileTask* task = ciEnv::current()->task();
    in_scratch_emit_size =
      (task != NULL && is_c2_compile(task->comp_level()) &&
       Compile::current()->in_scratch_emit_size());
#endif
    if (!in_scratch_emit_size) {
      address stub = emit_trampoline_stub(offset(), entry.target());
      if (stub == NULL) {
        return NULL; // CodeCache is full
      }
    }
  }

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
  if (!far_branches()) {
    bl(entry.target());
  } else {
    bl(pc());
  }
  // just need to return a non-null address
  return pc();
}


// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)

address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  // Max stub size: alignment nop, TrampolineStub.
  address stub = start_a_stub(NativeInstruction::instruction_size
                              + NativeCallTrampolineStub::instruction_size);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call
  // - call
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}

void MacroAssembler::emit_static_call_stub() {
  // CompiledDirectStaticCall::set_to_interpreted knows the
  // exact layout of this stub.

  isb();
  mov_metadata(rmethod, (Metadata*)NULL);

  // Jump to the entry point of the i2c stub.
  movptr(rscratch1, 0);
  br(rscratch1);
}

void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
  tst(x, 0xff);
  cset(x, Assembler::NE);
}

address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step = itableOffsetEntry::size() * wordSize;
  int vte_size = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel) break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  if (return_method) {
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result,
        form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
  }
}

void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);
  cmp(super_klass, rscratch1); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    subs(zr, super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// scans count pointer sized words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

// scans count 4 byte words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
                                 Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldrw(scratch, post(addr, wordSize));
  cmpw(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  BLOCK_COMMENT("check_klass_subtype_slow_path");

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
  assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)

  RegSet pushed_registers;
  if (!IS_A_TEMP(r2)) pushed_registers += r2;
  if (!IS_A_TEMP(r5)) pushed_registers += r5;

  if (super_klass != r0 || UseCompressedOops) {
    if (!IS_A_TEMP(r0)) pushed_registers += r0;
  }

  push(pushed_registers, sp);

  // Get super_klass value into r0 (even if it was in r5 or r2).
  if (super_klass != r0) {
    mov(r0, super_klass);
  }

#ifndef PRODUCT
  mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
  Address pst_counter_addr(rscratch2);
  ldr(rscratch1, pst_counter_addr);
  add(rscratch1, rscratch1, 1);
  str(rscratch1, pst_counter_addr);
#endif //PRODUCT

  // We will consult the secondary-super array.
  ldr(r5, secondary_supers_addr);
  // Load the array length.
  ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  add(r5, r5, Array<Klass*>::base_offset_in_bytes());

  cmp(sp, zr); // Clear Z flag; SP is never zero
  // Scan R2 words at [R5] for an occurrence of R0.
  // Set NZ/Z based on last compare.
  repne_scan(r5, r0, r2, rscratch1);

  // Unspill the temp. registers:
  pop(pushed_registers, sp);

  br(Assembler::NE, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  str(super_klass, super_cache_addr);

  if (L_success != &L_fallthrough) {
    b(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}

void MacroAssembler::clinit_barrier(Register klass, Register scratch, Label* L_fast_path, Label* L_slow_path) {
  assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
  assert_different_registers(klass, rthread, scratch);

  Label L_fallthrough, L_tmp;
  if (L_fast_path == NULL) {
    L_fast_path = &L_fallthrough;
  } else if (L_slow_path == NULL) {
    L_slow_path = &L_fallthrough;
  }
  // Fast path check: class is fully initialized
  ldrb(scratch, Address(klass, InstanceKlass::init_state_offset()));
  subs(zr, scratch, InstanceKlass::fully_initialized);
  br(Assembler::EQ, *L_fast_path);

  // Fast path check: current thread is initializer thread
  ldr(scratch, Address(klass, InstanceKlass::init_thread_offset()));
  cmp(rthread, scratch);

  if (L_slow_path == &L_fallthrough) {
    br(Assembler::EQ, *L_fast_path);
    bind(*L_slow_path);
  } else if (L_fast_path == &L_fallthrough) {
    br(Assembler::NE, *L_slow_path);
    bind(*L_fast_path);
  } else {
    Unimplemented();
  }
}

void MacroAssembler::verify_oop(Register reg, const char* s) {
  if (!VerifyOops || VerifyAdapterSharing) {
    // Below address of the code string confuses VerifyAdapterSharing
    // because it may differ between otherwise equivalent adapters.
    return;
  }

  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s", reg->name(), s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  mov(r0, reg);
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop");
}

void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops || VerifyAdapterSharing) {
    // Below address of the code string confuses VerifyAdapterSharing
    // because it may differ between otherwise equivalent adapters.
    return;
  }

  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s", s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop_addr {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  // addr may contain sp so we will have to adjust it based on the
  // pushes that we just did.
  if (addr.uses(sp)) {
    lea(r0, addr);
    ldr(r0, Address(r0, 4 * wordSize));
  } else {
    ldr(r0, addr);
  }
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop_addr");
}

Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
                                         int extra_slot_offset) {
  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
  int stackElementSize = Interpreter::stackElementSize;
  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
#ifdef ASSERT
  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
  assert(offset1 - offset == stackElementSize, "correct arithmetic");
#endif
  if (arg_slot.is_constant()) {
    return Address(esp, arg_slot.as_constant() * stackElementSize
                   + offset);
  } else {
    add(rscratch1, esp, arg_slot.as_register(),
        ext::uxtx, exact_log2(stackElementSize));
    return Address(rscratch1, offset);
  }
}

void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments,
                                       Label *retaddr) {
  Label E, L;

  stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));

  mov(rscratch1, entry_point);
  blr(rscratch1);
  if (retaddr)
    bind(*retaddr);

  ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
  maybe_isb();
}

void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  call_VM_leaf_base(entry_point, number_of_arguments);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
                                  Register arg_1, Register arg_2) {
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  pass_arg2(this, arg_2);
  call_VM_leaf_base(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point) {
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
}
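// Note: the multi-argument overloads below marshal the highest-numbered
// c_rarg first, so that moving, say, arg_2 into c_rarg2 cannot clobber an
// incoming value that is still needed for arg_0 or arg_1; the "smashed
// arg" asserts check the overlaps that remain hazardous.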
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
  assert(arg_0 != c_rarg3, "smashed arg");
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 4);
}

void MacroAssembler::null_check(Register reg, int offset) {
  if (needs_explicit_null_check(offset)) {
    // provoke OS NULL exception if reg = NULL by
    // accessing M[reg] w/o changing any registers
    // NOTE: this is plenty to provoke a segv
    ldr(zr, Address(reg));
  } else {
    // nothing to do, (later) access of M[reg + offset]
    // will provoke OS NULL exception if reg = NULL
  }
}

void MacroAssembler::test_klass_is_value(Register klass, Register temp_reg, Label& is_value) {
  ldrw(temp_reg, Address(klass, Klass::access_flags_offset()));
  andr(temp_reg, temp_reg, JVM_ACC_VALUE);
  cbnz(temp_reg, is_value);
}

void MacroAssembler::test_field_is_flattenable(Register flags, Register temp_reg, Label& is_flattenable) {
  (void) temp_reg; // keep signature uniform with x86
  tbnz(flags, ConstantPoolCacheEntry::is_flattenable_field_shift, is_flattenable);
}

void MacroAssembler::test_field_is_not_flattenable(Register flags, Register temp_reg, Label& not_flattenable) {
  (void) temp_reg; // keep signature uniform with x86
  tbz(flags, ConstantPoolCacheEntry::is_flattenable_field_shift, not_flattenable);
}

void MacroAssembler::test_field_is_flattened(Register flags, Register temp_reg, Label& is_flattened) {
  (void) temp_reg; // keep signature uniform with x86
  tbnz(flags, ConstantPoolCacheEntry::is_flattened_field_shift, is_flattened);
}

void MacroAssembler::test_flattened_array_oop(Register oop, Register temp_reg, Label& is_flattened_array) {
  load_storage_props(temp_reg, oop);
  andr(temp_reg, temp_reg, ArrayStorageProperties::flattened_value);
  cbnz(temp_reg, is_flattened_array);
}

void MacroAssembler::test_null_free_array_oop(Register oop, Register temp_reg, Label& is_null_free_array) {
  load_storage_props(temp_reg, oop);
  andr(temp_reg, temp_reg, ArrayStorageProperties::null_free_value);
  cbnz(temp_reg, is_null_free_array);
}

// MacroAssembler protected routines needed to implement
// public methods

void MacroAssembler::mov(Register r, Address dest) {
  code_section()->relocate(pc(), dest.rspec());
  u_int64_t imm64 = (u_int64_t)dest.target();
  movptr(r, imm64);
}

// Move a constant pointer into r.  In AArch64 mode the virtual
// address space is 48 bits in size, so we only need three
// instructions to create a patchable instruction sequence that can
// reach anywhere.
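//
// (Illustrative split, not from the original comment: movptr(r, 0x123456789abc)
// emits movz r, #0x9abc; movk r, #0x5678, lsl #16; movk r, #0x1234, lsl #32.)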
1561 void MacroAssembler::movptr(Register r, uintptr_t imm64) { 1562 #ifndef PRODUCT 1563 { 1564 char buffer[64]; 1565 snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64); 1566 block_comment(buffer); 1567 } 1568 #endif 1569 assert(imm64 < (1ul << 48), "48-bit overflow in address constant"); 1570 movz(r, imm64 & 0xffff); 1571 imm64 >>= 16; 1572 movk(r, imm64 & 0xffff, 16); 1573 imm64 >>= 16; 1574 movk(r, imm64 & 0xffff, 32); 1575 } 1576 1577 // Macro to mov replicated immediate to vector register. 1578 // Vd will get the following values for different arrangements in T 1579 // imm32 == hex 000000gh T8B: Vd = ghghghghghghghgh 1580 // imm32 == hex 000000gh T16B: Vd = ghghghghghghghghghghghghghghghgh 1581 // imm32 == hex 0000efgh T4H: Vd = efghefghefghefgh 1582 // imm32 == hex 0000efgh T8H: Vd = efghefghefghefghefghefghefghefgh 1583 // imm32 == hex abcdefgh T2S: Vd = abcdefghabcdefgh 1584 // imm32 == hex abcdefgh T4S: Vd = abcdefghabcdefghabcdefghabcdefgh 1585 // T1D/T2D: invalid 1586 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) { 1587 assert(T != T1D && T != T2D, "invalid arrangement"); 1588 if (T == T8B || T == T16B) { 1589 assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)"); 1590 movi(Vd, T, imm32 & 0xff, 0); 1591 return; 1592 } 1593 u_int32_t nimm32 = ~imm32; 1594 if (T == T4H || T == T8H) { 1595 assert((imm32 & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)"); 1596 imm32 &= 0xffff; 1597 nimm32 &= 0xffff; 1598 } 1599 u_int32_t x = imm32; 1600 int movi_cnt = 0; 1601 int movn_cnt = 0; 1602 while (x) { if (x & 0xff) movi_cnt++; x >>= 8; } 1603 x = nimm32; 1604 while (x) { if (x & 0xff) movn_cnt++; x >>= 8; } 1605 if (movn_cnt < movi_cnt) imm32 = nimm32; 1606 unsigned lsl = 0; 1607 while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; } 1608 if (movn_cnt < movi_cnt) 1609 mvni(Vd, T, imm32 & 0xff, lsl); 1610 else 1611 movi(Vd, T, imm32 & 0xff, lsl); 1612 imm32 >>= 8; lsl += 8; 1613 while (imm32) { 1614 while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; } 1615 if (movn_cnt < movi_cnt) 1616 bici(Vd, T, imm32 & 0xff, lsl); 1617 else 1618 orri(Vd, T, imm32 & 0xff, lsl); 1619 lsl += 8; imm32 >>= 8; 1620 } 1621 } 1622 1623 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64) 1624 { 1625 #ifndef PRODUCT 1626 { 1627 char buffer[64]; 1628 snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64); 1629 block_comment(buffer); 1630 } 1631 #endif 1632 if (operand_valid_for_logical_immediate(false, imm64)) { 1633 orr(dst, zr, imm64); 1634 } else { 1635 // we can use a combination of MOVZ or MOVN with 1636 // MOVK to build up the constant 1637 u_int64_t imm_h[4]; 1638 int zero_count = 0; 1639 int neg_count = 0; 1640 int i; 1641 for (i = 0; i < 4; i++) { 1642 imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL); 1643 if (imm_h[i] == 0) { 1644 zero_count++; 1645 } else if (imm_h[i] == 0xffffL) { 1646 neg_count++; 1647 } 1648 } 1649 if (zero_count == 4) { 1650 // one MOVZ will do 1651 movz(dst, 0); 1652 } else if (neg_count == 4) { 1653 // one MOVN will do 1654 movn(dst, 0); 1655 } else if (zero_count == 3) { 1656 for (i = 0; i < 4; i++) { 1657 if (imm_h[i] != 0L) { 1658 movz(dst, (u_int32_t)imm_h[i], (i << 4)); 1659 break; 1660 } 1661 } 1662 } else if (neg_count == 3) { 1663 // one MOVN will do 1664 for (int i = 0; i < 4; i++) { 1665 if (imm_h[i] != 0xffffL) { 1666 movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4)); 1667 break; 1668 } 1669 } 1670 } else if (zero_count == 2) { 1671 // one MOVZ and one MOVK will do 1672 for (i = 
0; i < 3; i++) { 1673 if (imm_h[i] != 0L) { 1674 movz(dst, (u_int32_t)imm_h[i], (i << 4)); 1675 i++; 1676 break; 1677 } 1678 } 1679 for (;i < 4; i++) { 1680 if (imm_h[i] != 0L) { 1681 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1682 } 1683 } 1684 } else if (neg_count == 2) { 1685 // one MOVN and one MOVK will do 1686 for (i = 0; i < 4; i++) { 1687 if (imm_h[i] != 0xffffL) { 1688 movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4)); 1689 i++; 1690 break; 1691 } 1692 } 1693 for (;i < 4; i++) { 1694 if (imm_h[i] != 0xffffL) { 1695 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1696 } 1697 } 1698 } else if (zero_count == 1) { 1699 // one MOVZ and two MOVKs will do 1700 for (i = 0; i < 4; i++) { 1701 if (imm_h[i] != 0L) { 1702 movz(dst, (u_int32_t)imm_h[i], (i << 4)); 1703 i++; 1704 break; 1705 } 1706 } 1707 for (;i < 4; i++) { 1708 if (imm_h[i] != 0x0L) { 1709 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1710 } 1711 } 1712 } else if (neg_count == 1) { 1713 // one MOVN and two MOVKs will do 1714 for (i = 0; i < 4; i++) { 1715 if (imm_h[i] != 0xffffL) { 1716 movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4)); 1717 i++; 1718 break; 1719 } 1720 } 1721 for (;i < 4; i++) { 1722 if (imm_h[i] != 0xffffL) { 1723 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1724 } 1725 } 1726 } else { 1727 // use a MOVZ and 3 MOVKs (makes it easier to debug) 1728 movz(dst, (u_int32_t)imm_h[0], 0); 1729 for (i = 1; i < 4; i++) { 1730 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1731 } 1732 } 1733 } 1734 } 1735 1736 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32) 1737 { 1738 #ifndef PRODUCT 1739 { 1740 char buffer[64]; 1741 snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32); 1742 block_comment(buffer); 1743 } 1744 #endif 1745 if (operand_valid_for_logical_immediate(true, imm32)) { 1746 orrw(dst, zr, imm32); 1747 } else { 1748 // we can use MOVZ, MOVN or two calls to MOVK to build up the 1749 // constant 1750 u_int32_t imm_h[2]; 1751 imm_h[0] = imm32 & 0xffff; 1752 imm_h[1] = ((imm32 >> 16) & 0xffff); 1753 if (imm_h[0] == 0) { 1754 movzw(dst, imm_h[1], 16); 1755 } else if (imm_h[0] == 0xffff) { 1756 movnw(dst, imm_h[1] ^ 0xffff, 16); 1757 } else if (imm_h[1] == 0) { 1758 movzw(dst, imm_h[0], 0); 1759 } else if (imm_h[1] == 0xffff) { 1760 movnw(dst, imm_h[0] ^ 0xffff, 0); 1761 } else { 1762 // use a MOVZ and MOVK (makes it easier to debug) 1763 movzw(dst, imm_h[0], 0); 1764 movkw(dst, imm_h[1], 16); 1765 } 1766 } 1767 } 1768 1769 // Form an address from base + offset in Rd. Rd may or may 1770 // not actually be used: you must use the Address that is returned. 1771 // It is up to you to ensure that the shift provided matches the size 1772 // of your data. 
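//
// Example (the offset is arbitrary): for an 8-byte access (shift == 3) at
// base + 0x42008, the scaled 12-bit immediate cannot encode the offset,
// but the two-12-bit-offsets path below can split it as
//
//   add Rd, base, #0x40000       // high part, a valid add/sub immediate
//   // ... then use Address(Rd, 0x2008)
//
// avoiding the three-instruction mov-and-add fallback.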
Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
  if (Address::offset_ok_for_immed(byte_offset, shift))
    // It fits; no need for any heroics
    return Address(base, byte_offset);

  // Don't do anything clever with negative or misaligned offsets
  unsigned mask = (1 << shift) - 1;
  if (byte_offset < 0 || byte_offset & mask) {
    mov(Rd, byte_offset);
    add(Rd, base, Rd);
    return Address(Rd);
  }

  // See if we can do this with two 12-bit offsets
  {
    unsigned long word_offset = byte_offset >> shift;
    unsigned long masked_offset = word_offset & 0xfff000;
    if (Address::offset_ok_for_immed(word_offset - masked_offset)
        && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
      add(Rd, base, masked_offset << shift);
      word_offset -= masked_offset;
      return Address(Rd, word_offset << shift);
    }
  }

  // Do it the hard way
  mov(Rd, byte_offset);
  add(Rd, base, Rd);
  return Address(Rd);
}

void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
  if (UseLSE) {
    mov(tmp, 1);
    ldadd(Assembler::word, tmp, zr, counter_addr);
    return;
  }
  Label retry_load;
  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
    prfm(Address(counter_addr), PSTL1STRM);
  bind(retry_load);
  // flush and load exclusive from the memory location
  ldxrw(tmp, counter_addr);
  addw(tmp, tmp, 1);
  // if we store+flush with no intervening write tmp will be zero
  stxrw(tmp2, tmp, counter_addr);
  cbnzw(tmp2, retry_load);
}


int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
                                    bool want_remainder, Register scratch)
{
  // Full implementation of Java idiv and irem.  The function
  // returns the (pc) offset of the div instruction - may be needed
  // for implicit exceptions.
  //
  // constraint : ra/rb =/= scratch
  //         normal case
  //
  // input : ra: dividend
  //         rb: divisor
  //
  // result: either
  //         quotient  (= ra idiv rb)
  //         remainder (= ra irem rb)

  assert(ra != scratch && rb != scratch, "reg cannot be scratch");

  int idivl_offset = offset();
  if (! want_remainder) {
    sdivw(result, ra, rb);
  } else {
    sdivw(scratch, ra, rb);
    Assembler::msubw(result, scratch, rb, ra);
  }

  return idivl_offset;
}

int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
                                    bool want_remainder, Register scratch)
{
  // Full implementation of Java ldiv and lrem.  The function
  // returns the (pc) offset of the div instruction - may be needed
  // for implicit exceptions.
  //
  // constraint : ra/rb =/= scratch
  //         normal case
  //
  // input : ra: dividend
  //         rb: divisor
  //
  // result: either
  //         quotient  (= ra idiv rb)
  //         remainder (= ra irem rb)

  assert(ra != scratch && rb != scratch, "reg cannot be scratch");

  int idivq_offset = offset();
  if (!
want_remainder) { 1874 sdiv(result, ra, rb); 1875 } else { 1876 sdiv(scratch, ra, rb); 1877 Assembler::msub(result, scratch, rb, ra); 1878 } 1879 1880 return idivq_offset; 1881 } 1882 1883 void MacroAssembler::membar(Membar_mask_bits order_constraint) { 1884 address prev = pc() - NativeMembar::instruction_size; 1885 address last = code()->last_insn(); 1886 if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) { 1887 NativeMembar *bar = NativeMembar_at(prev); 1888 // We are merging two memory barrier instructions. On AArch64 we 1889 // can do this simply by ORing them together. 1890 bar->set_kind(bar->get_kind() | order_constraint); 1891 BLOCK_COMMENT("merged membar"); 1892 } else { 1893 code()->set_last_insn(pc()); 1894 dmb(Assembler::barrier(order_constraint)); 1895 } 1896 } 1897 1898 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) { 1899 if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) { 1900 merge_ldst(rt, adr, size_in_bytes, is_store); 1901 code()->clear_last_insn(); 1902 return true; 1903 } else { 1904 assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported."); 1905 const unsigned mask = size_in_bytes - 1; 1906 if (adr.getMode() == Address::base_plus_offset && 1907 (adr.offset() & mask) == 0) { // only supports base_plus_offset. 1908 code()->set_last_insn(pc()); 1909 } 1910 return false; 1911 } 1912 } 1913 1914 void MacroAssembler::ldr(Register Rx, const Address &adr) { 1915 // We always try to merge two adjacent loads into one ldp. 1916 if (!try_merge_ldst(Rx, adr, 8, false)) { 1917 Assembler::ldr(Rx, adr); 1918 } 1919 } 1920 1921 void MacroAssembler::ldrw(Register Rw, const Address &adr) { 1922 // We always try to merge two adjacent loads into one ldp. 1923 if (!try_merge_ldst(Rw, adr, 4, false)) { 1924 Assembler::ldrw(Rw, adr); 1925 } 1926 } 1927 1928 void MacroAssembler::str(Register Rx, const Address &adr) { 1929 // We always try to merge two adjacent stores into one stp. 1930 if (!try_merge_ldst(Rx, adr, 8, true)) { 1931 Assembler::str(Rx, adr); 1932 } 1933 } 1934 1935 void MacroAssembler::strw(Register Rw, const Address &adr) { 1936 // We always try to merge two adjacent stores into one stp. 1937 if (!try_merge_ldst(Rw, adr, 4, true)) { 1938 Assembler::strw(Rw, adr); 1939 } 1940 } 1941 1942 // MacroAssembler routines found actually to be needed 1943 1944 void MacroAssembler::push(Register src) 1945 { 1946 str(src, Address(pre(esp, -1 * wordSize))); 1947 } 1948 1949 void MacroAssembler::pop(Register dst) 1950 { 1951 ldr(dst, Address(post(esp, 1 * wordSize))); 1952 } 1953 1954 // Note: load_unsigned_short used to be called load_unsigned_word. 
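// Each of the loads below returns the code-buffer offset of the load
// instruction itself, so a caller can associate that instruction with,
// for example, an implicit null check.  Hypothetical usage sketch:
//
//   int off = __ load_unsigned_short(r2, Address(robj, field_offset));
//   // ... record 'off' for the corresponding implicit-exception entry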
1955 int MacroAssembler::load_unsigned_short(Register dst, Address src) { 1956 int off = offset(); 1957 ldrh(dst, src); 1958 return off; 1959 } 1960 1961 int MacroAssembler::load_unsigned_byte(Register dst, Address src) { 1962 int off = offset(); 1963 ldrb(dst, src); 1964 return off; 1965 } 1966 1967 int MacroAssembler::load_signed_short(Register dst, Address src) { 1968 int off = offset(); 1969 ldrsh(dst, src); 1970 return off; 1971 } 1972 1973 int MacroAssembler::load_signed_byte(Register dst, Address src) { 1974 int off = offset(); 1975 ldrsb(dst, src); 1976 return off; 1977 } 1978 1979 int MacroAssembler::load_signed_short32(Register dst, Address src) { 1980 int off = offset(); 1981 ldrshw(dst, src); 1982 return off; 1983 } 1984 1985 int MacroAssembler::load_signed_byte32(Register dst, Address src) { 1986 int off = offset(); 1987 ldrsbw(dst, src); 1988 return off; 1989 } 1990 1991 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) { 1992 switch (size_in_bytes) { 1993 case 8: ldr(dst, src); break; 1994 case 4: ldrw(dst, src); break; 1995 case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break; 1996 case 1: is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break; 1997 default: ShouldNotReachHere(); 1998 } 1999 } 2000 2001 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) { 2002 switch (size_in_bytes) { 2003 case 8: str(src, dst); break; 2004 case 4: strw(src, dst); break; 2005 case 2: strh(src, dst); break; 2006 case 1: strb(src, dst); break; 2007 default: ShouldNotReachHere(); 2008 } 2009 } 2010 2011 void MacroAssembler::decrementw(Register reg, int value) 2012 { 2013 if (value < 0) { incrementw(reg, -value); return; } 2014 if (value == 0) { return; } 2015 if (value < (1 << 12)) { subw(reg, reg, value); return; } 2016 /* else */ { 2017 guarantee(reg != rscratch2, "invalid dst for register decrement"); 2018 movw(rscratch2, (unsigned)value); 2019 subw(reg, reg, rscratch2); 2020 } 2021 } 2022 2023 void MacroAssembler::decrement(Register reg, int value) 2024 { 2025 if (value < 0) { increment(reg, -value); return; } 2026 if (value == 0) { return; } 2027 if (value < (1 << 12)) { sub(reg, reg, value); return; } 2028 /* else */ { 2029 assert(reg != rscratch2, "invalid dst for register decrement"); 2030 mov(rscratch2, (unsigned long)value); 2031 sub(reg, reg, rscratch2); 2032 } 2033 } 2034 2035 void MacroAssembler::decrementw(Address dst, int value) 2036 { 2037 assert(!dst.uses(rscratch1), "invalid dst for address decrement"); 2038 if (dst.getMode() == Address::literal) { 2039 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 2040 lea(rscratch2, dst); 2041 dst = Address(rscratch2); 2042 } 2043 ldrw(rscratch1, dst); 2044 decrementw(rscratch1, value); 2045 strw(rscratch1, dst); 2046 } 2047 2048 void MacroAssembler::decrement(Address dst, int value) 2049 { 2050 assert(!dst.uses(rscratch1), "invalid address for decrement"); 2051 if (dst.getMode() == Address::literal) { 2052 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 2053 lea(rscratch2, dst); 2054 dst = Address(rscratch2); 2055 } 2056 ldr(rscratch1, dst); 2057 decrement(rscratch1, value); 2058 str(rscratch1, dst); 2059 } 2060 2061 void MacroAssembler::incrementw(Register reg, int value) 2062 { 2063 if (value < 0) { decrementw(reg, -value); return; } 2064 if (value == 0) { return; } 2065 if (value < (1 << 12)) { 
addw(reg, reg, value); return; }
  /* else */ {
    assert(reg != rscratch2, "invalid dst for register increment");
    movw(rscratch2, (unsigned)value);
    addw(reg, reg, rscratch2);
  }
}

void MacroAssembler::increment(Register reg, int value)
{
  if (value < 0)  { decrement(reg, -value);      return; }
  if (value == 0) {                              return; }
  if (value < (1 << 12)) { add(reg, reg, value); return; }
  /* else */ {
    assert(reg != rscratch2, "invalid dst for register increment");
    movw(rscratch2, (unsigned)value);
    add(reg, reg, rscratch2);
  }
}

void MacroAssembler::incrementw(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid dst for address increment");
  if (dst.getMode() == Address::literal) {
    assert(abs(value) < (1 << 12), "invalid value and address mode combination");
    lea(rscratch2, dst);
    dst = Address(rscratch2);
  }
  ldrw(rscratch1, dst);
  incrementw(rscratch1, value);
  strw(rscratch1, dst);
}

void MacroAssembler::increment(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid dst for address increment");
  if (dst.getMode() == Address::literal) {
    assert(abs(value) < (1 << 12), "invalid value and address mode combination");
    lea(rscratch2, dst);
    dst = Address(rscratch2);
  }
  ldr(rscratch1, dst);
  increment(rscratch1, value);
  str(rscratch1, dst);
}


void MacroAssembler::pusha() {
  push(0x7fffffff, sp);
}

void MacroAssembler::popa() {
  pop(0x7fffffff, sp);
}

// Push lots of registers in the bit set supplied.  Don't push sp.
// Return the number of words pushed.
int MacroAssembler::push(unsigned int bitset, Register stack) {
  int words_pushed = 0;

  // Scan bitset to accumulate register pairs
  unsigned char regs[32];
  int count = 0;
  for (int reg = 0; reg <= 30; reg++) {
    if (1 & bitset)
      regs[count++] = reg;
    bitset >>= 1;
  }
  regs[count++] = zr->encoding_nocheck();
  count &= ~1;  // Only push an even number of regs

  if (count) {
    stp(as_Register(regs[0]), as_Register(regs[1]),
        Address(pre(stack, -count * wordSize)));
    words_pushed += 2;
  }
  for (int i = 2; i < count; i += 2) {
    stp(as_Register(regs[i]), as_Register(regs[i+1]),
        Address(stack, i * wordSize));
    words_pushed += 2;
  }

  assert(words_pushed == count, "oops, pushed != count");

  return count;
}

int MacroAssembler::pop(unsigned int bitset, Register stack) {
  int words_pushed = 0;

  // Scan bitset to accumulate register pairs
  unsigned char regs[32];
  int count = 0;
  for (int reg = 0; reg <= 30; reg++) {
    if (1 & bitset)
      regs[count++] = reg;
    bitset >>= 1;
  }
  regs[count++] = zr->encoding_nocheck();
  count &= ~1;

  for (int i = 2; i < count; i += 2) {
    ldp(as_Register(regs[i]), as_Register(regs[i+1]),
        Address(stack, i * wordSize));
    words_pushed += 2;
  }
  if (count) {
    ldp(as_Register(regs[0]), as_Register(regs[1]),
        Address(post(stack, count * wordSize)));
    words_pushed += 2;
  }

  assert(words_pushed == count, "oops, pushed != count");

  return count;
}

#ifdef ASSERT
void MacroAssembler::verify_heapbase(const char* msg) {
#if 0
  assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  if (CheckCompressedOops) {
    Label ok;
    push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
    cmpptr(rheapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
    br(Assembler::EQ, ok);
    stop(msg);
    bind(ok);
    pop(1 << rscratch1->encoding(), sp);
  }
#endif
}
#endif

void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) {
  Label done, not_weak;
  cbz(value, done);           // Use NULL as-is.

  STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
  tbz(value, 0, not_weak);    // Test for jweak tag.

  // Resolve jweak.
  access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
                 Address(value, -JNIHandles::weak_tag_value), tmp, thread);
  verify_oop(value);
  b(done);

  bind(not_weak);
  // Resolve (untagged) jobject.
  access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
  verify_oop(value);
  bind(done);
}

void MacroAssembler::stop(const char* msg) {
  address ip = pc();
  pusha();
  mov(c_rarg0, (address)msg);
  mov(c_rarg1, (address)ip);
  mov(c_rarg2, sp);
  mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
  blr(c_rarg3);
  hlt(0);
}

void MacroAssembler::warn(const char* msg) {
  pusha();
  mov(c_rarg0, (address)msg);
  mov(lr, CAST_FROM_FN_PTR(address, warning));
  blr(lr);
  popa();
}

void MacroAssembler::unimplemented(const char* what) {
  const char* buf = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("unimplemented: %s", what);
    buf = code_string(ss.as_string());
  }
  stop(buf);
}

// If a constant does not fit in an immediate field, generate some
// number of MOV instructions and then perform the operation.
void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
                                           add_sub_imm_insn insn1,
                                           add_sub_reg_insn insn2) {
  assert(Rd != zr, "Rd = zr and not setting flags?");
  if (operand_valid_for_add_sub_immediate((int)imm)) {
    (this->*insn1)(Rd, Rn, imm);
  } else {
    if (uabs(imm) < (1 << 24)) {
      (this->*insn1)(Rd, Rn, imm & -(1 << 12));
      (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
    } else {
      assert_different_registers(Rd, Rn);
      mov(Rd, (uint64_t)imm);
      (this->*insn2)(Rd, Rn, Rd, LSL, 0);
    }
  }
}

// Separate version which sets the flags. Optimisations are more restricted
// because we must set the flags correctly.
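//
// Sketch of the difference (immediate chosen for illustration): for
// imm == 0x123456 the non-flag-setting wrapper above may legally split
//
//   add Rd, Rn, #0x123000
//   add Rd, Rd, #0x456
//
// but then the flags would describe only the second addition, so here
// we must materialize the whole constant and apply the operation once:
//
//   mov Rd, #0x123456
//   adds Rd, Rn, Rd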
void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
                                             add_sub_imm_insn insn1,
                                             add_sub_reg_insn insn2) {
  if (operand_valid_for_add_sub_immediate((int)imm)) {
    (this->*insn1)(Rd, Rn, imm);
  } else {
    assert_different_registers(Rd, Rn);
    assert(Rd != zr, "overflow in immediate operand");
    mov(Rd, (uint64_t)imm);
    (this->*insn2)(Rd, Rn, Rd, LSL, 0);
  }
}


void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
  if (increment.is_register()) {
    add(Rd, Rn, increment.as_register());
  } else {
    add(Rd, Rn, increment.as_constant());
  }
}

void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
  if (increment.is_register()) {
    addw(Rd, Rn, increment.as_register());
  } else {
    addw(Rd, Rn, increment.as_constant());
  }
}

void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
  if (decrement.is_register()) {
    sub(Rd, Rn, decrement.as_register());
  } else {
    sub(Rd, Rn, decrement.as_constant());
  }
}

void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
  if (decrement.is_register()) {
    subw(Rd, Rn, decrement.as_register());
  } else {
    subw(Rd, Rn, decrement.as_constant());
  }
}

void MacroAssembler::reinit_heapbase()
{
  if (UseCompressedOops) {
    if (Universe::is_fully_initialized()) {
      mov(rheapbase, CompressedOops::ptrs_base());
    } else {
      lea(rheapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
      ldr(rheapbase, Address(rheapbase));
    }
  }
}

// this simulates the behaviour of the x86 cmpxchg instruction using a
// load linked/store conditional pair. we use the acquire/release
// versions of these instructions so that we flush pending writes as
// per Java semantics.

// n.b. the x86 version assumes the old value to be compared against is
// in rax and updates rax with the value located in memory if the
// cmpxchg fails. we supply a register for the old value explicitly.

// the aarch64 load linked/store conditional instructions do not
// accept an offset. so, unlike x86, we must provide a plain register
// to identify the memory word to be compared/exchanged rather than a
// register+offset Address.

void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
                                Label &succeed, Label *fail) {
  // oldv holds comparison value
  // newv holds value to write in exchange
  // addr identifies memory word to compare against/update
  if (UseLSE) {
    mov(tmp, oldv);
    casal(Assembler::xword, oldv, newv, addr);
    cmp(tmp, oldv);
    br(Assembler::EQ, succeed);
    membar(AnyAny);
  } else {
    Label retry_load, nope;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    // flush and load exclusive from the memory location
    // and fail if it is not what we expect
    ldaxr(tmp, addr);
    cmp(tmp, oldv);
    br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
    stlxr(tmp, newv, addr);
    cbzw(tmp, succeed);
    // retry so we only ever return after a load fails to compare
    // ensures we don't return a stale value after a failed write.
    b(retry_load);
    // if the memory word differs we return it in oldv and signal a fail
    bind(nope);
    membar(AnyAny);
    mov(oldv, tmp);
  }
  if (fail)
    b(*fail);
}

void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
                                        Label &succeed, Label *fail) {
  assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
  cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
}

void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
                              Label &succeed, Label *fail) {
  // oldv holds comparison value
  // newv holds value to write in exchange
  // addr identifies memory word to compare against/update
  // tmp returns 0/1 for success/failure
  if (UseLSE) {
    mov(tmp, oldv);
    casal(Assembler::word, oldv, newv, addr);
    cmp(tmp, oldv);
    br(Assembler::EQ, succeed);
    membar(AnyAny);
  } else {
    Label retry_load, nope;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    // flush and load exclusive from the memory location
    // and fail if it is not what we expect
    ldaxrw(tmp, addr);
    cmp(tmp, oldv);
    br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
    stlxrw(tmp, newv, addr);
    cbzw(tmp, succeed);
    // retry so we only ever return after a load fails to compare
    // ensures we don't return a stale value after a failed write.
    b(retry_load);
    // if the memory word differs we return it in oldv and signal a fail
    bind(nope);
    membar(AnyAny);
    mov(oldv, tmp);
  }
  if (fail)
    b(*fail);
}

// A generic CAS; success or failure is in the EQ flag.  A weak CAS
// doesn't retry and may fail spuriously.  If the oldval is wanted,
// pass a register for the result, otherwise pass noreg.

// Clobbers rscratch1
void MacroAssembler::cmpxchg(Register addr, Register expected,
                             Register new_val,
                             enum operand_size size,
                             bool acquire, bool release,
                             bool weak,
                             Register result) {
  if (result == noreg)  result = rscratch1;
  BLOCK_COMMENT("cmpxchg {");
  if (UseLSE) {
    mov(result, expected);
    lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
    compare_eq(result, expected, size);
  } else {
    Label retry_load, done;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    load_exclusive(result, addr, size, acquire);
    compare_eq(result, expected, size);
    br(Assembler::NE, done);
    store_exclusive(rscratch1, new_val, addr, size, release);
    if (weak) {
      cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
    } else {
      cbnzw(rscratch1, retry_load);
    }
    bind(done);
  }
  BLOCK_COMMENT("} cmpxchg");
}

// A generic comparison. Only compares for equality, clobbers rscratch1.
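//
// There is no 16- or 8-bit compare instruction, so for sub-word sizes
// equality is derived by XOR-ing the operands and testing the relevant
// low bits, e.g. for halfword:
//
//   eorw rscratch1, rm, rn
//   ands zr, rscratch1, #0xffff   // EQ is set iff the low 16 bits match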
2459 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) { 2460 if (size == xword) { 2461 cmp(rm, rn); 2462 } else if (size == word) { 2463 cmpw(rm, rn); 2464 } else if (size == halfword) { 2465 eorw(rscratch1, rm, rn); 2466 ands(zr, rscratch1, 0xffff); 2467 } else if (size == byte) { 2468 eorw(rscratch1, rm, rn); 2469 ands(zr, rscratch1, 0xff); 2470 } else { 2471 ShouldNotReachHere(); 2472 } 2473 } 2474 2475 2476 static bool different(Register a, RegisterOrConstant b, Register c) { 2477 if (b.is_constant()) 2478 return a != c; 2479 else 2480 return a != b.as_register() && a != c && b.as_register() != c; 2481 } 2482 2483 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz) \ 2484 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \ 2485 if (UseLSE) { \ 2486 prev = prev->is_valid() ? prev : zr; \ 2487 if (incr.is_register()) { \ 2488 AOP(sz, incr.as_register(), prev, addr); \ 2489 } else { \ 2490 mov(rscratch2, incr.as_constant()); \ 2491 AOP(sz, rscratch2, prev, addr); \ 2492 } \ 2493 return; \ 2494 } \ 2495 Register result = rscratch2; \ 2496 if (prev->is_valid()) \ 2497 result = different(prev, incr, addr) ? prev : rscratch2; \ 2498 \ 2499 Label retry_load; \ 2500 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) \ 2501 prfm(Address(addr), PSTL1STRM); \ 2502 bind(retry_load); \ 2503 LDXR(result, addr); \ 2504 OP(rscratch1, result, incr); \ 2505 STXR(rscratch2, rscratch1, addr); \ 2506 cbnzw(rscratch2, retry_load); \ 2507 if (prev->is_valid() && prev != result) { \ 2508 IOP(prev, rscratch1, incr); \ 2509 } \ 2510 } 2511 2512 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword) 2513 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word) 2514 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword) 2515 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word) 2516 2517 #undef ATOMIC_OP 2518 2519 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz) \ 2520 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \ 2521 if (UseLSE) { \ 2522 prev = prev->is_valid() ? prev : zr; \ 2523 AOP(sz, newv, prev, addr); \ 2524 return; \ 2525 } \ 2526 Register result = rscratch2; \ 2527 if (prev->is_valid()) \ 2528 result = different(prev, newv, addr) ? 
                                             prev : rscratch2;           \
                                                                        \
  Label retry_load;                                                     \
  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
    prfm(Address(addr), PSTL1STRM);                                     \
  bind(retry_load);                                                     \
  LDXR(result, addr);                                                   \
  STXR(rscratch1, newv, addr);                                          \
  cbnzw(rscratch1, retry_load);                                         \
  if (prev->is_valid() && prev != result)                               \
    mov(prev, result);                                                  \
}

ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)

#undef ATOMIC_XCHG

#ifndef PRODUCT
extern "C" void findpc(intptr_t x);
#endif

void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
{
  // In order to get locks to work, we need to fake an in_VM state
  if (ShowMessageBoxOnError) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
#ifndef PRODUCT
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
#endif
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      ttyLocker ttyl;
      tty->print_cr(" pc = 0x%016lx", pc);
#ifndef PRODUCT
      tty->cr();
      findpc(pc);
      tty->cr();
#endif
      tty->print_cr(" r0 = 0x%016lx", regs[0]);
      tty->print_cr(" r1 = 0x%016lx", regs[1]);
      tty->print_cr(" r2 = 0x%016lx", regs[2]);
      tty->print_cr(" r3 = 0x%016lx", regs[3]);
      tty->print_cr(" r4 = 0x%016lx", regs[4]);
      tty->print_cr(" r5 = 0x%016lx", regs[5]);
      tty->print_cr(" r6 = 0x%016lx", regs[6]);
      tty->print_cr(" r7 = 0x%016lx", regs[7]);
      tty->print_cr(" r8 = 0x%016lx", regs[8]);
      tty->print_cr(" r9 = 0x%016lx", regs[9]);
      tty->print_cr("r10 = 0x%016lx", regs[10]);
      tty->print_cr("r11 = 0x%016lx", regs[11]);
      tty->print_cr("r12 = 0x%016lx", regs[12]);
      tty->print_cr("r13 = 0x%016lx", regs[13]);
      tty->print_cr("r14 = 0x%016lx", regs[14]);
      tty->print_cr("r15 = 0x%016lx", regs[15]);
      tty->print_cr("r16 = 0x%016lx", regs[16]);
      tty->print_cr("r17 = 0x%016lx", regs[17]);
      tty->print_cr("r18 = 0x%016lx", regs[18]);
      tty->print_cr("r19 = 0x%016lx", regs[19]);
      tty->print_cr("r20 = 0x%016lx", regs[20]);
      tty->print_cr("r21 = 0x%016lx", regs[21]);
      tty->print_cr("r22 = 0x%016lx", regs[22]);
      tty->print_cr("r23 = 0x%016lx", regs[23]);
      tty->print_cr("r24 = 0x%016lx", regs[24]);
      tty->print_cr("r25 = 0x%016lx", regs[25]);
      tty->print_cr("r26 = 0x%016lx", regs[26]);
      tty->print_cr("r27 = 0x%016lx", regs[27]);
      tty->print_cr("r28 = 0x%016lx", regs[28]);
      tty->print_cr("r30 = 0x%016lx", regs[30]);
      tty->print_cr("r31 = 0x%016lx", regs[31]);
      BREAKPOINT;
    }
  }
  fatal("DEBUG MESSAGE: %s", msg);
}

void MacroAssembler::push_call_clobbered_registers() {
  int step = 4 * wordSize;
  push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
  sub(sp, sp, step);
  mov(rscratch1, -step);
  // Push v0-v7, v16-v31.
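  // (v8-v15 are callee-saved on AArch64 and are therefore skipped by the
  // loop below; each st1 stores four 1-D registers at sp and then bumps sp
  // by the negative step in rscratch1 via post-indexed addressing.)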
  for (int i = 31; i >= 4; i -= 4) {
    if (i <= v7->encoding() || i >= v16->encoding())
      st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
          as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
  }
  st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
      as_FloatRegister(3), T1D, Address(sp));
}

void MacroAssembler::pop_call_clobbered_registers() {
  for (int i = 0; i < 32; i += 4) {
    if (i <= v7->encoding() || i >= v16->encoding())
      ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
          as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
  }

  pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
}

void MacroAssembler::push_CPU_state(bool save_vectors) {
  int step = (save_vectors ? 8 : 4) * wordSize;
  push(0x3fffffff, sp);         // integer registers except lr & sp
  mov(rscratch1, -step);
  sub(sp, sp, step);
  for (int i = 28; i >= 4; i -= 4) {
    st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
        as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
  }
  st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
}

void MacroAssembler::pop_CPU_state(bool restore_vectors) {
  int step = (restore_vectors ? 8 : 4) * wordSize;
  for (int i = 0; i <= 28; i += 4)
    ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
        as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
  pop(0x3fffffff, sp);          // integer registers except lr & sp
}

/**
 * Helpers for multiply_to_len().
 */
void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
                                     Register src1, Register src2) {
  adds(dest_lo, dest_lo, src1);
  adc(dest_hi, dest_hi, zr);
  adds(dest_lo, dest_lo, src2);
  adc(final_dest_hi, dest_hi, zr);
}

// Generate an address from (r + r1 extend offset).  "size" is the
// size of the operand.  The result may be in rscratch2.
Address MacroAssembler::offsetted_address(Register r, Register r1,
                                          Address::extend ext, int offset, int size) {
  if (offset || (ext.shift() % size != 0)) {
    lea(rscratch2, Address(r, r1, ext));
    return Address(rscratch2, offset);
  } else {
    return Address(r, r1, ext);
  }
}

Address MacroAssembler::spill_address(int size, int offset, Register tmp)
{
  assert(offset >= 0, "spill to negative address?");
  // Offset reachable?
  //   Not aligned - 9 bits signed offset
  //   Aligned - 12 bits unsigned offset shifted
  Register base = sp;
  if ((offset & (size-1)) && offset >= (1<<8)) {
    add(tmp, base, offset & ((1<<12)-1));
    base = tmp;
    offset &= -1u<<12;
  }

  if (offset >= (1<<12) * size) {
    add(tmp, base, offset & (((1<<12)-1)<<12));
    base = tmp;
    offset &= ~(((1<<12)-1)<<12);
  }

  return Address(base, offset);
}

// Checks whether offset is aligned.
// Returns true if it is, else false.
bool MacroAssembler::merge_alignment_check(Register base,
                                           size_t size,
                                           long cur_offset,
                                           long prev_offset) const {
  if (AvoidUnalignedAccesses) {
    if (base == sp) {
      // Checks whether the low offset is aligned to a pair of registers.
      long pair_mask = size * 2 - 1;
      long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
      return (offset & pair_mask) == 0;
    } else { // If base is not sp, we can't guarantee the access is aligned.
      return false;
    }
  } else {
    long mask = size - 1;
    // Load/store pair instructions only support element-size-aligned offsets.
    return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
  }
}

// Checks whether current and previous loads/stores can be merged.
// Returns true if it can be merged, else false.
bool MacroAssembler::ldst_can_merge(Register rt,
                                    const Address &adr,
                                    size_t cur_size_in_bytes,
                                    bool is_store) const {
  address prev = pc() - NativeInstruction::instruction_size;
  address last = code()->last_insn();

  if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
    return false;
  }

  if (adr.getMode() != Address::base_plus_offset || prev != last) {
    return false;
  }

  NativeLdSt* prev_ldst = NativeLdSt_at(prev);
  size_t prev_size_in_bytes = prev_ldst->size_in_bytes();

  assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
  assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");

  if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
    return false;
  }

  long max_offset = 63 * prev_size_in_bytes;
  long min_offset = -64 * prev_size_in_bytes;

  assert(prev_ldst->is_not_pre_post_index(), "pre-index or post-index is not supported to be merged.");

  // Only same base can be merged.
  if (adr.base() != prev_ldst->base()) {
    return false;
  }

  long cur_offset = adr.offset();
  long prev_offset = prev_ldst->offset();
  size_t diff = abs(cur_offset - prev_offset);
  if (diff != prev_size_in_bytes) {
    return false;
  }

  // The following cases cannot be merged:
  //   ldr x2, [x2, #8]
  //   ldr x3, [x2, #16]
  // or:
  //   ldr x2, [x3, #8]
  //   ldr x2, [x3, #16]
  // If t1 and t2 are the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
  if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
    return false;
  }

  long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
  // Offset range must be in ldp/stp instruction's range.
  if (low_offset > max_offset || low_offset < min_offset) {
    return false;
  }

  if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
    return true;
  }

  return false;
}

// Merge current load/store with previous load/store into ldp/stp.
void MacroAssembler::merge_ldst(Register rt,
                                const Address &adr,
                                size_t cur_size_in_bytes,
                                bool is_store) {

  assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store) == true, "cur and prev must be able to be merged.");

  Register rt_low, rt_high;
  address prev = pc() - NativeInstruction::instruction_size;
  NativeLdSt* prev_ldst = NativeLdSt_at(prev);

  long offset;

  if (adr.offset() < prev_ldst->offset()) {
    offset = adr.offset();
    rt_low = rt;
    rt_high = prev_ldst->target();
  } else {
    offset = prev_ldst->offset();
    rt_low = prev_ldst->target();
    rt_high = rt;
  }

  Address adr_p = Address(prev_ldst->base(), offset);
  // Overwrite previously generated binary.
  code_section()->set_end(prev);

  const int sz = prev_ldst->size_in_bytes();
  assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
  if (!is_store) {
    BLOCK_COMMENT("merged ldr pair");
    if (sz == 8) {
      ldp(rt_low, rt_high, adr_p);
    } else {
      ldpw(rt_low, rt_high, adr_p);
    }
  } else {
    BLOCK_COMMENT("merged str pair");
    if (sz == 8) {
      stp(rt_low, rt_high, adr_p);
    } else {
      stpw(rt_low, rt_high, adr_p);
    }
  }
}

/**
 * Multiply 64 bit by 64 bit first loop.
 */
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
                                           Register y, Register y_idx, Register z,
                                           Register carry, Register product,
                                           Register idx, Register kdx) {
  //
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;
  //

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_one_x);

  lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
  ldr(x_xstart, Address(rscratch1));
  ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian

  bind(L_first_loop);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_first_loop_exit);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_one_y);
  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(y_idx, Address(rscratch1));
  ror(y_idx, y_idx, 32); // convert big-endian to little-endian
  bind(L_multiply);

  // AArch64 has a multiply-accumulate instruction that we can't use
  // here because it has no way to process carries, so we have to use
  // separate add and adc instructions.  Bah.
  umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
  mul(product, x_xstart, y_idx);
  adds(product, product, carry);
  adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product

  subw(kdx, kdx, 2);
  ror(product, product, 32); // back to big-endian
  str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));

  b(L_first_loop);

  bind(L_one_y);
  ldrw(y_idx, Address(y, 0));
  b(L_multiply);

  bind(L_one_x);
  ldrw(x_xstart, Address(x, 0));
  b(L_first_loop);

  bind(L_first_loop_exit);
}

/**
 * Multiply 128 bit by 128 bit. Unrolled inner loop.
2901 * 2902 */ 2903 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z, 2904 Register carry, Register carry2, 2905 Register idx, Register jdx, 2906 Register yz_idx1, Register yz_idx2, 2907 Register tmp, Register tmp3, Register tmp4, 2908 Register tmp6, Register product_hi) { 2909 2910 // jlong carry, x[], y[], z[]; 2911 // int kdx = ystart+1; 2912 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 2913 // huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry; 2914 // jlong carry2 = (jlong)(tmp3 >>> 64); 2915 // huge_128 tmp4 = (y[idx] * product_hi) + z[kdx+idx] + carry2; 2916 // carry = (jlong)(tmp4 >>> 64); 2917 // z[kdx+idx+1] = (jlong)tmp3; 2918 // z[kdx+idx] = (jlong)tmp4; 2919 // } 2920 // idx += 2; 2921 // if (idx > 0) { 2922 // yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry; 2923 // z[kdx+idx] = (jlong)yz_idx1; 2924 // carry = (jlong)(yz_idx1 >>> 64); 2925 // } 2926 // 2927 2928 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 2929 2930 lsrw(jdx, idx, 2); 2931 2932 bind(L_third_loop); 2933 2934 subsw(jdx, jdx, 1); 2935 br(Assembler::MI, L_third_loop_exit); 2936 subw(idx, idx, 4); 2937 2938 lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt))); 2939 2940 ldp(yz_idx2, yz_idx1, Address(rscratch1, 0)); 2941 2942 lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt))); 2943 2944 ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian 2945 ror(yz_idx2, yz_idx2, 32); 2946 2947 ldp(rscratch2, rscratch1, Address(tmp6, 0)); 2948 2949 mul(tmp3, product_hi, yz_idx1); // yz_idx1 * product_hi -> tmp4:tmp3 2950 umulh(tmp4, product_hi, yz_idx1); 2951 2952 ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian 2953 ror(rscratch2, rscratch2, 32); 2954 2955 mul(tmp, product_hi, yz_idx2); // yz_idx2 * product_hi -> carry2:tmp 2956 umulh(carry2, product_hi, yz_idx2); 2957 2958 // propagate sum of both multiplications into carry:tmp4:tmp3 2959 adds(tmp3, tmp3, carry); 2960 adc(tmp4, tmp4, zr); 2961 adds(tmp3, tmp3, rscratch1); 2962 adcs(tmp4, tmp4, tmp); 2963 adc(carry, carry2, zr); 2964 adds(tmp4, tmp4, rscratch2); 2965 adc(carry, carry, zr); 2966 2967 ror(tmp3, tmp3, 32); // convert little-endian to big-endian 2968 ror(tmp4, tmp4, 32); 2969 stp(tmp4, tmp3, Address(tmp6, 0)); 2970 2971 b(L_third_loop); 2972 bind (L_third_loop_exit); 2973 2974 andw (idx, idx, 0x3); 2975 cbz(idx, L_post_third_loop_done); 2976 2977 Label L_check_1; 2978 subsw(idx, idx, 2); 2979 br(Assembler::MI, L_check_1); 2980 2981 lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt))); 2982 ldr(yz_idx1, Address(rscratch1, 0)); 2983 ror(yz_idx1, yz_idx1, 32); 2984 mul(tmp3, product_hi, yz_idx1); // yz_idx1 * product_hi -> tmp4:tmp3 2985 umulh(tmp4, product_hi, yz_idx1); 2986 lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt))); 2987 ldr(yz_idx2, Address(rscratch1, 0)); 2988 ror(yz_idx2, yz_idx2, 32); 2989 2990 add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2); 2991 2992 ror(tmp3, tmp3, 32); 2993 str(tmp3, Address(rscratch1, 0)); 2994 2995 bind (L_check_1); 2996 2997 andw (idx, idx, 0x1); 2998 subsw(idx, idx, 1); 2999 br(Assembler::MI, L_post_third_loop_done); 3000 ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt))); 3001 mul(tmp3, tmp4, product_hi); // tmp4 * product_hi -> carry2:tmp3 3002 umulh(carry2, tmp4, product_hi); 3003 ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt))); 3004 3005 add2_with_carry(carry2, tmp3, tmp4, carry); 3006 3007 strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt))); 3008 
  extr(carry, carry2, tmp3, 32);

  bind(L_post_third_loop_done);
}

/**
 * Code for BigInteger::multiplyToLen() intrinsic.
 *
 * r0: x
 * r1: xlen
 * r2: y
 * r3: ylen
 * r4: z
 * r5: zlen
 * r10: tmp1
 * r11: tmp2
 * r12: tmp3
 * r13: tmp4
 * r14: tmp5
 * r15: tmp6
 * r16: tmp7
 *
 */
void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6, Register product_hi) {

  assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product  = xlen;
  const Register x_xstart = zlen;  // reuse register

  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;
  //

  movw(idx, ylen);      // idx = ylen;
  movw(kdx, zlen);      // kdx = xlen+ylen;
  mov(carry, zr);       // carry = 0;

  Label L_done;

  movw(xstart, xlen);
  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_done);

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);

  Label L_second_loop;
  cbzw(kdx, L_second_loop);

  Label L_carry;
  subw(kdx, kdx, 1);
  cbzw(kdx, L_carry);

  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
  lsr(carry, carry, 32);
  subw(kdx, kdx, 1);

  bind(L_carry);
  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));

  // Second and third (nested) loops.
  //
  // for (int i = xstart-1; i >= 0; i--) { // Second loop
  //   carry = 0;
  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                    (z[k] & LONG_MASK) + carry;
  //     z[k] = (int)product;
  //     carry = product >>> 32;
  //   }
  //   z[i] = (int)carry;
  // }
  //
  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi

  const Register jdx = tmp1;

  bind(L_second_loop);
  mov(carry, zr);                // carry = 0;
  movw(jdx, ylen);               // j = ystart+1

  subsw(xstart, xstart, 1);      // i = xstart-1;
  br(Assembler::MI, L_done);

  str(z, Address(pre(sp, -4 * wordSize)));

  Label L_last_x;
  lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
  subsw(xstart, xstart, 1);      // i = xstart-1;
  br(Assembler::MI, L_last_x);

  lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
  ldr(product_hi, Address(rscratch1));
  ror(product_hi, product_hi, 32);  // convert big-endian to little-endian

  Label L_third_loop_prologue;
  bind(L_third_loop_prologue);

  str(ylen, Address(sp, wordSize));
  stp(x, xstart, Address(sp, 2 * wordSize));
  multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
                          tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
  ldp(z, ylen, Address(post(sp, 2 * wordSize)));
  ldp(x, xlen, Address(post(sp, 2 * wordSize)));  // copy old xstart -> xlen

  addw(tmp3, xlen, 1);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  subsw(tmp3, tmp3, 1);
  br(Assembler::MI, L_done);

  lsr(carry, carry, 32);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  b(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  ldrw(product_hi, Address(x, 0));
  b(L_third_loop_prologue);

  bind(L_done);
}

// Code for BigInteger::mulAdd intrinsic
// out     = r0
// in      = r1
// offset  = r2  (already out.length-offset)
// len     = r3
// k       = r4
//
// pseudo code from java implementation:
// carry = 0;
// offset = out.length-offset - 1;
// for (int j=len-1; j >= 0; j--) {
//     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
//     out[offset--] = (int)product;
//     carry = product >>> 32;
// }
// return (int)carry;
void MacroAssembler::mul_add(Register out, Register in, Register offset,
                             Register len, Register k) {
  Label LOOP, END;
  // pre-loop
  cmp(len, zr); // cmp, not cbz/cbnz: to use condition twice => fewer branches
  csel(out, zr, out, Assembler::EQ);
  br(Assembler::EQ, END);
  add(in, in, len, LSL, 2); // in[j+1] address
  add(offset, out, offset, LSL, 2); // out[offset + 1] address
  mov(out, zr); // used to keep carry now
  BIND(LOOP);
  ldrw(rscratch1, Address(pre(in, -4)));
  madd(rscratch1, rscratch1, k, out);
  ldrw(rscratch2, Address(pre(offset, -4)));
  add(rscratch1, rscratch1, rscratch2);
  strw(rscratch1, Address(offset));
  lsr(out, rscratch1, 32);
  subs(len, len, 1);
  br(Assembler::NE, LOOP);
  BIND(END);
}

/**
 * Emits code to update CRC-32 with a byte value according to constants in table
 *
 * @param [in,out]crc   Register containing the crc.
3191 * @param [in]val Register containing the byte to fold into the CRC. 3192 * @param [in]table Register containing the table of crc constants. 3193 * 3194 * uint32_t crc; 3195 * val = crc_table[(val ^ crc) & 0xFF]; 3196 * crc = val ^ (crc >> 8); 3197 * 3198 */ 3199 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 3200 eor(val, val, crc); 3201 andr(val, val, 0xff); 3202 ldrw(val, Address(table, val, Address::lsl(2))); 3203 eor(crc, val, crc, Assembler::LSR, 8); 3204 } 3205 3206 /** 3207 * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3 3208 * 3209 * @param [in,out]crc Register containing the crc. 3210 * @param [in]v Register containing the 32-bit to fold into the CRC. 3211 * @param [in]table0 Register containing table 0 of crc constants. 3212 * @param [in]table1 Register containing table 1 of crc constants. 3213 * @param [in]table2 Register containing table 2 of crc constants. 3214 * @param [in]table3 Register containing table 3 of crc constants. 3215 * 3216 * uint32_t crc; 3217 * v = crc ^ v 3218 * crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24] 3219 * 3220 */ 3221 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp, 3222 Register table0, Register table1, Register table2, Register table3, 3223 bool upper) { 3224 eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0); 3225 uxtb(tmp, v); 3226 ldrw(crc, Address(table3, tmp, Address::lsl(2))); 3227 ubfx(tmp, v, 8, 8); 3228 ldrw(tmp, Address(table2, tmp, Address::lsl(2))); 3229 eor(crc, crc, tmp); 3230 ubfx(tmp, v, 16, 8); 3231 ldrw(tmp, Address(table1, tmp, Address::lsl(2))); 3232 eor(crc, crc, tmp); 3233 ubfx(tmp, v, 24, 8); 3234 ldrw(tmp, Address(table0, tmp, Address::lsl(2))); 3235 eor(crc, crc, tmp); 3236 } 3237 3238 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf, 3239 Register len, Register tmp0, Register tmp1, Register tmp2, 3240 Register tmp3) { 3241 Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit; 3242 assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3); 3243 3244 mvnw(crc, crc); 3245 3246 subs(len, len, 128); 3247 br(Assembler::GE, CRC_by64_pre); 3248 BIND(CRC_less64); 3249 adds(len, len, 128-32); 3250 br(Assembler::GE, CRC_by32_loop); 3251 BIND(CRC_less32); 3252 adds(len, len, 32-4); 3253 br(Assembler::GE, CRC_by4_loop); 3254 adds(len, len, 4); 3255 br(Assembler::GT, CRC_by1_loop); 3256 b(L_exit); 3257 3258 BIND(CRC_by32_loop); 3259 ldp(tmp0, tmp1, Address(post(buf, 16))); 3260 subs(len, len, 32); 3261 crc32x(crc, crc, tmp0); 3262 ldr(tmp2, Address(post(buf, 8))); 3263 crc32x(crc, crc, tmp1); 3264 ldr(tmp3, Address(post(buf, 8))); 3265 crc32x(crc, crc, tmp2); 3266 crc32x(crc, crc, tmp3); 3267 br(Assembler::GE, CRC_by32_loop); 3268 cmn(len, 32); 3269 br(Assembler::NE, CRC_less32); 3270 b(L_exit); 3271 3272 BIND(CRC_by4_loop); 3273 ldrw(tmp0, Address(post(buf, 4))); 3274 subs(len, len, 4); 3275 crc32w(crc, crc, tmp0); 3276 br(Assembler::GE, CRC_by4_loop); 3277 adds(len, len, 4); 3278 br(Assembler::LE, L_exit); 3279 BIND(CRC_by1_loop); 3280 ldrb(tmp0, Address(post(buf, 1))); 3281 subs(len, len, 1); 3282 crc32b(crc, crc, tmp0); 3283 br(Assembler::GT, CRC_by1_loop); 3284 b(L_exit); 3285 3286 BIND(CRC_by64_pre); 3287 sub(buf, buf, 8); 3288 ldp(tmp0, tmp1, Address(buf, 8)); 3289 crc32x(crc, crc, tmp0); 3290 ldr(tmp2, Address(buf, 24)); 3291 crc32x(crc, crc, tmp1); 3292 ldr(tmp3, Address(buf, 32)); 3293 crc32x(crc, crc, tmp2); 3294 
ldr(tmp0, Address(buf, 40)); 3295 crc32x(crc, crc, tmp3); 3296 ldr(tmp1, Address(buf, 48)); 3297 crc32x(crc, crc, tmp0); 3298 ldr(tmp2, Address(buf, 56)); 3299 crc32x(crc, crc, tmp1); 3300 ldr(tmp3, Address(pre(buf, 64))); 3301 3302 b(CRC_by64_loop); 3303 3304 align(CodeEntryAlignment); 3305 BIND(CRC_by64_loop); 3306 subs(len, len, 64); 3307 crc32x(crc, crc, tmp2); 3308 ldr(tmp0, Address(buf, 8)); 3309 crc32x(crc, crc, tmp3); 3310 ldr(tmp1, Address(buf, 16)); 3311 crc32x(crc, crc, tmp0); 3312 ldr(tmp2, Address(buf, 24)); 3313 crc32x(crc, crc, tmp1); 3314 ldr(tmp3, Address(buf, 32)); 3315 crc32x(crc, crc, tmp2); 3316 ldr(tmp0, Address(buf, 40)); 3317 crc32x(crc, crc, tmp3); 3318 ldr(tmp1, Address(buf, 48)); 3319 crc32x(crc, crc, tmp0); 3320 ldr(tmp2, Address(buf, 56)); 3321 crc32x(crc, crc, tmp1); 3322 ldr(tmp3, Address(pre(buf, 64))); 3323 br(Assembler::GE, CRC_by64_loop); 3324 3325 // post-loop 3326 crc32x(crc, crc, tmp2); 3327 crc32x(crc, crc, tmp3); 3328 3329 sub(len, len, 64); 3330 add(buf, buf, 8); 3331 cmn(len, 128); 3332 br(Assembler::NE, CRC_less64); 3333 BIND(L_exit); 3334 mvnw(crc, crc); 3335 } 3336 3337 /** 3338 * @param crc register containing existing CRC (32-bit) 3339 * @param buf register pointing to input byte buffer (byte*) 3340 * @param len register containing number of bytes 3341 * @param table register that will contain address of CRC table 3342 * @param tmp scratch register 3343 */ 3344 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, 3345 Register table0, Register table1, Register table2, Register table3, 3346 Register tmp, Register tmp2, Register tmp3) { 3347 Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit; 3348 unsigned long offset; 3349 3350 if (UseCRC32) { 3351 kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3); 3352 return; 3353 } 3354 3355 mvnw(crc, crc); 3356 3357 adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset); 3358 if (offset) add(table0, table0, offset); 3359 add(table1, table0, 1*256*sizeof(juint)); 3360 add(table2, table0, 2*256*sizeof(juint)); 3361 add(table3, table0, 3*256*sizeof(juint)); 3362 3363 if (UseNeon) { 3364 cmp(len, (u1)64); 3365 br(Assembler::LT, L_by16); 3366 eor(v16, T16B, v16, v16); 3367 3368 Label L_fold; 3369 3370 add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants 3371 3372 ld1(v0, v1, T2D, post(buf, 32)); 3373 ld1r(v4, T2D, post(tmp, 8)); 3374 ld1r(v5, T2D, post(tmp, 8)); 3375 ld1r(v6, T2D, post(tmp, 8)); 3376 ld1r(v7, T2D, post(tmp, 8)); 3377 mov(v16, T4S, 0, crc); 3378 3379 eor(v0, T16B, v0, v16); 3380 sub(len, len, 64); 3381 3382 BIND(L_fold); 3383 pmull(v22, T8H, v0, v5, T8B); 3384 pmull(v20, T8H, v0, v7, T8B); 3385 pmull(v23, T8H, v0, v4, T8B); 3386 pmull(v21, T8H, v0, v6, T8B); 3387 3388 pmull2(v18, T8H, v0, v5, T16B); 3389 pmull2(v16, T8H, v0, v7, T16B); 3390 pmull2(v19, T8H, v0, v4, T16B); 3391 pmull2(v17, T8H, v0, v6, T16B); 3392 3393 uzp1(v24, T8H, v20, v22); 3394 uzp2(v25, T8H, v20, v22); 3395 eor(v20, T16B, v24, v25); 3396 3397 uzp1(v26, T8H, v16, v18); 3398 uzp2(v27, T8H, v16, v18); 3399 eor(v16, T16B, v26, v27); 3400 3401 ushll2(v22, T4S, v20, T8H, 8); 3402 ushll(v20, T4S, v20, T4H, 8); 3403 3404 ushll2(v18, T4S, v16, T8H, 8); 3405 ushll(v16, T4S, v16, T4H, 8); 3406 3407 eor(v22, T16B, v23, v22); 3408 eor(v18, T16B, v19, v18); 3409 eor(v20, T16B, v21, v20); 3410 eor(v16, T16B, v17, v16); 3411 3412 uzp1(v17, T2D, v16, v20); 3413 uzp2(v21, T2D, v16, v20); 3414 eor(v17, T16B, v17, v21); 3415 3416 ushll2(v20, T2D, 
v17, T4S, 16); 3417 ushll(v16, T2D, v17, T2S, 16); 3418 3419 eor(v20, T16B, v20, v22); 3420 eor(v16, T16B, v16, v18); 3421 3422 uzp1(v17, T2D, v20, v16); 3423 uzp2(v21, T2D, v20, v16); 3424 eor(v28, T16B, v17, v21); 3425 3426 pmull(v22, T8H, v1, v5, T8B); 3427 pmull(v20, T8H, v1, v7, T8B); 3428 pmull(v23, T8H, v1, v4, T8B); 3429 pmull(v21, T8H, v1, v6, T8B); 3430 3431 pmull2(v18, T8H, v1, v5, T16B); 3432 pmull2(v16, T8H, v1, v7, T16B); 3433 pmull2(v19, T8H, v1, v4, T16B); 3434 pmull2(v17, T8H, v1, v6, T16B); 3435 3436 ld1(v0, v1, T2D, post(buf, 32)); 3437 3438 uzp1(v24, T8H, v20, v22); 3439 uzp2(v25, T8H, v20, v22); 3440 eor(v20, T16B, v24, v25); 3441 3442 uzp1(v26, T8H, v16, v18); 3443 uzp2(v27, T8H, v16, v18); 3444 eor(v16, T16B, v26, v27); 3445 3446 ushll2(v22, T4S, v20, T8H, 8); 3447 ushll(v20, T4S, v20, T4H, 8); 3448 3449 ushll2(v18, T4S, v16, T8H, 8); 3450 ushll(v16, T4S, v16, T4H, 8); 3451 3452 eor(v22, T16B, v23, v22); 3453 eor(v18, T16B, v19, v18); 3454 eor(v20, T16B, v21, v20); 3455 eor(v16, T16B, v17, v16); 3456 3457 uzp1(v17, T2D, v16, v20); 3458 uzp2(v21, T2D, v16, v20); 3459 eor(v16, T16B, v17, v21); 3460 3461 ushll2(v20, T2D, v16, T4S, 16); 3462 ushll(v16, T2D, v16, T2S, 16); 3463 3464 eor(v20, T16B, v22, v20); 3465 eor(v16, T16B, v16, v18); 3466 3467 uzp1(v17, T2D, v20, v16); 3468 uzp2(v21, T2D, v20, v16); 3469 eor(v20, T16B, v17, v21); 3470 3471 shl(v16, T2D, v28, 1); 3472 shl(v17, T2D, v20, 1); 3473 3474 eor(v0, T16B, v0, v16); 3475 eor(v1, T16B, v1, v17); 3476 3477 subs(len, len, 32); 3478 br(Assembler::GE, L_fold); 3479 3480 mov(crc, 0); 3481 mov(tmp, v0, T1D, 0); 3482 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3483 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3484 mov(tmp, v0, T1D, 1); 3485 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3486 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3487 mov(tmp, v1, T1D, 0); 3488 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3489 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3490 mov(tmp, v1, T1D, 1); 3491 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3492 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3493 3494 add(len, len, 32); 3495 } 3496 3497 BIND(L_by16); 3498 subs(len, len, 16); 3499 br(Assembler::GE, L_by16_loop); 3500 adds(len, len, 16-4); 3501 br(Assembler::GE, L_by4_loop); 3502 adds(len, len, 4); 3503 br(Assembler::GT, L_by1_loop); 3504 b(L_exit); 3505 3506 BIND(L_by4_loop); 3507 ldrw(tmp, Address(post(buf, 4))); 3508 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3); 3509 subs(len, len, 4); 3510 br(Assembler::GE, L_by4_loop); 3511 adds(len, len, 4); 3512 br(Assembler::LE, L_exit); 3513 BIND(L_by1_loop); 3514 subs(len, len, 1); 3515 ldrb(tmp, Address(post(buf, 1))); 3516 update_byte_crc32(crc, tmp, table0); 3517 br(Assembler::GT, L_by1_loop); 3518 b(L_exit); 3519 3520 align(CodeEntryAlignment); 3521 BIND(L_by16_loop); 3522 subs(len, len, 16); 3523 ldp(tmp, tmp3, Address(post(buf, 16))); 3524 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3525 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3526 update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false); 3527 update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true); 3528 br(Assembler::GE, L_by16_loop); 3529 adds(len, len, 16-4); 3530 br(Assembler::GE, 
L_by4_loop); 3531 adds(len, len, 4); 3532 br(Assembler::GT, L_by1_loop); 3533 BIND(L_exit); 3534 mvnw(crc, crc); 3535 } 3536 3537 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf, 3538 Register len, Register tmp0, Register tmp1, Register tmp2, 3539 Register tmp3) { 3540 Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit; 3541 assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3); 3542 3543 subs(len, len, 128); 3544 br(Assembler::GE, CRC_by64_pre); 3545 BIND(CRC_less64); 3546 adds(len, len, 128-32); 3547 br(Assembler::GE, CRC_by32_loop); 3548 BIND(CRC_less32); 3549 adds(len, len, 32-4); 3550 br(Assembler::GE, CRC_by4_loop); 3551 adds(len, len, 4); 3552 br(Assembler::GT, CRC_by1_loop); 3553 b(L_exit); 3554 3555 BIND(CRC_by32_loop); 3556 ldp(tmp0, tmp1, Address(post(buf, 16))); 3557 subs(len, len, 32); 3558 crc32cx(crc, crc, tmp0); 3559 ldr(tmp2, Address(post(buf, 8))); 3560 crc32cx(crc, crc, tmp1); 3561 ldr(tmp3, Address(post(buf, 8))); 3562 crc32cx(crc, crc, tmp2); 3563 crc32cx(crc, crc, tmp3); 3564 br(Assembler::GE, CRC_by32_loop); 3565 cmn(len, 32); 3566 br(Assembler::NE, CRC_less32); 3567 b(L_exit); 3568 3569 BIND(CRC_by4_loop); 3570 ldrw(tmp0, Address(post(buf, 4))); 3571 subs(len, len, 4); 3572 crc32cw(crc, crc, tmp0); 3573 br(Assembler::GE, CRC_by4_loop); 3574 adds(len, len, 4); 3575 br(Assembler::LE, L_exit); 3576 BIND(CRC_by1_loop); 3577 ldrb(tmp0, Address(post(buf, 1))); 3578 subs(len, len, 1); 3579 crc32cb(crc, crc, tmp0); 3580 br(Assembler::GT, CRC_by1_loop); 3581 b(L_exit); 3582 3583 BIND(CRC_by64_pre); 3584 sub(buf, buf, 8); 3585 ldp(tmp0, tmp1, Address(buf, 8)); 3586 crc32cx(crc, crc, tmp0); 3587 ldr(tmp2, Address(buf, 24)); 3588 crc32cx(crc, crc, tmp1); 3589 ldr(tmp3, Address(buf, 32)); 3590 crc32cx(crc, crc, tmp2); 3591 ldr(tmp0, Address(buf, 40)); 3592 crc32cx(crc, crc, tmp3); 3593 ldr(tmp1, Address(buf, 48)); 3594 crc32cx(crc, crc, tmp0); 3595 ldr(tmp2, Address(buf, 56)); 3596 crc32cx(crc, crc, tmp1); 3597 ldr(tmp3, Address(pre(buf, 64))); 3598 3599 b(CRC_by64_loop); 3600 3601 align(CodeEntryAlignment); 3602 BIND(CRC_by64_loop); 3603 subs(len, len, 64); 3604 crc32cx(crc, crc, tmp2); 3605 ldr(tmp0, Address(buf, 8)); 3606 crc32cx(crc, crc, tmp3); 3607 ldr(tmp1, Address(buf, 16)); 3608 crc32cx(crc, crc, tmp0); 3609 ldr(tmp2, Address(buf, 24)); 3610 crc32cx(crc, crc, tmp1); 3611 ldr(tmp3, Address(buf, 32)); 3612 crc32cx(crc, crc, tmp2); 3613 ldr(tmp0, Address(buf, 40)); 3614 crc32cx(crc, crc, tmp3); 3615 ldr(tmp1, Address(buf, 48)); 3616 crc32cx(crc, crc, tmp0); 3617 ldr(tmp2, Address(buf, 56)); 3618 crc32cx(crc, crc, tmp1); 3619 ldr(tmp3, Address(pre(buf, 64))); 3620 br(Assembler::GE, CRC_by64_loop); 3621 3622 // post-loop 3623 crc32cx(crc, crc, tmp2); 3624 crc32cx(crc, crc, tmp3); 3625 3626 sub(len, len, 64); 3627 add(buf, buf, 8); 3628 cmn(len, 128); 3629 br(Assembler::NE, CRC_less64); 3630 BIND(L_exit); 3631 } 3632 3633 /** 3634 * @param crc register containing existing CRC (32-bit) 3635 * @param buf register pointing to input byte buffer (byte*) 3636 * @param len register containing number of bytes 3637 * @param table register that will contain address of CRC table 3638 * @param tmp scratch register 3639 */ 3640 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len, 3641 Register table0, Register table1, Register table2, Register table3, 3642 Register tmp, Register tmp2, Register tmp3) { 3643 kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, 
                                table2, table3);
}


SkipIfEqual::SkipIfEqual(
    MacroAssembler* masm, const bool* flag_addr, bool value) {
  _masm = masm;
  unsigned long offset;
  _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
  _masm->ldrb(rscratch1, Address(rscratch1, offset));
  _masm->cbzw(rscratch1, _label);
}

SkipIfEqual::~SkipIfEqual() {
  _masm->bind(_label);
}

void MacroAssembler::addptr(const Address &dst, int32_t src) {
  Address adr;
  switch(dst.getMode()) {
  case Address::base_plus_offset:
    // This is the expected mode, although we allow all the other
    // forms below.
    adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
    break;
  default:
    lea(rscratch2, dst);
    adr = Address(rscratch2);
    break;
  }
  ldr(rscratch1, adr);
  add(rscratch1, rscratch1, src);
  str(rscratch1, adr);
}

void MacroAssembler::cmpptr(Register src1, Address src2) {
  unsigned long offset;
  adrp(rscratch1, src2, offset);
  ldr(rscratch1, Address(rscratch1, offset));
  cmp(src1, rscratch1);
}

void MacroAssembler::cmpoop(Register obj1, Register obj2) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->obj_equals(this, obj1, obj2);
}

void MacroAssembler::load_method_holder(Register holder, Register method) {
  ldr(holder, Address(method, Method::const_offset()));                      // ConstMethod*
  ldr(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
  ldr(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass*
}

void MacroAssembler::load_metadata(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
  } else {
    ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
  }
}

void MacroAssembler::load_klass(Register dst, Register src) {
  load_metadata(dst, src);
  if (UseCompressedClassPointers) {
    andr(dst, dst, oopDesc::compressed_klass_mask());
    decode_klass_not_null(dst);
  } else {
    ubfm(dst, dst, 0, 63 - oopDesc::storage_props_nof_bits);
  }
}

// ((OopHandle)result).resolve();
void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
  // OopHandle::resolve is an indirection.
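  // In C terms (a sketch): result = *(oop*)result, performed as a GC-aware
  // IN_NATIVE load so that any required barriers are applied.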
  access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
}

void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
  const int mirror_offset = in_bytes(Klass::java_mirror_offset());
  ldr(dst, Address(method, Method::const_offset()));
  ldr(dst, Address(dst, ConstMethod::constants_offset()));
  ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
  ldr(dst, Address(dst, mirror_offset));
  resolve_oop_handle(dst, tmp);
}

void MacroAssembler::load_storage_props(Register dst, Register src) {
  load_metadata(dst, src);
  if (UseCompressedClassPointers) {
    asrw(dst, dst, oopDesc::narrow_storage_props_shift);
  } else {
    asr(dst, dst, oopDesc::wide_storage_props_shift);
  }
}

void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
  if (UseCompressedClassPointers) {
    ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
    if (CompressedKlassPointers::base() == NULL) {
      cmp(trial_klass, tmp, LSL, CompressedKlassPointers::shift());
      return;
    } else if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
               && CompressedKlassPointers::shift() == 0) {
      // Only the bottom 32 bits matter
      cmpw(trial_klass, tmp);
      return;
    }
    decode_klass_not_null(tmp);
  } else {
    ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
  }
  cmp(trial_klass, tmp);
}

void MacroAssembler::load_prototype_header(Register dst, Register src) {
  load_klass(dst, src);
  ldr(dst, Address(dst, Klass::prototype_header_offset()));
}

void MacroAssembler::store_klass(Register dst, Register src) {
  // FIXME: Should this be a store release? Concurrent GCs assume the
  // klass length is valid if the klass field is not null.
  if (UseCompressedClassPointers) {
    encode_klass_not_null(src);
    strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
  } else {
    str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
  }
}

void MacroAssembler::store_klass_gap(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    // Store to klass gap in destination
    strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
  }
}

// Algorithm must match CompressedOops::encode.
void MacroAssembler::encode_heap_oop(Register d, Register s) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
#endif
  verify_oop(s, "broken oop in encode_heap_oop");
  if (CompressedOops::base() == NULL) {
    if (CompressedOops::shift() != 0) {
      assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
      lsr(d, s, LogMinObjAlignmentInBytes);
    } else {
      mov(d, s);
    }
  } else {
    subs(d, s, rheapbase);
    csel(d, d, zr, Assembler::HS);
    lsr(d, d, LogMinObjAlignmentInBytes);

    /* Old algorithm: is this any worse?
3799 Label nonnull; 3800 cbnz(r, nonnull); 3801 sub(r, r, rheapbase); 3802 bind(nonnull); 3803 lsr(r, r, LogMinObjAlignmentInBytes); 3804 */ 3805 } 3806 } 3807 3808 void MacroAssembler::encode_heap_oop_not_null(Register r) { 3809 #ifdef ASSERT 3810 verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?"); 3811 if (CheckCompressedOops) { 3812 Label ok; 3813 cbnz(r, ok); 3814 stop("null oop passed to encode_heap_oop_not_null"); 3815 bind(ok); 3816 } 3817 #endif 3818 verify_oop(r, "broken oop in encode_heap_oop_not_null"); 3819 if (CompressedOops::base() != NULL) { 3820 sub(r, r, rheapbase); 3821 } 3822 if (CompressedOops::shift() != 0) { 3823 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 3824 lsr(r, r, LogMinObjAlignmentInBytes); 3825 } 3826 } 3827 3828 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) { 3829 #ifdef ASSERT 3830 verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?"); 3831 if (CheckCompressedOops) { 3832 Label ok; 3833 cbnz(src, ok); 3834 stop("null oop passed to encode_heap_oop_not_null2"); 3835 bind(ok); 3836 } 3837 #endif 3838 verify_oop(src, "broken oop in encode_heap_oop_not_null2"); 3839 3840 Register data = src; 3841 if (CompressedOops::base() != NULL) { 3842 sub(dst, src, rheapbase); 3843 data = dst; 3844 } 3845 if (CompressedOops::shift() != 0) { 3846 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 3847 lsr(dst, data, LogMinObjAlignmentInBytes); 3848 data = dst; 3849 } 3850 if (data == src) 3851 mov(dst, src); 3852 } 3853 3854 void MacroAssembler::decode_heap_oop(Register d, Register s) { 3855 #ifdef ASSERT 3856 verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?"); 3857 #endif 3858 if (CompressedOops::base() == NULL) { 3859 if (CompressedOops::shift() != 0 || d != s) { 3860 lsl(d, s, CompressedOops::shift()); 3861 } 3862 } else { 3863 Label done; 3864 if (d != s) 3865 mov(d, s); 3866 cbz(s, done); 3867 add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes); 3868 bind(done); 3869 } 3870 verify_oop(d, "broken oop in decode_heap_oop"); 3871 } 3872 3873 void MacroAssembler::decode_heap_oop_not_null(Register r) { 3874 assert (UseCompressedOops, "should only be used for compressed headers"); 3875 assert (Universe::heap() != NULL, "java heap should be initialized"); 3876 // Cannot assert, unverified entry point counts instructions (see .ad file) 3877 // vtableStubs also counts instructions in pd_code_size_limit. 3878 // Also do not verify_oop as this is called by verify_oop. 3879 if (CompressedOops::shift() != 0) { 3880 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 3881 if (CompressedOops::base() != NULL) { 3882 add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes); 3883 } else { 3884 add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes); 3885 } 3886 } else { 3887 assert (CompressedOops::base() == NULL, "sanity"); 3888 } 3889 } 3890 3891 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) { 3892 assert (UseCompressedOops, "should only be used for compressed headers"); 3893 assert (Universe::heap() != NULL, "java heap should be initialized"); 3894 // Cannot assert, unverified entry point counts instructions (see .ad file) 3895 // vtableStubs also counts instructions in pd_code_size_limit. 3896 // Also do not verify_oop as this is called by verify_oop. 
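  // Decoding is (sketch): oop = CompressedOops::base() + ((uint64_t)narrow << shift);
  // the not-null precondition lets us skip the zero check that
  // decode_heap_oop needs.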
3897 if (CompressedOops::shift() != 0) { 3898 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 3899 if (CompressedOops::base() != NULL) { 3900 add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes); 3901 } else { 3902 add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes); 3903 } 3904 } else { 3905 assert (CompressedOops::base() == NULL, "sanity"); 3906 if (dst != src) { 3907 mov(dst, src); 3908 } 3909 } 3910 } 3911 3912 void MacroAssembler::encode_klass_not_null(Register dst, Register src) { 3913 if (CompressedKlassPointers::base() == NULL) { 3914 if (CompressedKlassPointers::shift() != 0) { 3915 assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong"); 3916 lsr(dst, src, LogKlassAlignmentInBytes); 3917 } else { 3918 if (dst != src) mov(dst, src); 3919 } 3920 return; 3921 } 3922 3923 if (use_XOR_for_compressed_class_base) { 3924 if (CompressedKlassPointers::shift() != 0) { 3925 eor(dst, src, (uint64_t)CompressedKlassPointers::base()); 3926 lsr(dst, dst, LogKlassAlignmentInBytes); 3927 } else { 3928 eor(dst, src, (uint64_t)CompressedKlassPointers::base()); 3929 } 3930 return; 3931 } 3932 3933 if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0 3934 && CompressedKlassPointers::shift() == 0) { 3935 movw(dst, src); 3936 return; 3937 } 3938 3939 #ifdef ASSERT 3940 verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?"); 3941 #endif 3942 3943 Register rbase = dst; 3944 if (dst == src) rbase = rheapbase; 3945 mov(rbase, (uint64_t)CompressedKlassPointers::base()); 3946 sub(dst, src, rbase); 3947 if (CompressedKlassPointers::shift() != 0) { 3948 assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong"); 3949 lsr(dst, dst, LogKlassAlignmentInBytes); 3950 } 3951 if (dst == src) reinit_heapbase(); 3952 } 3953 3954 void MacroAssembler::encode_klass_not_null(Register r) { 3955 encode_klass_not_null(r, r); 3956 } 3957 3958 void MacroAssembler::decode_klass_not_null(Register dst, Register src) { 3959 Register rbase = dst; 3960 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 3961 3962 if (CompressedKlassPointers::base() == NULL) { 3963 if (CompressedKlassPointers::shift() != 0) { 3964 assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong"); 3965 lsl(dst, src, LogKlassAlignmentInBytes); 3966 } else { 3967 if (dst != src) mov(dst, src); 3968 } 3969 return; 3970 } 3971 3972 if (use_XOR_for_compressed_class_base) { 3973 if (CompressedKlassPointers::shift() != 0) { 3974 lsl(dst, src, LogKlassAlignmentInBytes); 3975 eor(dst, dst, (uint64_t)CompressedKlassPointers::base()); 3976 } else { 3977 eor(dst, src, (uint64_t)CompressedKlassPointers::base()); 3978 } 3979 return; 3980 } 3981 3982 if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0 3983 && CompressedKlassPointers::shift() == 0) { 3984 if (dst != src) 3985 movw(dst, src); 3986 movk(dst, (uint64_t)CompressedKlassPointers::base() >> 32, 32); 3987 return; 3988 } 3989 3990 // Cannot assert, unverified entry point counts instructions (see .ad file) 3991 // vtableStubs also counts instructions in pd_code_size_limit. 3992 // Also do not verify_oop as this is called by verify_oop. 
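  // General case: materialize the klass base in a scratch register and add
  // the (possibly shifted) narrow klass to it. rheapbase is borrowed when
  // dst aliases src and is restored by reinit_heapbase() afterwards.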
3993 if (dst == src) rbase = rheapbase; 3994 mov(rbase, (uint64_t)CompressedKlassPointers::base()); 3995 if (CompressedKlassPointers::shift() != 0) { 3996 assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong"); 3997 add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes); 3998 } else { 3999 add(dst, rbase, src); 4000 } 4001 if (dst == src) reinit_heapbase(); 4002 } 4003 4004 void MacroAssembler::decode_klass_not_null(Register r) { 4005 decode_klass_not_null(r, r); 4006 } 4007 4008 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) { 4009 #ifdef ASSERT 4010 { 4011 ThreadInVMfromUnknown tiv; 4012 assert (UseCompressedOops, "should only be used for compressed oops"); 4013 assert (Universe::heap() != NULL, "java heap should be initialized"); 4014 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 4015 assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop"); 4016 } 4017 #endif 4018 int oop_index = oop_recorder()->find_index(obj); 4019 InstructionMark im(this); 4020 RelocationHolder rspec = oop_Relocation::spec(oop_index); 4021 code_section()->relocate(inst_mark(), rspec); 4022 movz(dst, 0xDEAD, 16); 4023 movk(dst, 0xBEEF); 4024 } 4025 4026 void MacroAssembler::set_narrow_klass(Register dst, Klass* k) { 4027 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 4028 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 4029 int index = oop_recorder()->find_index(k); 4030 assert(! Universe::heap()->is_in(k), "should not be an oop"); 4031 4032 InstructionMark im(this); 4033 RelocationHolder rspec = metadata_Relocation::spec(index); 4034 code_section()->relocate(inst_mark(), rspec); 4035 narrowKlass nk = CompressedKlassPointers::encode(k); 4036 movz(dst, (nk >> 16), 16); 4037 movk(dst, nk & 0xffff); 4038 } 4039 4040 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, 4041 Register dst, Address src, 4042 Register tmp1, Register thread_tmp) { 4043 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 4044 decorators = AccessInternal::decorator_fixup(decorators); 4045 bool as_raw = (decorators & AS_RAW) != 0; 4046 if (as_raw) { 4047 bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp); 4048 } else { 4049 bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp); 4050 } 4051 } 4052 4053 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, 4054 Address dst, Register src, 4055 Register tmp1, Register thread_tmp, Register tmp3) { 4056 4057 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 4058 decorators = AccessInternal::decorator_fixup(decorators); 4059 bool as_raw = (decorators & AS_RAW) != 0; 4060 if (as_raw) { 4061 bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp, tmp3); 4062 } else { 4063 bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp, tmp3); 4064 } 4065 } 4066 4067 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) { 4068 // Use stronger ACCESS_WRITE|ACCESS_READ by default. 
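  // Resolving for both read and write access is always correct, merely
  // (potentially) more expensive, so it is a safe default when the caller
  // does not say which is needed.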
  if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
    decorators |= ACCESS_READ | ACCESS_WRITE;
  }
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  return bs->resolve(this, decorators, obj);
}

void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
                                   Register thread_tmp, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
}

void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
                                            Register thread_tmp, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
}

void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
                                    Register thread_tmp, Register tmp3, DecoratorSet decorators) {
  access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp, tmp3);
}

// Used for storing NULLs.
void MacroAssembler::store_heap_oop_null(Address dst) {
  access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
}

Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->allocate_metadata_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return Address((address)obj, rspec);
}

// Move an oop into a register. 'immediate' is true if we want immediate
// instructions, i.e. we are not going to patch this instruction while the
// code is being executed by another thread. In that case we can use move
// immediates rather than the constant pool.
void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
  int oop_index;
  if (obj == NULL) {
    oop_index = oop_recorder()->allocate_oop_index(obj);
  } else {
#ifdef ASSERT
    {
      ThreadInVMfromUnknown tiv;
      assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
    }
#endif
    oop_index = oop_recorder()->find_index(obj);
  }
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  if (! immediate) {
    address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
    ldr_constant(dst, Address(dummy, rspec));
  } else
    mov(dst, Address((address)obj, rspec));
}

// Move a metadata address into a register.
void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  int oop_index;
  if (obj == NULL) {
    oop_index = oop_recorder()->allocate_metadata_index(obj);
  } else {
    oop_index = oop_recorder()->find_index(obj);
  }
  RelocationHolder rspec = metadata_Relocation::spec(oop_index);
  mov(dst, Address((address)obj, rspec));
}

Address MacroAssembler::constant_oop_address(jobject obj) {
#ifdef ASSERT
  {
    ThreadInVMfromUnknown tiv;
    assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
    assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "not an oop");
  }
#endif
  int oop_index = oop_recorder()->find_index(obj);
  return Address((address)obj, oop_Relocation::spec(oop_index));
}

// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
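// The barrier-set implementation is conceptually bump-the-pointer allocation
// from the thread-local allocation buffer; a rough C sketch (not the
// authoritative implementation):
//   HeapWord* obj = thread->tlab().top();
//   if (obj + size > thread->tlab().end()) goto slow_case;
//   thread->tlab().set_top(obj + size);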
void MacroAssembler::tlab_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Register t2,
                                   Label& slow_case) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
}

// Defines obj, preserves var_size_in_bytes
void MacroAssembler::eden_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Label& slow_case) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
}

// Zero words; len is in bytes
// Destroys all registers except addr
// len must be a nonzero multiple of wordSize
void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
  assert_different_registers(addr, len, t1, rscratch1, rscratch2);

#ifdef ASSERT
  { Label L;
    tst(len, BytesPerWord - 1);
    br(Assembler::EQ, L);
    stop("len is not a multiple of BytesPerWord");
    bind(L);
  }
#endif

#ifndef PRODUCT
  block_comment("zero memory");
#endif

  Label loop;
  Label entry;

  // Algorithm:
  //
  //  scratch1 = cnt & 7;
  //  cnt -= scratch1;
  //  p += scratch1;
  //  switch (scratch1) {
  //    do {
  //      cnt -= 8;
  //      p[-8] = 0;
  //    case 7:
  //      p[-7] = 0;
  //    case 6:
  //      p[-6] = 0;
  //      // ...
  //    case 1:
  //      p[-1] = 0;
  //    case 0:
  //      p += 8;
  //    } while (cnt);
  //  }

  const int unroll = 8; // Number of str(zr) instructions we'll unroll

  lsr(len, len, LogBytesPerWord);
  andr(rscratch1, len, unroll - 1);  // rscratch1 = cnt % unroll
  sub(len, len, rscratch1);          // cnt -= rscratch1; cnt is now a multiple of unroll
  // t1 always points to the end of the region we're about to zero
  add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
  br(rscratch2);
  bind(loop);
  sub(len, len, unroll);
  for (int i = -unroll; i < 0; i++)
    Assembler::str(zr, Address(t1, i * wordSize));
  bind(entry);
  add(t1, t1, unroll * wordSize);
  cbnz(len, loop);
}

void MacroAssembler::verify_tlab() {
#ifdef ASSERT
  if (UseTLAB && VerifyOops) {
    Label next, ok;

    stp(rscratch2, rscratch1, Address(pre(sp, -16)));

    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
    cmp(rscratch2, rscratch1);
    br(Assembler::HS, next);
    STOP("assert(top >= start)");
    should_not_reach_here();

    bind(next);
    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
    cmp(rscratch2, rscratch1);
    br(Assembler::HS, ok);
    STOP("assert(top <= end)");
    should_not_reach_here();

    bind(ok);
    ldp(rscratch2, rscratch1, Address(post(sp, 16)));
  }
#endif
}

// Writes to stack successive pages until offset reached to check for
// stack overflow + shadow pages.  This clobbers tmp.
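// Roughly (a sketch of the loop below):
//   for (char* p = (char*)sp; size > 0; size -= page_size)
//     *(p -= page_size) = size;   // the write is the "bang"
// followed by touching the shadow-zone pages below the last banged address.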
void MacroAssembler::bang_stack_size(Register size, Register tmp) {
  assert_different_registers(tmp, size, rscratch1);
  mov(tmp, sp);
  // Bang stack for total size given plus shadow page size.
  // Bang one page at a time because large size can bang beyond yellow and
  // red zones.
  Label loop;
  mov(rscratch1, os::vm_page_size());
  bind(loop);
  lea(tmp, Address(tmp, -os::vm_page_size()));
  subsw(size, size, rscratch1);
  str(size, Address(tmp));
  br(Assembler::GT, loop);

  // Bang down shadow pages too.
  // At this point, (tmp-0) is the last address touched, so don't
  // touch it again.  (It was touched as (tmp-pagesize) but then tmp
  // was post-decremented.)  Skip this address by starting at i=1, and
  // touch a few more pages below.  N.B.  It is important to touch all
  // the way down to and including i=StackShadowPages.
  for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
    // This could be any sized move, but it can double as a debugging
    // crumb, so the bigger the better.
    lea(tmp, Address(tmp, -os::vm_page_size()));
    str(size, Address(tmp));
  }
}


// Move the address of the polling page into dest.
void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ldr(dest, Address(rthread, Thread::polling_page_offset()));
  } else {
    unsigned long off;
    adrp(dest, Address(page, rtype), off);
    assert(off == 0, "polling page must be page aligned");
  }
}

// Move the address of the polling page into r, then read the polling
// page.
address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
  get_polling_page(r, page, rtype);
  return read_polling_page(r, rtype);
}

// Read the polling page.  The address of the polling page must
// already be in r.
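// The read below is "ldrw zr, [r]": the loaded value is discarded, only the
// access matters. If the page has been protected (armed), the faulting PC,
// recorded via the relocation at inst_mark(), identifies this poll site.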
4314 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) { 4315 InstructionMark im(this); 4316 code_section()->relocate(inst_mark(), rtype); 4317 ldrw(zr, Address(r, 0)); 4318 return inst_mark(); 4319 } 4320 4321 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) { 4322 relocInfo::relocType rtype = dest.rspec().reloc()->type(); 4323 unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12; 4324 unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12; 4325 unsigned long dest_page = (unsigned long)dest.target() >> 12; 4326 long offset_low = dest_page - low_page; 4327 long offset_high = dest_page - high_page; 4328 4329 assert(is_valid_AArch64_address(dest.target()), "bad address"); 4330 assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address"); 4331 4332 InstructionMark im(this); 4333 code_section()->relocate(inst_mark(), dest.rspec()); 4334 // 8143067: Ensure that the adrp can reach the dest from anywhere within 4335 // the code cache so that if it is relocated we know it will still reach 4336 if (offset_high >= -(1<<20) && offset_low < (1<<20)) { 4337 _adrp(reg1, dest.target()); 4338 } else { 4339 unsigned long target = (unsigned long)dest.target(); 4340 unsigned long adrp_target 4341 = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL); 4342 4343 _adrp(reg1, (address)adrp_target); 4344 movk(reg1, target >> 32, 32); 4345 } 4346 byte_offset = (unsigned long)dest.target() & 0xfff; 4347 } 4348 4349 void MacroAssembler::load_byte_map_base(Register reg) { 4350 CardTable::CardValue* byte_map_base = 4351 ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base(); 4352 4353 if (is_valid_AArch64_address((address)byte_map_base)) { 4354 // Strictly speaking the byte_map_base isn't an address at all, 4355 // and it might even be negative. 4356 unsigned long offset; 4357 adrp(reg, ExternalAddress((address)byte_map_base), offset); 4358 // We expect offset to be zero with most collectors. 
    if (offset != 0) {
      add(reg, reg, offset);
    }
  } else {
    mov(reg, (uint64_t)byte_map_base);
  }
}

void MacroAssembler::build_frame(int framesize) {
  assert(framesize > 0, "framesize must be > 0");
  if (framesize < ((1 << 9) + 2 * wordSize)) {
    sub(sp, sp, framesize);
    stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
    if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
  } else {
    stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
    if (PreserveFramePointer) mov(rfp, sp);
    if (framesize < ((1 << 12) + 2 * wordSize))
      sub(sp, sp, framesize - 2 * wordSize);
    else {
      mov(rscratch1, framesize - 2 * wordSize);
      sub(sp, sp, rscratch1);
    }
  }
}

void MacroAssembler::remove_frame(int framesize) {
  assert(framesize > 0, "framesize must be > 0");
  if (framesize < ((1 << 9) + 2 * wordSize)) {
    ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
    add(sp, sp, framesize);
  } else {
    if (framesize < ((1 << 12) + 2 * wordSize))
      add(sp, sp, framesize - 2 * wordSize);
    else {
      mov(rscratch1, framesize - 2 * wordSize);
      add(sp, sp, rscratch1);
    }
    ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
  }
}

#ifdef COMPILER2
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

// Search for str1 in str2 and return index or -1
void MacroAssembler::string_indexof(Register str2, Register str1,
                                    Register cnt2, Register cnt1,
                                    Register tmp1, Register tmp2,
                                    Register tmp3, Register tmp4,
                                    Register tmp5, Register tmp6,
                                    int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the first occurrence of the pattern in the source
  // or return -1.

  // For a larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.
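  // icnt1 == -1 means the pattern length is not a compile-time constant,
  // so the choice between Boyer-Moore, the linear-scan stub and the inline
  // linear scan below has to be made at run time from cnt1 and cnt2.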

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1);          // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

  // The Boyer Moore algorithm is based on the description here:-
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules: the 'Bad Character' rule
  // and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:-
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // This particular implementation has a few java-specific optimizations.
  //
  // #define ASIZE 256
  //
  // int bm(unsigned char *x, int m, unsigned char *y, int n) {
  //   int i, j;
  //   unsigned c;
  //   unsigned char bc[ASIZE];
  //
  //   /* Preprocessing */
  //   for (i = 0; i < ASIZE; ++i)
  //     bc[i] = m;
  //   for (i = 0; i < m - 1; ) {
  //     c = x[i];
  //     ++i;
  //     // c < 256 for Latin1 string, so, no need for branch
  //     #ifdef PATTERN_STRING_IS_LATIN1
  //     bc[c] = m - i;
  //     #else
  //     if (c < ASIZE) bc[c] = m - i;
  //     #endif
  //   }
  //
  //   /* Searching */
  //   j = 0;
  //   while (j <= n - m) {
  //     // Always inspect the last character of the current window first.
  //     c = y[j + m - 1];
  //     if (x[m-1] == c)
  //       for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
  //     if (i < 0) return j;
  //     // c < 256 for Latin1 string, so, no need for branch
  //     #ifdef SOURCE_STRING_IS_LATIN1
  //     // LL case: (c< 256) always true. Remove branch
  //     j += bc[y[j+m-1]];
  //     #endif
  //     #ifndef PATTERN_STRING_IS_UTF
  //     // UU case: need if (c<ASIZE) check. Skip 1 character if not.
  //     if (c < ASIZE)
  //       j += bc[y[j+m-1]];
  //     else
  //       j += 1
  //     #endif
  //     #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
  //     // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
  //     if (c < ASIZE)
  //       j += bc[y[j+m-1]];
  //     else
  //       j += m
  //     #endif
  //   }
  // }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >= 8, so we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed (8 LL or 4 UU) and half a register
    // for the UL case. We'll re-read the last character in the inner pre-loop
    // code to have a single outer pre-loop load.
    const int firstStep = isL ?
7 : 3; 4539 4540 const int ASIZE = 256; 4541 const int STORED_BYTES = 32; // amount of bytes stored per instruction 4542 sub(sp, sp, ASIZE); 4543 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations 4544 mov(ch1, sp); 4545 BIND(BM_INIT_LOOP); 4546 stpq(v0, v0, Address(post(ch1, STORED_BYTES))); 4547 subs(tmp5, tmp5, 1); 4548 br(GT, BM_INIT_LOOP); 4549 4550 sub(cnt1tmp, cnt1, 1); 4551 mov(tmp5, str2); 4552 add(str2end, str2, result_tmp, LSL, str2_chr_shift); 4553 sub(ch2, cnt1, 1); 4554 mov(tmp3, str1); 4555 BIND(BCLOOP); 4556 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size))); 4557 if (!str1_isL) { 4558 subs(zr, ch1, ASIZE); 4559 br(HS, BCSKIP); 4560 } 4561 strb(ch2, Address(sp, ch1)); 4562 BIND(BCSKIP); 4563 subs(ch2, ch2, 1); 4564 br(GT, BCLOOP); 4565 4566 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1 4567 if (str1_isL == str2_isL) { 4568 // load last 8 bytes (8LL/4UU symbols) 4569 ldr(tmp6, Address(tmp6, -wordSize)); 4570 } else { 4571 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols) 4572 // convert Latin1 to UTF. We'll have to wait until load completed, but 4573 // it's still faster than per-character loads+checks 4574 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1] 4575 ubfx(ch1, tmp6, 8, 8); // str1[N-2] 4576 ubfx(ch2, tmp6, 16, 8); // str1[N-3] 4577 andr(tmp6, tmp6, 0xFF); // str1[N-4] 4578 orr(ch2, ch1, ch2, LSL, 16); 4579 orr(tmp6, tmp6, tmp3, LSL, 48); 4580 orr(tmp6, tmp6, ch2, LSL, 16); 4581 } 4582 BIND(BMLOOPSTR2); 4583 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 4584 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8 4585 if (str1_isL == str2_isL) { 4586 // re-init tmp3. It's for free because it's executed in parallel with 4587 // load above. Alternative is to initialize it before loop, but it'll 4588 // affect performance on in-order systems with 2 or more ld/st pipelines 4589 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size)); 4590 } 4591 if (!isL) { // UU/UL case 4592 lsl(ch2, cnt1tmp, 1); // offset in bytes 4593 } 4594 cmp(tmp3, skipch); 4595 br(NE, BMSKIP); 4596 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2)); 4597 mov(ch1, tmp6); 4598 if (isL) { 4599 b(BMLOOPSTR1_AFTER_LOAD); 4600 } else { 4601 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. 
cnt1 >= 8 4602 b(BMLOOPSTR1_CMP); 4603 } 4604 BIND(BMLOOPSTR1); 4605 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 4606 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 4607 BIND(BMLOOPSTR1_AFTER_LOAD); 4608 subs(cnt1tmp, cnt1tmp, 1); 4609 br(LT, BMLOOPSTR1_LASTCMP); 4610 BIND(BMLOOPSTR1_CMP); 4611 cmp(ch1, ch2); 4612 br(EQ, BMLOOPSTR1); 4613 BIND(BMSKIP); 4614 if (!isL) { 4615 // if we've met UTF symbol while searching Latin1 pattern, then we can 4616 // skip cnt1 symbols 4617 if (str1_isL != str2_isL) { 4618 mov(result_tmp, cnt1); 4619 } else { 4620 mov(result_tmp, 1); 4621 } 4622 subs(zr, skipch, ASIZE); 4623 br(HS, BMADV); 4624 } 4625 ldrb(result_tmp, Address(sp, skipch)); // load skip distance 4626 BIND(BMADV); 4627 sub(cnt1tmp, cnt1, 1); 4628 add(str2, str2, result_tmp, LSL, str2_chr_shift); 4629 cmp(str2, str2end); 4630 br(LE, BMLOOPSTR2); 4631 add(sp, sp, ASIZE); 4632 b(NOMATCH); 4633 BIND(BMLOOPSTR1_LASTCMP); 4634 cmp(ch1, ch2); 4635 br(NE, BMSKIP); 4636 BIND(BMMATCH); 4637 sub(result, str2, tmp5); 4638 if (!str2_isL) lsr(result, result, 1); 4639 add(sp, sp, ASIZE); 4640 b(DONE); 4641 4642 BIND(LINEARSTUB); 4643 cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm 4644 br(LT, LINEAR_MEDIUM); 4645 mov(result, zr); 4646 RuntimeAddress stub = NULL; 4647 if (isL) { 4648 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll()); 4649 assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated"); 4650 } else if (str1_isL) { 4651 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul()); 4652 assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated"); 4653 } else { 4654 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu()); 4655 assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated"); 4656 } 4657 trampoline_call(stub); 4658 b(DONE); 4659 } 4660 4661 BIND(LINEARSEARCH); 4662 { 4663 Label DO1, DO2, DO3; 4664 4665 Register str2tmp = tmp2; 4666 Register first = tmp3; 4667 4668 if (icnt1 == -1) 4669 { 4670 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT; 4671 4672 cmp(cnt1, u1(str1_isL == str2_isL ? 
4 : 2)); 4673 br(LT, DOSHORT); 4674 BIND(LINEAR_MEDIUM); 4675 (this->*str1_load_1chr)(first, Address(str1)); 4676 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift))); 4677 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift); 4678 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4679 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4680 4681 BIND(FIRST_LOOP); 4682 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 4683 cmp(first, ch2); 4684 br(EQ, STR1_LOOP); 4685 BIND(STR2_NEXT); 4686 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4687 br(LE, FIRST_LOOP); 4688 b(NOMATCH); 4689 4690 BIND(STR1_LOOP); 4691 adds(cnt1tmp, cnt1_neg, str1_chr_size); 4692 add(cnt2tmp, cnt2_neg, str2_chr_size); 4693 br(GE, MATCH); 4694 4695 BIND(STR1_NEXT); 4696 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp)); 4697 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 4698 cmp(ch1, ch2); 4699 br(NE, STR2_NEXT); 4700 adds(cnt1tmp, cnt1tmp, str1_chr_size); 4701 add(cnt2tmp, cnt2tmp, str2_chr_size); 4702 br(LT, STR1_NEXT); 4703 b(MATCH); 4704 4705 BIND(DOSHORT); 4706 if (str1_isL == str2_isL) { 4707 cmp(cnt1, (u1)2); 4708 br(LT, DO1); 4709 br(GT, DO3); 4710 } 4711 } 4712 4713 if (icnt1 == 4) { 4714 Label CH1_LOOP; 4715 4716 (this->*load_4chr)(ch1, str1); 4717 sub(result_tmp, cnt2, 4); 4718 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4719 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4720 4721 BIND(CH1_LOOP); 4722 (this->*load_4chr)(ch2, Address(str2, cnt2_neg)); 4723 cmp(ch1, ch2); 4724 br(EQ, MATCH); 4725 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4726 br(LE, CH1_LOOP); 4727 b(NOMATCH); 4728 } 4729 4730 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) { 4731 Label CH1_LOOP; 4732 4733 BIND(DO2); 4734 (this->*load_2chr)(ch1, str1); 4735 if (icnt1 == 2) { 4736 sub(result_tmp, cnt2, 2); 4737 } 4738 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4739 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4740 BIND(CH1_LOOP); 4741 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 4742 cmp(ch1, ch2); 4743 br(EQ, MATCH); 4744 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4745 br(LE, CH1_LOOP); 4746 b(NOMATCH); 4747 } 4748 4749 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) { 4750 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP; 4751 4752 BIND(DO3); 4753 (this->*load_2chr)(first, str1); 4754 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size)); 4755 if (icnt1 == 3) { 4756 sub(result_tmp, cnt2, 3); 4757 } 4758 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4759 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4760 BIND(FIRST_LOOP); 4761 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 4762 cmpw(first, ch2); 4763 br(EQ, STR1_LOOP); 4764 BIND(STR2_NEXT); 4765 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4766 br(LE, FIRST_LOOP); 4767 b(NOMATCH); 4768 4769 BIND(STR1_LOOP); 4770 add(cnt2tmp, cnt2_neg, 2*str2_chr_size); 4771 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 4772 cmp(ch1, ch2); 4773 br(NE, STR2_NEXT); 4774 b(MATCH); 4775 } 4776 4777 if (icnt1 == -1 || icnt1 == 1) { 4778 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP; 4779 4780 BIND(DO1); 4781 (this->*str1_load_1chr)(ch1, str1); 4782 cmp(cnt2, (u1)8); 4783 br(LT, DO1_SHORT); 4784 4785 sub(result_tmp, cnt2, 8/str2_chr_size); 4786 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4787 mov(tmp3, str2_isL ? 
0x0101010101010101 : 0x0001000100010001); 4788 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4789 4790 if (str2_isL) { 4791 orr(ch1, ch1, ch1, LSL, 8); 4792 } 4793 orr(ch1, ch1, ch1, LSL, 16); 4794 orr(ch1, ch1, ch1, LSL, 32); 4795 BIND(CH1_LOOP); 4796 ldr(ch2, Address(str2, cnt2_neg)); 4797 eor(ch2, ch1, ch2); 4798 sub(tmp1, ch2, tmp3); 4799 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4800 bics(tmp1, tmp1, tmp2); 4801 br(NE, HAS_ZERO); 4802 adds(cnt2_neg, cnt2_neg, 8); 4803 br(LT, CH1_LOOP); 4804 4805 cmp(cnt2_neg, (u1)8); 4806 mov(cnt2_neg, 0); 4807 br(LT, CH1_LOOP); 4808 b(NOMATCH); 4809 4810 BIND(HAS_ZERO); 4811 rev(tmp1, tmp1); 4812 clz(tmp1, tmp1); 4813 add(cnt2_neg, cnt2_neg, tmp1, LSR, 3); 4814 b(MATCH); 4815 4816 BIND(DO1_SHORT); 4817 mov(result_tmp, cnt2); 4818 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 4819 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 4820 BIND(DO1_LOOP); 4821 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 4822 cmpw(ch1, ch2); 4823 br(EQ, MATCH); 4824 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4825 br(LT, DO1_LOOP); 4826 } 4827 } 4828 BIND(NOMATCH); 4829 mov(result, -1); 4830 b(DONE); 4831 BIND(MATCH); 4832 add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift); 4833 BIND(DONE); 4834 } 4835 4836 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 4837 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn); 4838 4839 void MacroAssembler::string_indexof_char(Register str1, Register cnt1, 4840 Register ch, Register result, 4841 Register tmp1, Register tmp2, Register tmp3) 4842 { 4843 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 4844 Register cnt1_neg = cnt1; 4845 Register ch1 = rscratch1; 4846 Register result_tmp = rscratch2; 4847 4848 cmp(cnt1, (u1)4); 4849 br(LT, DO1_SHORT); 4850 4851 orr(ch, ch, ch, LSL, 16); 4852 orr(ch, ch, ch, LSL, 32); 4853 4854 sub(cnt1, cnt1, 4); 4855 mov(result_tmp, cnt1); 4856 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 4857 sub(cnt1_neg, zr, cnt1, LSL, 1); 4858 4859 mov(tmp3, 0x0001000100010001); 4860 4861 BIND(CH1_LOOP); 4862 ldr(ch1, Address(str1, cnt1_neg)); 4863 eor(ch1, ch, ch1); 4864 sub(tmp1, ch1, tmp3); 4865 orr(tmp2, ch1, 0x7fff7fff7fff7fff); 4866 bics(tmp1, tmp1, tmp2); 4867 br(NE, HAS_ZERO); 4868 adds(cnt1_neg, cnt1_neg, 8); 4869 br(LT, CH1_LOOP); 4870 4871 cmp(cnt1_neg, (u1)8); 4872 mov(cnt1_neg, 0); 4873 br(LT, CH1_LOOP); 4874 b(NOMATCH); 4875 4876 BIND(HAS_ZERO); 4877 rev(tmp1, tmp1); 4878 clz(tmp1, tmp1); 4879 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 4880 b(MATCH); 4881 4882 BIND(DO1_SHORT); 4883 mov(result_tmp, cnt1); 4884 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 4885 sub(cnt1_neg, zr, cnt1, LSL, 1); 4886 BIND(DO1_LOOP); 4887 ldrh(ch1, Address(str1, cnt1_neg)); 4888 cmpw(ch, ch1); 4889 br(EQ, MATCH); 4890 adds(cnt1_neg, cnt1_neg, 2); 4891 br(LT, DO1_LOOP); 4892 BIND(NOMATCH); 4893 mov(result, -1); 4894 b(DONE); 4895 BIND(MATCH); 4896 add(result, result_tmp, cnt1_neg, ASR, 1); 4897 BIND(DONE); 4898 } 4899 4900 // Compare strings. 
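// ae encodes the argument encodings (StrIntrinsicNode::LL, LU, UL or UU):
// 'L' means Latin-1 (one byte per character), 'U' means UTF-16 (two bytes);
// the first letter describes str1, the second str2.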
void MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  const u1 STUB_THRESHOLD = 64 + 8;
  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, STUB_THRESHOLD);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, STUB_THRESHOLD);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldrs(vtmp, Address(str2));
      cmp(cnt2, STUB_THRESHOLD);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
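    // cnt2 is now a negative byte index relative to the end of str2; each
    // loop iteration steps it toward zero, and once it reaches or crosses
    // zero the remaining (last) longword is handled at TAIL.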
adds(cnt2, cnt2, isUL ? 4 : 8); 4994 br(GE, TAIL); 4995 eor(rscratch2, tmp1, tmp2); 4996 cbnz(rscratch2, DIFFERENCE); 4997 // main loop 4998 bind(NEXT_WORD); 4999 if (str1_isL == str2_isL) { 5000 ldr(tmp1, Address(str1, cnt2)); 5001 ldr(tmp2, Address(str2, cnt2)); 5002 adds(cnt2, cnt2, 8); 5003 } else if (isLU) { 5004 ldrs(vtmp, Address(str1, cnt1)); 5005 ldr(tmp2, Address(str2, cnt2)); 5006 add(cnt1, cnt1, 4); 5007 zip1(vtmp, T8B, vtmp, vtmpZ); 5008 fmovd(tmp1, vtmp); 5009 adds(cnt2, cnt2, 8); 5010 } else { // UL 5011 ldrs(vtmp, Address(str2, cnt2)); 5012 ldr(tmp1, Address(str1, cnt1)); 5013 zip1(vtmp, T8B, vtmp, vtmpZ); 5014 add(cnt1, cnt1, 8); 5015 fmovd(tmp2, vtmp); 5016 adds(cnt2, cnt2, 4); 5017 } 5018 br(GE, TAIL); 5019 5020 eor(rscratch2, tmp1, tmp2); 5021 cbz(rscratch2, NEXT_WORD); 5022 b(DIFFERENCE); 5023 bind(TAIL); 5024 eor(rscratch2, tmp1, tmp2); 5025 cbnz(rscratch2, DIFFERENCE); 5026 // Last longword. In the case where length == 4 we compare the 5027 // same longword twice, but that's still faster than another 5028 // conditional branch. 5029 if (str1_isL == str2_isL) { 5030 ldr(tmp1, Address(str1)); 5031 ldr(tmp2, Address(str2)); 5032 } else if (isLU) { 5033 ldrs(vtmp, Address(str1)); 5034 ldr(tmp2, Address(str2)); 5035 zip1(vtmp, T8B, vtmp, vtmpZ); 5036 fmovd(tmp1, vtmp); 5037 } else { // UL 5038 ldrs(vtmp, Address(str2)); 5039 ldr(tmp1, Address(str1)); 5040 zip1(vtmp, T8B, vtmp, vtmpZ); 5041 fmovd(tmp2, vtmp); 5042 } 5043 bind(TAIL_CHECK); 5044 eor(rscratch2, tmp1, tmp2); 5045 cbz(rscratch2, DONE); 5046 5047 // Find the first different characters in the longwords and 5048 // compute their difference. 5049 bind(DIFFERENCE); 5050 rev(rscratch2, rscratch2); 5051 clz(rscratch2, rscratch2); 5052 andr(rscratch2, rscratch2, isLL ? -8 : -16); 5053 lsrv(tmp1, tmp1, rscratch2); 5054 (this->*ext_chr)(tmp1, tmp1); 5055 lsrv(tmp2, tmp2, rscratch2); 5056 (this->*ext_chr)(tmp2, tmp2); 5057 subw(result, tmp1, tmp2); 5058 b(DONE); 5059 } 5060 5061 bind(STUB); 5062 RuntimeAddress stub = NULL; 5063 switch(ae) { 5064 case StrIntrinsicNode::LL: 5065 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL()); 5066 break; 5067 case StrIntrinsicNode::UU: 5068 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU()); 5069 break; 5070 case StrIntrinsicNode::LU: 5071 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU()); 5072 break; 5073 case StrIntrinsicNode::UL: 5074 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL()); 5075 break; 5076 default: 5077 ShouldNotReachHere(); 5078 } 5079 assert(stub.target() != NULL, "compare_long_string stub has not been generated"); 5080 trampoline_call(stub); 5081 b(DONE); 5082 5083 bind(SHORT_STRING); 5084 // Is the minimum length zero? 
  cbz(cnt2, DONE);
  // Arrange the code so that we do most branches while loading, and load
  // the next characters while comparing the previous ones.
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}
#endif // COMPILER2

// This method checks whether the provided byte array contains a byte with
// the highest bit set.
void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
  // The simple and most common case (a small, aligned array that is not at
  // the end of a memory page) is handled here; all other cases go to the
  // stubs.
  Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
  const uint64_t UPPER_BIT_MASK=0x8080808080808080;
  assert_different_registers(ary1, len, result);

  cmpw(len, 0);
  br(LE, SET_RESULT);
  cmpw(len, 4 * wordSize);
  br(GE, STUB_LONG); // size >= 32: go to the long stub

  int shift = 64 - exact_log2(os::vm_page_size());
  lsl(rscratch1, ary1, shift);
  mov(rscratch2, (size_t)(4 * wordSize) << shift);
  adds(rscratch2, rscratch1, rscratch2); // At end of page?
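  // A reading of the check above (informal, assuming vm_page_size is a
  // power of two): after the lsl, rscratch1 holds ary1's offset within
  // its page in the topmost bits, so the adds sets the carry flag exactly
  // when a 32-byte read at ary1 would reach the end of the page.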
  br(CS, STUB);      // at end of page: go to the stub
  subs(len, len, wordSize);
  br(LT, END);

  BIND(LOOP);
  ldr(rscratch1, Address(post(ary1, wordSize)));
  tst(rscratch1, UPPER_BIT_MASK);
  br(NE, SET_RESULT);
  subs(len, len, wordSize);
  br(GE, LOOP);
  cmpw(len, -wordSize);
  br(EQ, SET_RESULT);

  BIND(END);
  ldr(result, Address(ary1));
  sub(len, zr, len, LSL, 3); // LSL 3 converts bytes to bits
  lslv(result, result, len);
  tst(result, UPPER_BIT_MASK);
  b(SET_RESULT);

  BIND(STUB);
  RuntimeAddress has_neg = RuntimeAddress(StubRoutines::aarch64::has_negatives());
  assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
  trampoline_call(has_neg);
  b(DONE);

  BIND(STUB_LONG);
  RuntimeAddress has_neg_long = RuntimeAddress(
      StubRoutines::aarch64::has_negatives_long());
  assert(has_neg_long.target() != NULL, "has_negatives_long stub has not been generated");
  trampoline_call(has_neg_long);
  b(DONE);

  BIND(SET_RESULT);
  cset(result, NE); // set true or false

  BIND(DONE);
}

void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
                                   Register tmp4, Register tmp5, Register result,
                                   Register cnt1, int elem_size) {
  Label DONE, SAME;
  Register tmp1 = rscratch1;
  Register tmp2 = rscratch2;
  Register cnt2 = tmp2;  // cnt2 only used in array length compare
  int elem_per_word = wordSize/elem_size;
  int log_elem_size = exact_log2(elem_size);
  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset
      = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
  int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);

  assert(elem_size == 1 || elem_size == 2, "must be char or byte");
  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);

#ifndef PRODUCT
  {
    const char kind = (elem_size == 2) ? 'U' : 'L';
    char comment[64];
    snprintf(comment, sizeof comment, "array_equals%c{", kind);
    BLOCK_COMMENT(comment);
  }
#endif

  // if (a1 == a2)
  //     return true;
  cmpoop(a1, a2); // May have read barriers for a1 and a2.
  br(EQ, SAME);

  if (UseSimpleArrayEquals) {
    Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
    // if (a1 == null || a2 == null)
    //     return false;
    // (a1 & a2) == 0 means either some pointer is null or, very rarely
    // (probably never), two non-null pointers that happen to AND to zero,
    // so testing it saves one branch in the common case.
    tst(a1, a2);
    mov(result, false);
    br(EQ, A_MIGHT_BE_NULL);
    // if (a1.length != a2.length)
    //     return false;
    bind(A_IS_NOT_NULL);
    ldrw(cnt1, Address(a1, length_offset));
    ldrw(cnt2, Address(a2, length_offset));
    eorw(tmp5, cnt1, cnt2);
    cbnzw(tmp5, DONE);
    lea(a1, Address(a1, base_offset));
    lea(a2, Address(a2, base_offset));
    // Check for short strings, i.e. smaller than wordSize.
    subs(cnt1, cnt1, elem_per_word);
    br(Assembler::LT, SHORT);
    // Main 8 byte comparison loop.
    bind(NEXT_WORD); {
      ldr(tmp1, Address(post(a1, wordSize)));
      ldr(tmp2, Address(post(a2, wordSize)));
      subs(cnt1, cnt1, elem_per_word);
      eor(tmp5, tmp1, tmp2);
      cbnz(tmp5, DONE);
    } br(GT, NEXT_WORD);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
    // length == 4.
    if (log_elem_size > 0)
      lsl(cnt1, cnt1, log_elem_size);
    ldr(tmp3, Address(a1, cnt1));
    ldr(tmp4, Address(a2, cnt1));
    eor(tmp5, tmp3, tmp4);
    cbnz(tmp5, DONE);
    b(SAME);
    bind(A_MIGHT_BE_NULL);
    // if both a1 and a2 are non-null, proceed with the loads
    cbz(a1, DONE);
    cbz(a2, DONE);
    b(A_IS_NOT_NULL);
    bind(SHORT);

    tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
    {
      ldrw(tmp1, Address(post(a1, 4)));
      ldrw(tmp2, Address(post(a2, 4)));
      eorw(tmp5, tmp1, tmp2);
      cbnzw(tmp5, DONE);
    }
    bind(TAIL03);
    tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
    {
      ldrh(tmp3, Address(post(a1, 2)));
      ldrh(tmp4, Address(post(a2, 2)));
      eorw(tmp5, tmp3, tmp4);
      cbnzw(tmp5, DONE);
    }
    bind(TAIL01);
    if (elem_size == 1) { // Only needed when comparing byte arrays.
      tbz(cnt1, 0, SAME); // 0-1 bytes left.
      {
        ldrb(tmp1, a1);
        ldrb(tmp2, a2);
        eorw(tmp5, tmp1, tmp2);
        cbnzw(tmp5, DONE);
      }
    }
  } else {
    Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
        CSET_EQ, LAST_CHECK;
    mov(result, false);
    cbz(a1, DONE);
    ldrw(cnt1, Address(a1, length_offset));
    cbz(a2, DONE);
    ldrw(cnt2, Address(a2, length_offset));
    // On most CPUs a2 is still "locked" (surprisingly) by the preceding
    // ldrw, so it is faster to take another branch here before comparing
    // a1 and a2.
    cmp(cnt1, (u1)elem_per_word);
    br(LE, SHORT); // short or same
    ldr(tmp3, Address(pre(a1, base_offset)));
    subs(zr, cnt1, stubBytesThreshold);
    br(GE, STUB);
    ldr(tmp4, Address(pre(a2, base_offset)));
    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
    cmp(cnt2, cnt1);
    br(NE, DONE);

    // Main 16 byte comparison loop with 2 exits
    bind(NEXT_DWORD); {
      ldr(tmp1, Address(pre(a1, wordSize)));
      ldr(tmp2, Address(pre(a2, wordSize)));
      subs(cnt1, cnt1, 2 * elem_per_word);
      br(LE, TAIL);
      eor(tmp4, tmp3, tmp4);
      cbnz(tmp4, DONE);
      ldr(tmp3, Address(pre(a1, wordSize)));
      ldr(tmp4, Address(pre(a2, wordSize)));
      cmp(cnt1, (u1)elem_per_word);
      br(LE, TAIL2);
      cmp(tmp1, tmp2);
    } br(EQ, NEXT_DWORD);
    b(DONE);

    bind(TAIL);
    eor(tmp4, tmp3, tmp4);
    eor(tmp2, tmp1, tmp2);
    lslv(tmp2, tmp2, tmp5);
    orr(tmp5, tmp4, tmp2);
    cmp(tmp5, zr);
    b(CSET_EQ);

    bind(TAIL2);
    eor(tmp2, tmp1, tmp2);
    cbnz(tmp2, DONE);
    b(LAST_CHECK);

    bind(STUB);
    ldr(tmp4, Address(pre(a2, base_offset)));
    cmp(cnt2, cnt1);
    br(NE, DONE);
    if (elem_size == 2) { // convert to byte counter
      lsl(cnt1, cnt1, 1);
    }
    eor(tmp5, tmp3, tmp4);
    cbnz(tmp5, DONE);
    RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
    assert(stub.target() != NULL, "array_equals_long stub has not been generated");
    trampoline_call(stub);
    b(DONE);

    bind(EARLY_OUT);
    // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2):
    // if a2 == null we must return false (0), otherwise true, so we can
    // simply return a2 itself.
    mov(result, a2);
    b(DONE);
    bind(SHORT);
    cmp(cnt2, cnt1);
    br(NE, DONE);
    cbz(cnt1, SAME);
    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
    ldr(tmp3, Address(a1, base_offset));
    ldr(tmp4, Address(a2, base_offset));
    bind(LAST_CHECK);
    eor(tmp4, tmp3, tmp4);
    lslv(tmp5, tmp4, tmp5);
    cmp(tmp5, zr);
    bind(CSET_EQ);
    cset(result, EQ);
    b(DONE);
  }

  bind(SAME);
  mov(result, true);
  // That's it.
  bind(DONE);

  BLOCK_COMMENT("} array_equals");
}

// Compare Strings

// For Strings we're passed the address of the first characters in a1
// and a2 and the length in cnt1.
// elem_size is the element size in bytes: either 1 or 2.
// There are two implementations.  For arrays >= 8 bytes, all
// comparisons (including the final one, which may overlap) are
// performed 8 bytes at a time.  For strings < 8 bytes, we compare a
// halfword, then a short, and then a byte.

void MacroAssembler::string_equals(Register a1, Register a2,
                                   Register result, Register cnt1, int elem_size)
{
  Label SAME, DONE, SHORT, NEXT_WORD;
  Register tmp1 = rscratch1;
  Register tmp2 = rscratch2;
  Register cnt2 = tmp2;  // cnt2 only used in array length compare

  assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);

#ifndef PRODUCT
  {
    const char kind = (elem_size == 2) ? 'U' : 'L';
    char comment[64];
    snprintf(comment, sizeof comment, "{string_equals%c", kind);
    BLOCK_COMMENT(comment);
  }
#endif

  mov(result, false);

  // Check for short strings, i.e. smaller than wordSize.
  subs(cnt1, cnt1, wordSize);
  br(Assembler::LT, SHORT);
  // Main 8 byte comparison loop.
  bind(NEXT_WORD); {
    ldr(tmp1, Address(post(a1, wordSize)));
    ldr(tmp2, Address(post(a2, wordSize)));
    subs(cnt1, cnt1, wordSize);
    eor(tmp1, tmp1, tmp2);
    cbnz(tmp1, DONE);
  } br(GT, NEXT_WORD);
  // Last longword.  In the case where length == 4 we compare the
  // same longword twice, but that's still faster than another
  // conditional branch.
  // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
  // length == 4.
  ldr(tmp1, Address(a1, cnt1));
  ldr(tmp2, Address(a2, cnt1));
  eor(tmp2, tmp1, tmp2);
  cbnz(tmp2, DONE);
  b(SAME);

  bind(SHORT);
  Label TAIL03, TAIL01;

  tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
  {
    ldrw(tmp1, Address(post(a1, 4)));
    ldrw(tmp2, Address(post(a2, 4)));
    eorw(tmp1, tmp1, tmp2);
    cbnzw(tmp1, DONE);
  }
  bind(TAIL03);
  tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
  {
    ldrh(tmp1, Address(post(a1, 2)));
    ldrh(tmp2, Address(post(a2, 2)));
    eorw(tmp1, tmp1, tmp2);
    cbnzw(tmp1, DONE);
  }
  bind(TAIL01);
  if (elem_size == 1) { // Only needed when comparing 1-byte elements
    tbz(cnt1, 0, SAME); // 0-1 bytes left.
    {
      ldrb(tmp1, a1);
      ldrb(tmp2, a2);
      eorw(tmp1, tmp1, tmp2);
      cbnzw(tmp1, DONE);
    }
  }
  // Arrays are equal.
  bind(SAME);
  mov(result, true);

  // That's it.
  bind(DONE);
  BLOCK_COMMENT("} string_equals");
}


// The size of the blocks erased by the zero_blocks stub.  We must
// handle anything smaller than this ourselves in zero_words().
const int MacroAssembler::zero_words_block_size = 8;

// zero_words() is used by C2 ClearArray patterns.
// It is as small as possible, handling small word counts locally and
// delegating anything larger to the zero_blocks stub.  It is expanded
// many times in compiled code, so it is important to keep it short.

// ptr:   Address of a buffer to be zeroed.
// cnt:   Count in HeapWords.
//
// ptr, cnt, rscratch1, and rscratch2 are clobbered.
void MacroAssembler::zero_words(Register ptr, Register cnt)
{
  assert(is_power_of_2(zero_words_block_size), "adjust this");
  assert(ptr == r10 && cnt == r11, "mismatch in register usage");

  BLOCK_COMMENT("zero_words {");
  cmp(cnt, (u1)zero_words_block_size);
  Label around;
  br(LO, around);
  {
    RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks());
    assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
    if (StubRoutines::aarch64::complete()) {
      trampoline_call(zero_blocks);
    } else {
      bl(zero_blocks);
    }
  }
  bind(around);
  for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
    Label l;
    tbz(cnt, exact_log2(i), l);
    for (int j = 0; j < i; j += 2) {
      stp(zr, zr, post(ptr, 16));
    }
    bind(l);
  }
  {
    Label l;
    tbz(cnt, 0, l);
    str(zr, Address(ptr));
    bind(l);
  }
  BLOCK_COMMENT("} zero_words");
}

// base:  Address of a buffer to be zeroed, 8-byte aligned.
// cnt:   Immediate count in HeapWords.
#define SmallArraySize (18 * BytesPerLong)
void MacroAssembler::zero_words(Register base, u_int64_t cnt)
{
  BLOCK_COMMENT("zero_words {");
  int i = cnt & 1;  // store any odd word to start
  if (i) str(zr, Address(base));

  if (cnt <= SmallArraySize / BytesPerLong) {
    for (; i < (int)cnt; i += 2)
      stp(zr, zr, Address(base, i * wordSize));
  } else {
    const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
    int remainder = cnt % (2 * unroll);
    for (; i < remainder; i += 2)
      stp(zr, zr, Address(base, i * wordSize));

    Label loop;
    Register cnt_reg = rscratch1;
    Register loop_base = rscratch2;
    cnt = cnt - remainder;
    mov(cnt_reg, cnt);
    // adjust base and pre-bias by -2 * wordSize so we can pre-increment
    add(loop_base, base, (remainder - 2) * wordSize);
    bind(loop);
    sub(cnt_reg, cnt_reg, 2 * unroll);
    for (i = 1; i < unroll; i++)
      stp(zr, zr, Address(loop_base, 2 * i * wordSize));
    stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
    cbnz(cnt_reg, loop);
  }
  BLOCK_COMMENT("} zero_words");
}

// Zero blocks of memory by using DC ZVA.
//
// Aligns the base address first sufficiently for DC ZVA, then uses
// DC ZVA repeatedly for every full block.  cnt is the size to be
// zeroed in HeapWords.  Returns the count of words left to be zeroed
// in cnt.
//
// NOTE: This is intended to be used in the zero_blocks() stub.  If
// you want to use it elsewhere, note that cnt must be >= 2*zva_length.
void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
  Register tmp = rscratch1;
  Register tmp2 = rscratch2;
  int zva_length = VM_Version::zva_length();
  Label initial_table_end, loop_zva;
  Label fini;

  // Base must be 16-byte aligned.  If not, just return and let the
  // caller handle it.
  tst(base, 0x0f);
  br(Assembler::NE, fini);
  // Align base with ZVA length.
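  // What the next few instructions compute, roughly (a reading of the
  // code, not a separate spec): tmp = (-base) & (zva_length - 1) is the
  // number of bytes needed to reach ZVA alignment; base is advanced by
  // tmp and cnt reduced by tmp/8 words, and the computed branch below
  // lands in the stp table so that exactly tmp/16 of the paired stores
  // execute, zeroing the bytes just below the newly aligned base.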
  neg(tmp, base);
  andr(tmp, tmp, zva_length - 1);

  // tmp: the number of bytes to be filled to align the base with ZVA length.
  add(base, base, tmp);
  sub(cnt, cnt, tmp, Assembler::ASR, 3);
  adr(tmp2, initial_table_end);
  sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
  br(tmp2);

  for (int i = -zva_length + 16; i < 0; i += 16)
    stp(zr, zr, Address(base, i));
  bind(initial_table_end);

  sub(cnt, cnt, zva_length >> 3);
  bind(loop_zva);
  dc(Assembler::ZVA, base);
  subs(cnt, cnt, zva_length >> 3);
  add(base, base, zva_length);
  br(Assembler::GE, loop_zva);
  add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
  bind(fini);
}

// base:  Address of a buffer to be filled, 8-byte aligned.
// cnt:   Count in 8-byte units.
// value: Value to fill the buffer with.
// base will point to the end of the buffer after filling.
void MacroAssembler::fill_words(Register base, Register cnt, Register value)
{
//  Algorithm:
//
//    scratch1 = cnt & 7;
//    cnt -= scratch1;
//    p += scratch1;
//    switch (scratch1) {
//      do {
//        cnt -= 8;
//          p[-8] = v;
//        case 7:
//          p[-7] = v;
//        case 6:
//          p[-6] = v;
//          // ...
//        case 1:
//          p[-1] = v;
//        case 0:
//          p += 8;
//      } while (cnt);
//    }

  assert_different_registers(base, cnt, value, rscratch1, rscratch2);

  Label fini, skip, entry, loop;
  const int unroll = 8; // Number of stp instructions we'll unroll

  cbz(cnt, fini);
  tbz(base, 3, skip);
  str(value, Address(post(base, 8)));
  sub(cnt, cnt, 1);
  bind(skip);

  andr(rscratch1, cnt, (unroll-1) * 2);
  sub(cnt, cnt, rscratch1);
  add(base, base, rscratch1, Assembler::LSL, 3);
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
  br(rscratch2);

  bind(loop);
  add(base, base, unroll * 16);
  for (int i = -unroll; i < 0; i++)
    stp(value, value, Address(base, i * 16));
  bind(entry);
  subs(cnt, cnt, unroll * 2);
  br(Assembler::GE, loop);

  tbz(cnt, 0, fini);
  str(value, Address(post(base, 8)));
  bind(fini);
}

// Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
// java/lang/StringUTF16.compress.
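//
// A sketch of the contract in C-like pseudocode (the SIMD code below
// implements this 32 and 8 chars at a time; jchar/jbyte used loosely):
//
//   int i;
//   for (i = 0; i < len; i++) {
//     jchar c = src[i];
//     if (c > 0xff) break;   // char doesn't fit in a byte: stop here
//     dst[i] = (jbyte)c;
//   }
//   return i;                // == len iff every char was encoded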
void MacroAssembler::encode_iso_array(Register src, Register dst,
                                      Register len, Register result,
                                      FloatRegister Vtmp1, FloatRegister Vtmp2,
                                      FloatRegister Vtmp3, FloatRegister Vtmp4)
{
  Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
      NEXT_32_START, NEXT_32_PRFM_START;
  Register tmp1 = rscratch1, tmp2 = rscratch2;

  mov(result, len); // Save initial len

  cmp(len, (u1)8); // handle shortest strings first
  br(LT, LOOP_1);
  cmp(len, (u1)32);
  br(LT, NEXT_8);
  // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
  // to convert chars to bytes
  if (SoftwarePrefetchHintDistance >= 0) {
    ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
    subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
    br(LE, NEXT_32_START);
    b(NEXT_32_PRFM_START);
    BIND(NEXT_32_PRFM);
    ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
    BIND(NEXT_32_PRFM_START);
    prfm(Address(src, SoftwarePrefetchHintDistance));
    orr(v4, T16B, Vtmp1, Vtmp2);
    orr(v5, T16B, Vtmp3, Vtmp4);
    uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
    uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
    uzp2(v5, T16B, v4, v5); // high bytes
    umov(tmp2, v5, D, 1);
    fmovd(tmp1, v5);
    orr(tmp1, tmp1, tmp2);
    cbnz(tmp1, LOOP_8);
    stpq(Vtmp1, Vtmp3, dst);
    sub(len, len, 32);
    add(dst, dst, 32);
    add(src, src, 64);
    subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
    br(GE, NEXT_32_PRFM);
    cmp(len, (u1)32);
    br(LT, LOOP_8);
    BIND(NEXT_32);
    ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
    BIND(NEXT_32_START);
  } else {
    BIND(NEXT_32);
    ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
  }
  prfm(Address(src, SoftwarePrefetchHintDistance));
  uzp1(v4, T16B, Vtmp1, Vtmp2);
  uzp1(v5, T16B, Vtmp3, Vtmp4);
  orr(Vtmp1, T16B, Vtmp1, Vtmp2);
  orr(Vtmp3, T16B, Vtmp3, Vtmp4);
  uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
  umov(tmp2, Vtmp1, D, 1);
  fmovd(tmp1, Vtmp1);
  orr(tmp1, tmp1, tmp2);
  cbnz(tmp1, LOOP_8);
  stpq(v4, v5, dst);
  sub(len, len, 32);
  add(dst, dst, 32);
  add(src, src, 64);
  cmp(len, (u1)32);
  br(GE, NEXT_32);
  cbz(len, DONE);

  BIND(LOOP_8);
  cmp(len, (u1)8);
  br(LT, LOOP_1);
  BIND(NEXT_8);
  ld1(Vtmp1, T8H, src);
  uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
  uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
  fmovd(tmp1, Vtmp3);
  cbnz(tmp1, NEXT_1);
  strd(Vtmp2, dst);

  sub(len, len, 8);
  add(dst, dst, 8);
  add(src, src, 16);
  cmp(len, (u1)8);
  br(GE, NEXT_8);

  BIND(LOOP_1);

  cbz(len, DONE);
  BIND(NEXT_1);
  ldrh(tmp1, Address(post(src, 2)));
  tst(tmp1, 0xff00);
  br(NE, SET_RESULT);
  strb(tmp1, Address(post(dst, 1)));
  subs(len, len, 1);
  br(GT, NEXT_1);

  BIND(SET_RESULT);
  sub(result, result, len); // Return the index where we stopped;
                            // len == 0 here means we processed all
                            // the characters.
  BIND(DONE);
}


// Inflate byte[] array to char[].
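//
// Roughly, the effect (a sketch; the code below does this 8 bytes at a
// time with zip1 against a zeroed vector register):
//
//   for (int i = 0; i < len; i++)
//     dst[i] = (jchar)(src[i] & 0xff);   // zero-extend each byte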
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
                                        FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                        Register tmp4) {
  Label big, done, after_init, to_stub;

  assert_different_registers(src, dst, len, tmp4, rscratch1);

  fmovd(vtmp1, zr);
  lsrw(tmp4, len, 3);
  bind(after_init);
  cbnzw(tmp4, big);
  // Short string: less than 8 bytes.
  {
    Label loop, tiny;

    cmpw(len, 4);
    br(LT, tiny);
    // Use SIMD to do 4 bytes.
    ldrs(vtmp2, post(src, 4));
    zip1(vtmp3, T8B, vtmp2, vtmp1);
    subw(len, len, 4);
    strd(vtmp3, post(dst, 8));

    cbzw(len, done);

    // Do the remaining bytes one at a time.
    bind(loop);
    ldrb(tmp4, post(src, 1));
    strh(tmp4, post(dst, 2));
    subw(len, len, 1);

    bind(tiny);
    cbnz(len, loop);

    b(done);
  }

  if (SoftwarePrefetchHintDistance >= 0) {
    bind(to_stub);
    RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
    assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
    trampoline_call(stub);
    b(after_init);
  }

  // Unpack the bytes 8 at a time.
  bind(big);
  {
    Label loop, around, loop_last, loop_start;

    if (SoftwarePrefetchHintDistance >= 0) {
      const int large_loop_threshold = (64 + 16)/8;
      ldrd(vtmp2, post(src, 8));
      andw(len, len, 7);
      cmp(tmp4, (u1)large_loop_threshold);
      br(GE, to_stub);
      b(loop_start);

      bind(loop);
      ldrd(vtmp2, post(src, 8));
      bind(loop_start);
      subs(tmp4, tmp4, 1);
      br(EQ, loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      ldrd(vtmp3, post(src, 8));
      st1(vtmp2, T8H, post(dst, 16));
      subs(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp3, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      br(NE, loop);
      b(around);
      bind(loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      st1(vtmp2, T8H, post(dst, 16));
      bind(around);
      cbz(len, done);
    } else {
      andw(len, len, 7);
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      sub(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp2, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      cbnz(tmp4, loop);
    }
  }

  // Do the tail of up to 8 bytes.
  add(src, src, len);
  ldrd(vtmp3, Address(src, -8));
  add(dst, dst, len, ext::uxtw, 1);
  zip1(vtmp3, T16B, vtmp3, vtmp1);
  strq(vtmp3, Address(dst, -16));

  bind(done);
}

// Compress char[] array to byte[].
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
                                         FloatRegister tmp1Reg, FloatRegister tmp2Reg,
                                         FloatRegister tmp3Reg, FloatRegister tmp4Reg,
                                         Register result) {
  encode_iso_array(src, dst, len, result,
                   tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
  // encode_iso_array leaves len == 0 iff all characters were compressed;
  // return 0 (failure) otherwise.
  cmp(len, zr);
  csel(result, result, zr, EQ);
}

// get_thread() can be called anywhere inside generated code so we
// need to save whatever non-callee save context might get clobbered
// by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
// the call setup code.
//
// aarch64_get_thread_helper() clobbers only r0, r1, and flags.
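// Hence saved_regs in the code below is just r0..r1 plus lr, minus dst
// when dst is one of those, rather than the full caller-saved set (a
// note derived from the code, not a separate contract).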
//
void MacroAssembler::get_thread(Register dst) {
  RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
  push(saved_regs, sp);

  mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
  blr(lr);
  if (dst != c_rarg0) {
    mov(dst, c_rarg0);
  }

  pop(saved_regs, sp);
}

// C2 compiled method's prolog code
// Moved here from aarch64.ad to support the Valhalla code below
void MacroAssembler::verified_entry(Compile* C, int sp_inc) {

  // n.b. frame size includes space for return pc and rfp
  const long framesize = C->frame_size_in_bytes();
  assert(framesize % (2 * wordSize) == 0, "must preserve 2 * wordSize alignment");

  // insert a nop at the start of the prolog so we can patch in a
  // branch if we need to invalidate the method later
  nop();

  int bangsize = C->bang_size_in_bytes();
  if (C->need_stack_bang(bangsize) && UseStackBanging)
    generate_stack_overflow_check(bangsize);

  build_frame(framesize);

  if (NotifySimulator) {
    notify(Assembler::method_entry);
  }

  if (VerifyStackAtCalls) {
    Unimplemented();
  }
}

int MacroAssembler::store_value_type_fields_to_buf(ciValueKlass* vk, bool from_interpreter) {
  // A value type might be returned.  If fields are in registers we
  // need to allocate a value type instance and initialize it with
  // the value of the fields.
  Label skip;
  // We only need a new buffered value if a new one is not returned
  cmp(r0, (u1) 1);
  br(Assembler::EQ, skip);
  int call_offset = -1;

  Label slow_case;

  // Try to allocate a new buffered value (from the heap)
  if (UseTLAB) {

    if (vk != NULL) {
      // Called from C1, where the return type is statically known.
      mov(r1, (intptr_t)vk->get_ValueKlass());
      jint lh = vk->layout_helper();
      assert(lh != Klass::_lh_neutral_value, "inline class in return type must have been resolved");
      mov(r14, lh);
    } else {
      // Call from interpreter.  R0 contains ((the ValueKlass* of the return type) | 0x01)
      andr(r1, r0, -2);
      // get obj size
      ldrw(r14, Address(rscratch1 /*klass*/, Klass::layout_helper_offset()));
    }

    ldr(r13, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));

    // check whether we have space in the TLAB: r13 points at the newly
    // allocated obj, r14 at its end
    lea(r14, Address(r13, r14));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));

    cmp(r14, rscratch1);
    br(Assembler::GT, slow_case);

    // OK, we have room in the TLAB.
    // Set the new TLAB top.
    str(r14, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));

    // Set the mark word of the new object to the "always locked" pattern.
    mov(rscratch1, (uint64_t) markWord::always_locked_prototype().value());
    str(rscratch1, Address(r13, oopDesc::mark_offset_in_bytes()));

    store_klass_gap(r13, zr);  // zero klass gap for compressed oops
    if (vk == NULL) {
      // store_klass corrupts r1, so save it in r0 for later use (interpreter case only).
      mov(r0, r1);
    }

    store_klass(r13, r1);  // klass

    if (vk != NULL) {
      // FIXME -- do the packing in-line to avoid the runtime call
      mov(r0, r13);
      far_call(RuntimeAddress(vk->pack_handler())); // no need for call info as this will not safepoint.
    } else {

      // We have our new buffered value; initialize its fields with a
      // value-class-specific handler.
      ldr(r1, Address(r0, InstanceKlass::adr_valueklass_fixed_block_offset()));
      ldr(r1, Address(r1, ValueKlass::pack_handler_offset()));

      // Move the new buffered value to r0 and call the pack handler.
      mov(r0, r13);
      blr(r1);
    }
    b(skip);
  }

  bind(slow_case);
  // We failed to allocate a new value, so fall back to a runtime
  // call.  Some oop fields may be live in registers, but we can't
  // tell which; the runtime call will take care of preserving them
  // across a GC if there is one.


  if (from_interpreter) {
    super_call_VM_leaf(StubRoutines::store_value_type_fields_to_buf());
  } else {
    ldr(rscratch1, RuntimeAddress(StubRoutines::store_value_type_fields_to_buf()));
    blr(rscratch1);
    call_offset = offset();
  }

  bind(skip);
  return call_offset;
}

// Move a value between registers/stack slots and update the reg_state
bool MacroAssembler::move_helper(VMReg from, VMReg to, BasicType bt, RegState reg_state[], int ret_off, int extra_stack_offset) {
  if (reg_state[to->value()] == reg_written) {
    return true; // Already written
  }

  if (from != to && bt != T_VOID) {
    if (reg_state[to->value()] == reg_readonly) {
      return false; // Not yet writable
    }
    if (from->is_reg()) {
      if (to->is_reg()) {
        mov(to->as_Register(), from->as_Register());
      } else {
        int st_off = to->reg2stack() * VMRegImpl::stack_slot_size + extra_stack_offset;
        Address to_addr = Address(sp, st_off);
        if (from->is_FloatRegister()) {
          if (bt == T_DOUBLE) {
            strd(from->as_FloatRegister(), to_addr);
          } else {
            assert(bt == T_FLOAT, "must be float");
            strs(from->as_FloatRegister(), to_addr);
          }
        } else {
          str(from->as_Register(), to_addr);
        }
      }
    } else {
      Address from_addr = Address(sp, from->reg2stack() * VMRegImpl::stack_slot_size + extra_stack_offset);
      if (to->is_reg()) {
        if (to->is_FloatRegister()) {
          if (bt == T_DOUBLE) {
            ldrd(to->as_FloatRegister(), from_addr);
          } else {
            assert(bt == T_FLOAT, "must be float");
            ldrs(to->as_FloatRegister(), from_addr);
          }
        } else {
          ldr(to->as_Register(), from_addr);
        }
      } else {
        int st_off = to->reg2stack() * VMRegImpl::stack_slot_size + extra_stack_offset;
        ldr(rscratch1, from_addr);
        str(rscratch1, Address(sp, st_off));
      }
    }
  }

  // Update register states
  reg_state[from->value()] = reg_writable;
  reg_state[to->value()] = reg_written;
  return true;
}

// Read all fields from a value type oop and store the values in registers/stack slots
bool MacroAssembler::unpack_value_helper(const GrowableArray<SigEntry>* sig, int& sig_index, VMReg from, VMRegPair* regs_to,
                                         int& to_index, RegState reg_state[], int ret_off, int extra_stack_offset) {
  Register fromReg = from->is_reg() ? from->as_Register() : noreg;
  assert(sig->at(sig_index)._bt == T_VOID, "should be at end delimiter");

  int vt = 1;
  bool done = true;
  bool mark_done = true;
  do {
    sig_index--;
    BasicType bt = sig->at(sig_index)._bt;
    if (bt == T_VALUETYPE) {
      vt--;
    } else if (bt == T_VOID &&
               sig->at(sig_index-1)._bt != T_LONG &&
               sig->at(sig_index-1)._bt != T_DOUBLE) {
      vt++;
    } else if (SigEntry::is_reserved_entry(sig, sig_index)) {
      to_index--; // Ignore this
    } else {
      assert(to_index >= 0, "invalid to_index");
      VMRegPair pair_to = regs_to[to_index--];
      VMReg to = pair_to.first();

      if (bt == T_VOID) continue;

      int idx = (int) to->value();
      if (reg_state[idx] == reg_readonly) {
        if (idx != from->value()) {
          mark_done = false;
        }
        done = false;
        continue;
      } else if (reg_state[idx] == reg_written) {
        continue;
      } else {
        assert(reg_state[idx] == reg_writable, "must be writable");
        reg_state[idx] = reg_written;
      }

      if (fromReg == noreg) {
        int st_off = from->reg2stack() * VMRegImpl::stack_slot_size + extra_stack_offset;
        ldr(rscratch2, Address(sp, st_off));
        fromReg = rscratch2;
      }

      int off = sig->at(sig_index)._offset;
      assert(off > 0, "offset in object should be positive");
      bool is_oop = (bt == T_OBJECT || bt == T_ARRAY);

      Address fromAddr = Address(fromReg, off);
      bool is_signed = (bt != T_CHAR) && (bt != T_BOOLEAN);

      if (!to->is_FloatRegister()) {

        Register dst = to->is_stack() ? rscratch1 : to->as_Register();

        if (is_oop) {
          load_heap_oop(dst, fromAddr);
        } else {
          load_sized_value(dst, fromAddr, type2aelembytes(bt), is_signed);
        }
        if (to->is_stack()) {
          int st_off = to->reg2stack() * VMRegImpl::stack_slot_size + extra_stack_offset;
          str(dst, Address(sp, st_off));
        }
      } else {
        if (bt == T_DOUBLE) {
          ldrd(to->as_FloatRegister(), fromAddr);
        } else {
          assert(bt == T_FLOAT, "must be float");
          ldrs(to->as_FloatRegister(), fromAddr);
        }
      }

    }

  } while (vt != 0);

  if (mark_done && reg_state[from->value()] != reg_written) {
    // This is okay because no one else will write to that slot
    reg_state[from->value()] = reg_writable;
  }
  return done;
}

// Pack fields back into a value type oop
bool MacroAssembler::pack_value_helper(const GrowableArray<SigEntry>* sig, int& sig_index, int vtarg_index,
                                       VMReg to, VMRegPair* regs_from, int regs_from_count, int& from_index, RegState reg_state[],
                                       int ret_off, int extra_stack_offset) {
  assert(sig->at(sig_index)._bt == T_VALUETYPE, "should be at end delimiter");
  assert(to->is_valid(), "must be");

  if (reg_state[to->value()] == reg_written) {
    skip_unpacked_fields(sig, sig_index, regs_from, regs_from_count, from_index);
    return true; // Already written
  }

  Register val_array = r0;
  Register val_obj_tmp = r11;
  Register from_reg_tmp = r10;
  Register tmp1 = r14;
  Register tmp2 = r13;
  Register tmp3 = r1;
  Register val_obj = to->is_stack() ? val_obj_tmp : to->as_Register();

  if (reg_state[to->value()] == reg_readonly) {
    if (!is_reg_in_unpacked_fields(sig, sig_index, to, regs_from, regs_from_count, from_index)) {
      skip_unpacked_fields(sig, sig_index, regs_from, regs_from_count, from_index);
      return false; // Not yet writable
    }
    val_obj = val_obj_tmp;
  }

  int index = arrayOopDesc::base_offset_in_bytes(T_OBJECT) + vtarg_index * type2aelembytes(T_VALUETYPE);
  load_heap_oop(val_obj, Address(val_array, index));

  ScalarizedValueArgsStream stream(sig, sig_index, regs_from, regs_from_count, from_index);
  VMRegPair from_pair;
  BasicType bt;

  while (stream.next(from_pair, bt)) {
    int off = sig->at(stream.sig_cc_index())._offset;
    assert(off > 0, "offset in object should be positive");
    bool is_oop = (bt == T_OBJECT || bt == T_ARRAY);
    size_t size_in_bytes = is_java_primitive(bt) ? type2aelembytes(bt) : wordSize;

    VMReg from_r1 = from_pair.first();
    VMReg from_r2 = from_pair.second();

    // Pack the scalarized field into the value object.
    Address dst(val_obj, off);

    if (!from_r1->is_FloatRegister()) {
      Register from_reg;
      if (from_r1->is_stack()) {
        from_reg = from_reg_tmp;
        int ld_off = from_r1->reg2stack() * VMRegImpl::stack_slot_size + extra_stack_offset;
        load_sized_value(from_reg, Address(sp, ld_off), size_in_bytes, /* is_signed */ false);
      } else {
        from_reg = from_r1->as_Register();
      }

      if (is_oop) {
        DecoratorSet decorators = IN_HEAP | ACCESS_WRITE;
        store_heap_oop(dst, from_reg, tmp1, tmp2, tmp3, decorators);
      } else {
        store_sized_value(dst, from_reg, size_in_bytes);
      }
    } else {
      if (from_r2->is_valid()) {
        strd(from_r1->as_FloatRegister(), dst);
      } else {
        strs(from_r1->as_FloatRegister(), dst);
      }
    }

    reg_state[from_r1->value()] = reg_writable;
  }
  sig_index = stream.sig_cc_index();
  from_index = stream.regs_cc_index();

  assert(reg_state[to->value()] == reg_writable, "must have already been read");
  bool success = move_helper(val_obj->as_VMReg(), to, T_OBJECT, reg_state, ret_off, extra_stack_offset);
  assert(success, "to register must be writable");

  return true;
}

// Unpack all value type arguments passed as oops
void MacroAssembler::unpack_value_args(Compile* C, bool receiver_only) {
  int sp_inc = unpack_value_args_common(C, receiver_only);
  // Emit code for verified entry and save increment for stack repair on return
  verified_entry(C, sp_inc);
}

int MacroAssembler::shuffle_value_args(bool is_packing, bool receiver_only, int extra_stack_offset,
                                       BasicType* sig_bt, const GrowableArray<SigEntry>* sig_cc,
                                       int args_passed, int args_on_stack, VMRegPair* regs,            // from
                                       int args_passed_to, int args_on_stack_to, VMRegPair* regs_to) { // to
  // Check if we need to extend the stack for packing/unpacking
  int sp_inc = (args_on_stack_to - args_on_stack) * VMRegImpl::stack_slot_size;
  if (sp_inc > 0) {
    sp_inc = align_up(sp_inc, StackAlignmentInBytes);
    if (!is_packing) {
      // Save the return address, adjust the stack (make sure it is properly
      // 16-byte aligned) and copy the return address to the new top of the stack.
      // (Note: C1 does this in C1_MacroAssembler::scalarized_entry).
      // FIXME: We don't need to preserve the return address on aarch64.
      pop(rscratch1);
      sub(sp, sp, sp_inc);
      push(rscratch1);
    }
  } else {
    // The scalarized calling convention needs less stack space than the
    // unscalarized one.  No need to extend the stack; the caller will
    // take care of these adjustments.
    sp_inc = 0;
  }

  int ret_off; // make sure we don't overwrite the return address
  if (is_packing) {
    // For C1 code, the VVEP doesn't have reserved slots, so we store the
    // return address at sp[0] during shuffling.
    ret_off = 0;
  } else {
    // C2 code ensures that sp_inc is a reserved slot.
    ret_off = sp_inc;
  }

  return shuffle_value_args_common(is_packing, receiver_only, extra_stack_offset,
                                   sig_bt, sig_cc,
                                   args_passed, args_on_stack, regs,
                                   args_passed_to, args_on_stack_to, regs_to,
                                   sp_inc, ret_off);
}

VMReg MacroAssembler::spill_reg_for(VMReg reg) {
  return (reg->is_FloatRegister()) ? v0->as_VMReg() : r14->as_VMReg();
}

void MacroAssembler::cache_wb(Address line) {
  assert(line.getMode() == Address::base_plus_offset, "mode should be base_plus_offset");
  assert(line.index() == noreg, "index should be noreg");
  assert(line.offset() == 0, "offset should be 0");
  // would like to assert this
  // assert(line._ext.shift == 0, "shift should be zero");
  if (VM_Version::supports_dcpop()) {
    // writeback using clean virtual address to point of persistence
    dc(Assembler::CVAP, line.base());
  } else {
    // no need to generate anything as Unsafe.writebackMemory should
    // never invoke this stub
  }
}

void MacroAssembler::cache_wbsync(bool is_pre) {
  // we only need a barrier post-sync
  if (!is_pre) {
    membar(Assembler::AnyAny);
  }
}
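
// A typical driver for the two primitives above (a sketch of how a
// writeback stub might use them, with line_size standing in for the
// platform's data cache line size; the names here are illustrative,
// not the actual stub):
//
//   cache_wbsync(true);                // pre-sync: a no-op on aarch64
//   for (address a = start; a < end; a += line_size)
//     cache_wb(Address(a));            // clean one cache line to PoP
//   cache_wbsync(false);               // post-sync: full barrier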