/*
 * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "jvm.h"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/cardTableBarrierSet.hpp"
#include "interpreter/interpreter.hpp"
#include "compiler/disassembler.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/accessDecorators.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/jniHandles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/thread.hpp"
#ifdef COMPILER1
#include "c1/c1_LIRAssembler.hpp"
#endif
#ifdef COMPILER2
#include "oops/oop.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/node.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
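//
// All AArch64 instructions are exactly 4 bytes wide, so the PC-relative
// branch and literal forms handled below encode their displacement in
// words; that is why the byte distance is shifted right by 2 before it
// is patched in.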
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target - branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      u_int64_t dest = (u_int64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
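      //
      // As a worked example (illustrative addresses only): for an adrp at
      // 0x...10040 reaching a target at 0x...50f08, adr_page - pc_page
      // selects the target's 4K page and offset_lo (0xf08) is patched into
      // the dependent ldr/str or add; for loads and stores it is scaled
      // down by the access size first.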
      //
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
          Instruction_aarch64::extract(insn, 4, 0) ==
          Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
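  //
  // The two sequences patched here are, schematically:
  //   narrow: movz Rx, #hi16, lsl 16 ; movk Rx, #lo16
  //   wide:   movz Rx, #lo16 ; movk Rx, #mid16, lsl 16 ; movk Rx, #hi16, lsl 32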
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = CompressedOops::encode((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
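      // (This is the decoding mirror of the patching logic in
      // pd_patch_instruction_size above.)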
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
          Instruction_aarch64::extract(insn, 4, 0) ==
          Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
            Instruction_aarch64::extract(insn, 4, 0) ==
            Instruction_aarch64::extract(insn2, 4, 0)) {
          target_page = (target_page & 0xffffffff) |
                        ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    u_int32_t *insns = (u_int32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    return 0;
  } else {
    ShouldNotReachHere();
  }
  return address(((uint64_t)insn_addr + (offset << 2)));
}

void MacroAssembler::safepoint_poll(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    unsigned long offset;
    adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
    ldrw(rscratch1, Address(rscratch1, offset));
    assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
    cbnz(rscratch1, slow_path);
  }
}

// Just like safepoint_poll, but use an acquiring load for thread-
// local polling.
//
// We need an acquire here to ensure that any subsequent load of the
// global SafepointSynchronize::_state flag is ordered after this load
// of the local Thread::_polling_page.  We don't want this poll to
// return false (i.e. not safepointing) and a later poll of the global
// SafepointSynchronize::_state spuriously to return true.
//
// This is to avoid a race when we're in a native->Java transition
// racing the code which wakes up from a safepoint.
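//
// On AArch64 that acquire is provided by the ldar below: it orders all
// subsequent loads after the load of the polling word.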
//
void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
    ldar(rscratch1, rscratch1);
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    safepoint_poll(slow_path);
  }
}

void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp & sp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object.  When leaving C land, the last Java fp
// has to be reset to 0.  This is required to allow proper stack traversal.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
    str(last_java_pc, Address(rthread,
                              JavaThread::frame_anchor_offset()
                              + JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  assert(last_java_pc != NULL, "must provide a valid PC");

  adr(scratch, last_java_pc);
  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, scratch);
  }
}

void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4Gb (see the assert above).
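    // adrp materialises the 4K page of the target; the low 12 bits come
    // back in 'offset' and are added in explicitly below.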
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    blr(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}

void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far jump not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4Gb (see the assert above).
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    br(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}

void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;

  ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
  cmp(sp, rscratch1);
  br(Assembler::LO, no_reserved_zone_enabling);

  enter();   // LR and FP are live.
  lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
  mov(c_rarg0, rthread);
  blr(rscratch1);
  leave();

  // We have already removed our own frame.
  // throw_delayed_StackOverflowError will think that it's been
  // called by our caller.
  lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
  br(rscratch1);
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markWord::age_shift == markWord::lock_bits + markWord::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markWord::biased_lock_mask_in_place);
  cmp(tmp_reg, (u1)markWord::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
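  // For reference, the low end of a biased mark word looks like this
  // (simplified sketch; markWord.hpp has the authoritative layout):
  //   [JavaThread* thread | epoch:2 | age:4 | biased_lock:1 | lock:2]
  // The orr/eor/andr sequence below compares thread, epoch and the bias
  // pattern against the prototype header in one go, ignoring only age.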
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  andr(tmp_reg, tmp_reg, ~((int) markWord::age_mask_in_place));
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markWord::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markWord::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markWord::biased_lock_mask_in_place | markWord::age_mask_in_place | markWord::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1, rscratch2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markWord::biased_lock_mask_in_place);
  cmp(temp_reg, (u1)markWord::biased_lock_pattern);
  br(Assembler::EQ, done);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result,   "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline.  If the code cache is small
// trampolines won't be emitted.
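//
// An AArch64 bl can only reach +/-128MB, so when far_branches() is true
// the call is routed through a stub in the stub section that carries the
// full 64-bit destination (see emit_trampoline_stub below).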

address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  // We need a trampoline if branches are far.
  if (far_branches()) {
    bool in_scratch_emit_size = false;
#ifdef COMPILER2
    // We don't want to emit a trampoline if C2 is generating dummy
    // code during its branch shortening phase.
    CompileTask* task = ciEnv::current()->task();
    in_scratch_emit_size =
      (task != NULL && is_c2_compile(task->comp_level()) &&
       Compile::current()->in_scratch_emit_size());
#endif
    if (!in_scratch_emit_size) {
      address stub = emit_trampoline_stub(offset(), entry.target());
      if (stub == NULL) {
        return NULL; // CodeCache is full
      }
    }
  }

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
  if (!far_branches()) {
    bl(entry.target());
  } else {
    bl(pc());
  }
  // just need to return a non-null address
  return pc();
}


// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)

address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  // Max stub size: alignment nop, TrampolineStub.
  address stub = start_a_stub(NativeInstruction::instruction_size
                              + NativeCallTrampolineStub::instruction_size);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call
  // - call
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}

void MacroAssembler::emit_static_call_stub() {
  // CompiledDirectStaticCall::set_to_interpreted knows the
  // exact layout of this stub.

  isb();
  mov_metadata(rmethod, (Metadata*)NULL);

  // Jump to the entry point of the i2c stub.
  movptr(rscratch1, 0);
  br(rscratch1);
}

void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only!
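  //
  // tst sets the flags from x & 0xff and cset then writes exactly
  // 0 or 1, so the result is a canonical boolean.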
  tst(x, 0xff);
  cset(x, Assembler::NE);
}

address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step = itableOffsetEntry::size() * wordSize;
  int vte_size = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
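    // (itableMethodEntry is a single word on this port, which is what
    // justifies the lsl(3) word scaling used here; see the assert below.)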
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel) break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  if (return_method) {
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result,
        form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
  }
}

void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);
  cmp(super_klass, rscratch1); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    subs(zr, super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// scans count pointer sized words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

// scans count 4 byte words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
                                 Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldrw(scratch, post(addr, wordSize));
  cmpw(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  BLOCK_COMMENT("check_klass_subtype_slow_path");

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
  assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)

  RegSet pushed_registers;
  if (!IS_A_TEMP(r2))    pushed_registers += r2;
  if (!IS_A_TEMP(r5))    pushed_registers += r5;

  if (super_klass != r0 || UseCompressedOops) {
    if (!IS_A_TEMP(r0))   pushed_registers += r0;
  }

  push(pushed_registers, sp);

  // Get super_klass value into r0 (even if it was in r5 or r2).
  if (super_klass != r0) {
    mov(r0, super_klass);
  }

#ifndef PRODUCT
  mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
  Address pst_counter_addr(rscratch2);
  ldr(rscratch1, pst_counter_addr);
  add(rscratch1, rscratch1, 1);
  str(rscratch1, pst_counter_addr);
#endif //PRODUCT

  // We will consult the secondary-super array.
  ldr(r5, secondary_supers_addr);
  // Load the array length.
  ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  add(r5, r5, Array<Klass*>::base_offset_in_bytes());

  cmp(sp, zr); // Clear Z flag; SP is never zero
  // Scan R2 words at [R5] for an occurrence of R0.
  // Set NZ/Z based on last compare.
  repne_scan(r5, r0, r2, rscratch1);

  // Unspill the temp. registers:
  pop(pushed_registers, sp);

  br(Assembler::NE, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  str(super_klass, super_cache_addr);

  if (L_success != &L_fallthrough) {
    b(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}

void MacroAssembler::clinit_barrier(Register klass, Register scratch, Label* L_fast_path, Label* L_slow_path) {
  assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
  assert_different_registers(klass, rthread, scratch);

  Label L_fallthrough, L_tmp;
  if (L_fast_path == NULL) {
    L_fast_path = &L_fallthrough;
  } else if (L_slow_path == NULL) {
    L_slow_path = &L_fallthrough;
  }
  // Fast path check: class is fully initialized
  ldrb(scratch, Address(klass, InstanceKlass::init_state_offset()));
  subs(zr, scratch, InstanceKlass::fully_initialized);
  br(Assembler::EQ, *L_fast_path);

  // Fast path check: current thread is initializer thread
  ldr(scratch, Address(klass, InstanceKlass::init_thread_offset()));
  cmp(rthread, scratch);

  if (L_slow_path == &L_fallthrough) {
    br(Assembler::EQ, *L_fast_path);
    bind(*L_slow_path);
  } else if (L_fast_path == &L_fallthrough) {
    br(Assembler::NE, *L_slow_path);
    bind(*L_fast_path);
  } else {
    Unimplemented();
  }
}

void MacroAssembler::verify_oop(Register reg, const char* s) {
  if (!VerifyOops) return;

  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s", reg->name(), s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  mov(r0, reg);
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop");
}

void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops) return;

  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s", s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop_addr {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  // addr may contain sp so we will have to adjust it based on the
  // pushes that we just did.
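  // (The two stp instructions above pushed two register pairs, i.e.
  // 4 * wordSize bytes; that is the rebasing applied below.)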
  if (addr.uses(sp)) {
    lea(r0, addr);
    ldr(r0, Address(r0, 4 * wordSize));
  } else {
    ldr(r0, addr);
  }
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop_addr");
}

Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
                                         int extra_slot_offset) {
  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
  int stackElementSize = Interpreter::stackElementSize;
  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
#ifdef ASSERT
  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
  assert(offset1 - offset == stackElementSize, "correct arithmetic");
#endif
  if (arg_slot.is_constant()) {
    return Address(esp, arg_slot.as_constant() * stackElementSize
                   + offset);
  } else {
    add(rscratch1, esp, arg_slot.as_register(),
        ext::uxtx, exact_log2(stackElementSize));
    return Address(rscratch1, offset);
  }
}

void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments,
                                       Label *retaddr) {
  Label E, L;

  stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));

  mov(rscratch1, entry_point);
  blr(rscratch1);
  if (retaddr)
    bind(*retaddr);

  ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
  maybe_isb();
}

void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  call_VM_leaf_base(entry_point, number_of_arguments);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
                                  Register arg_1, Register arg_2) {
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  pass_arg2(this, arg_2);
  call_VM_leaf_base(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
  assert(arg_0 != c_rarg3, "smashed arg");
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 4);
}

void MacroAssembler::null_check(Register reg, int offset) {
  if (needs_explicit_null_check(offset)) {
    // provoke OS NULL exception if reg = NULL by
    // accessing M[reg] w/o changing any registers
    // NOTE: this is plenty to provoke a segv
    ldr(zr, Address(reg));
  } else {
    // nothing to do, (later) access of M[reg + offset]
    // will provoke OS NULL exception if reg = NULL
  }
}

// MacroAssembler protected routines needed to implement
// public methods

void MacroAssembler::mov(Register r, Address dest) {
  code_section()->relocate(pc(), dest.rspec());
  u_int64_t imm64 = (u_int64_t)dest.target();
  movptr(r, imm64);
}

// Move a constant pointer into r.  In AArch64 mode the virtual
// address space is 48 bits in size, so we only need three
// instructions to create a patchable instruction sequence that can
// reach anywhere.
void MacroAssembler::movptr(Register r, uintptr_t imm64) {
#ifndef PRODUCT
  {
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
    block_comment(buffer);
  }
#endif
  assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
  movz(r, imm64 & 0xffff);
  imm64 >>= 16;
  movk(r, imm64 & 0xffff, 16);
  imm64 >>= 16;
  movk(r, imm64 & 0xffff, 32);
}

// Macro to mov replicated immediate to vector register.
1532 // Vd will get the following values for different arrangements in T 1533 // imm32 == hex 000000gh T8B: Vd = ghghghghghghghgh 1534 // imm32 == hex 000000gh T16B: Vd = ghghghghghghghghghghghghghghghgh 1535 // imm32 == hex 0000efgh T4H: Vd = efghefghefghefgh 1536 // imm32 == hex 0000efgh T8H: Vd = efghefghefghefghefghefghefghefgh 1537 // imm32 == hex abcdefgh T2S: Vd = abcdefghabcdefgh 1538 // imm32 == hex abcdefgh T4S: Vd = abcdefghabcdefghabcdefghabcdefgh 1539 // T1D/T2D: invalid 1540 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) { 1541 assert(T != T1D && T != T2D, "invalid arrangement"); 1542 if (T == T8B || T == T16B) { 1543 assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)"); 1544 movi(Vd, T, imm32 & 0xff, 0); 1545 return; 1546 } 1547 u_int32_t nimm32 = ~imm32; 1548 if (T == T4H || T == T8H) { 1549 assert((imm32 & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)"); 1550 imm32 &= 0xffff; 1551 nimm32 &= 0xffff; 1552 } 1553 u_int32_t x = imm32; 1554 int movi_cnt = 0; 1555 int movn_cnt = 0; 1556 while (x) { if (x & 0xff) movi_cnt++; x >>= 8; } 1557 x = nimm32; 1558 while (x) { if (x & 0xff) movn_cnt++; x >>= 8; } 1559 if (movn_cnt < movi_cnt) imm32 = nimm32; 1560 unsigned lsl = 0; 1561 while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; } 1562 if (movn_cnt < movi_cnt) 1563 mvni(Vd, T, imm32 & 0xff, lsl); 1564 else 1565 movi(Vd, T, imm32 & 0xff, lsl); 1566 imm32 >>= 8; lsl += 8; 1567 while (imm32) { 1568 while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; } 1569 if (movn_cnt < movi_cnt) 1570 bici(Vd, T, imm32 & 0xff, lsl); 1571 else 1572 orri(Vd, T, imm32 & 0xff, lsl); 1573 lsl += 8; imm32 >>= 8; 1574 } 1575 } 1576 1577 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64) 1578 { 1579 #ifndef PRODUCT 1580 { 1581 char buffer[64]; 1582 snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64); 1583 block_comment(buffer); 1584 } 1585 #endif 1586 if (operand_valid_for_logical_immediate(false, imm64)) { 1587 orr(dst, zr, imm64); 1588 } else { 1589 // we can use a combination of MOVZ or MOVN with 1590 // MOVK to build up the constant 1591 u_int64_t imm_h[4]; 1592 int zero_count = 0; 1593 int neg_count = 0; 1594 int i; 1595 for (i = 0; i < 4; i++) { 1596 imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL); 1597 if (imm_h[i] == 0) { 1598 zero_count++; 1599 } else if (imm_h[i] == 0xffffL) { 1600 neg_count++; 1601 } 1602 } 1603 if (zero_count == 4) { 1604 // one MOVZ will do 1605 movz(dst, 0); 1606 } else if (neg_count == 4) { 1607 // one MOVN will do 1608 movn(dst, 0); 1609 } else if (zero_count == 3) { 1610 for (i = 0; i < 4; i++) { 1611 if (imm_h[i] != 0L) { 1612 movz(dst, (u_int32_t)imm_h[i], (i << 4)); 1613 break; 1614 } 1615 } 1616 } else if (neg_count == 3) { 1617 // one MOVN will do 1618 for (int i = 0; i < 4; i++) { 1619 if (imm_h[i] != 0xffffL) { 1620 movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4)); 1621 break; 1622 } 1623 } 1624 } else if (zero_count == 2) { 1625 // one MOVZ and one MOVK will do 1626 for (i = 0; i < 3; i++) { 1627 if (imm_h[i] != 0L) { 1628 movz(dst, (u_int32_t)imm_h[i], (i << 4)); 1629 i++; 1630 break; 1631 } 1632 } 1633 for (;i < 4; i++) { 1634 if (imm_h[i] != 0L) { 1635 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1636 } 1637 } 1638 } else if (neg_count == 2) { 1639 // one MOVN and one MOVK will do 1640 for (i = 0; i < 4; i++) { 1641 if (imm_h[i] != 0xffffL) { 1642 movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4)); 1643 i++; 1644 break; 1645 } 1646 } 1647 for (;i < 4; i++) { 1648 if 
(imm_h[i] != 0xffffL) { 1649 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1650 } 1651 } 1652 } else if (zero_count == 1) { 1653 // one MOVZ and two MOVKs will do 1654 for (i = 0; i < 4; i++) { 1655 if (imm_h[i] != 0L) { 1656 movz(dst, (u_int32_t)imm_h[i], (i << 4)); 1657 i++; 1658 break; 1659 } 1660 } 1661 for (;i < 4; i++) { 1662 if (imm_h[i] != 0x0L) { 1663 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1664 } 1665 } 1666 } else if (neg_count == 1) { 1667 // one MOVN and two MOVKs will do 1668 for (i = 0; i < 4; i++) { 1669 if (imm_h[i] != 0xffffL) { 1670 movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4)); 1671 i++; 1672 break; 1673 } 1674 } 1675 for (;i < 4; i++) { 1676 if (imm_h[i] != 0xffffL) { 1677 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1678 } 1679 } 1680 } else { 1681 // use a MOVZ and 3 MOVKs (makes it easier to debug) 1682 movz(dst, (u_int32_t)imm_h[0], 0); 1683 for (i = 1; i < 4; i++) { 1684 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1685 } 1686 } 1687 } 1688 } 1689 1690 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32) 1691 { 1692 #ifndef PRODUCT 1693 { 1694 char buffer[64]; 1695 snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32); 1696 block_comment(buffer); 1697 } 1698 #endif 1699 if (operand_valid_for_logical_immediate(true, imm32)) { 1700 orrw(dst, zr, imm32); 1701 } else { 1702 // we can use MOVZ, MOVN or two calls to MOVK to build up the 1703 // constant 1704 u_int32_t imm_h[2]; 1705 imm_h[0] = imm32 & 0xffff; 1706 imm_h[1] = ((imm32 >> 16) & 0xffff); 1707 if (imm_h[0] == 0) { 1708 movzw(dst, imm_h[1], 16); 1709 } else if (imm_h[0] == 0xffff) { 1710 movnw(dst, imm_h[1] ^ 0xffff, 16); 1711 } else if (imm_h[1] == 0) { 1712 movzw(dst, imm_h[0], 0); 1713 } else if (imm_h[1] == 0xffff) { 1714 movnw(dst, imm_h[0] ^ 0xffff, 0); 1715 } else { 1716 // use a MOVZ and MOVK (makes it easier to debug) 1717 movzw(dst, imm_h[0], 0); 1718 movkw(dst, imm_h[1], 16); 1719 } 1720 } 1721 } 1722 1723 // Form an address from base + offset in Rd. Rd may or may 1724 // not actually be used: you must use the Address that is returned. 1725 // It is up to you to ensure that the shift provided matches the size 1726 // of your data. 
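// For example, with shift == 3 (8-byte data) and byte_offset == 0x40018,
// the offset does not fit a scaled 12-bit immediate, so the code below
// splits it into two reachable parts:
//   add(Rd, base, 0x40000);    // 0x40 << 12 is a valid add/sub immediate
//   return Address(Rd, 0x18);  // 0x18 fits a scaled 12-bit ldr/str offset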
1727 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
1728 if (Address::offset_ok_for_immed(byte_offset, shift))
1729 // It fits; no need for any heroics
1730 return Address(base, byte_offset);
1731
1732 // Don't do anything clever with negative or misaligned offsets
1733 unsigned mask = (1 << shift) - 1;
1734 if (byte_offset < 0 || byte_offset & mask) {
1735 mov(Rd, byte_offset);
1736 add(Rd, base, Rd);
1737 return Address(Rd);
1738 }
1739
1740 // See if we can do this with two 12-bit offsets
1741 {
1742 unsigned long word_offset = byte_offset >> shift;
1743 unsigned long masked_offset = word_offset & 0xfff000;
1744 if (Address::offset_ok_for_immed(word_offset - masked_offset, 0)
1745 && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1746 add(Rd, base, masked_offset << shift);
1747 word_offset -= masked_offset;
1748 return Address(Rd, word_offset << shift);
1749 }
1750 }
1751
1752 // Do it the hard way
1753 mov(Rd, byte_offset);
1754 add(Rd, base, Rd);
1755 return Address(Rd);
1756 }
1757
1758 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
1759 if (UseLSE) {
1760 mov(tmp, 1);
1761 ldadd(Assembler::word, tmp, zr, counter_addr);
1762 return;
1763 }
1764 Label retry_load;
1765 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
1766 prfm(Address(counter_addr), PSTL1STRM);
1767 bind(retry_load);
1768 // flush and load exclusive from the memory location
1769 ldxrw(tmp, counter_addr);
1770 addw(tmp, tmp, 1);
1771 // if we store+flush with no intervening write tmp will be zero
1772 stxrw(tmp2, tmp, counter_addr);
1773 cbnzw(tmp2, retry_load);
1774 }
1775
1776
1777 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1778 bool want_remainder, Register scratch)
1779 {
1780 // Full implementation of Java idiv and irem. The function
1781 // returns the (pc) offset of the div instruction - may be needed
1782 // for implicit exceptions.
1783 //
1784 // constraint : ra/rb =/= scratch
1785 // normal case
1786 //
1787 // input : ra: dividend
1788 // rb: divisor
1789 //
1790 // result: either
1791 // quotient (= ra idiv rb)
1792 // remainder (= ra irem rb)
1793
1794 assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1795
1796 int idivl_offset = offset();
1797 if (! want_remainder) {
1798 sdivw(result, ra, rb);
1799 } else {
1800 sdivw(scratch, ra, rb);
1801 Assembler::msubw(result, scratch, rb, ra);
1802 }
1803
1804 return idivl_offset;
1805 }
1806
1807 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1808 bool want_remainder, Register scratch)
1809 {
1810 // Full implementation of Java ldiv and lrem. The function
1811 // returns the (pc) offset of the div instruction - may be needed
1812 // for implicit exceptions.
1813 //
1814 // constraint : ra/rb =/= scratch
1815 // normal case
1816 //
1817 // input : ra: dividend
1818 // rb: divisor
1819 //
1820 // result: either
1821 // quotient (= ra idiv rb)
1822 // remainder (= ra irem rb)
1823
1824 assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1825
1826 int idivq_offset = offset();
1827 if (!
want_remainder) { 1828 sdiv(result, ra, rb); 1829 } else { 1830 sdiv(scratch, ra, rb); 1831 Assembler::msub(result, scratch, rb, ra); 1832 } 1833 1834 return idivq_offset; 1835 } 1836 1837 void MacroAssembler::membar(Membar_mask_bits order_constraint) { 1838 address prev = pc() - NativeMembar::instruction_size; 1839 address last = code()->last_insn(); 1840 if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) { 1841 NativeMembar *bar = NativeMembar_at(prev); 1842 // We are merging two memory barrier instructions. On AArch64 we 1843 // can do this simply by ORing them together. 1844 bar->set_kind(bar->get_kind() | order_constraint); 1845 BLOCK_COMMENT("merged membar"); 1846 } else { 1847 code()->set_last_insn(pc()); 1848 dmb(Assembler::barrier(order_constraint)); 1849 } 1850 } 1851 1852 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) { 1853 if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) { 1854 merge_ldst(rt, adr, size_in_bytes, is_store); 1855 code()->clear_last_insn(); 1856 return true; 1857 } else { 1858 assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported."); 1859 const unsigned mask = size_in_bytes - 1; 1860 if (adr.getMode() == Address::base_plus_offset && 1861 (adr.offset() & mask) == 0) { // only supports base_plus_offset. 1862 code()->set_last_insn(pc()); 1863 } 1864 return false; 1865 } 1866 } 1867 1868 void MacroAssembler::ldr(Register Rx, const Address &adr) { 1869 // We always try to merge two adjacent loads into one ldp. 1870 if (!try_merge_ldst(Rx, adr, 8, false)) { 1871 Assembler::ldr(Rx, adr); 1872 } 1873 } 1874 1875 void MacroAssembler::ldrw(Register Rw, const Address &adr) { 1876 // We always try to merge two adjacent loads into one ldp. 1877 if (!try_merge_ldst(Rw, adr, 4, false)) { 1878 Assembler::ldrw(Rw, adr); 1879 } 1880 } 1881 1882 void MacroAssembler::str(Register Rx, const Address &adr) { 1883 // We always try to merge two adjacent stores into one stp. 1884 if (!try_merge_ldst(Rx, adr, 8, true)) { 1885 Assembler::str(Rx, adr); 1886 } 1887 } 1888 1889 void MacroAssembler::strw(Register Rw, const Address &adr) { 1890 // We always try to merge two adjacent stores into one stp. 1891 if (!try_merge_ldst(Rw, adr, 4, true)) { 1892 Assembler::strw(Rw, adr); 1893 } 1894 } 1895 1896 // MacroAssembler routines found actually to be needed 1897 1898 void MacroAssembler::push(Register src) 1899 { 1900 str(src, Address(pre(esp, -1 * wordSize))); 1901 } 1902 1903 void MacroAssembler::pop(Register dst) 1904 { 1905 ldr(dst, Address(post(esp, 1 * wordSize))); 1906 } 1907 1908 // Note: load_unsigned_short used to be called load_unsigned_word. 
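// Each of the loads below returns the code offset of the load instruction
// itself, so a caller can record it, e.g. when the load also serves as an
// implicit null check.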
1909 int MacroAssembler::load_unsigned_short(Register dst, Address src) { 1910 int off = offset(); 1911 ldrh(dst, src); 1912 return off; 1913 } 1914 1915 int MacroAssembler::load_unsigned_byte(Register dst, Address src) { 1916 int off = offset(); 1917 ldrb(dst, src); 1918 return off; 1919 } 1920 1921 int MacroAssembler::load_signed_short(Register dst, Address src) { 1922 int off = offset(); 1923 ldrsh(dst, src); 1924 return off; 1925 } 1926 1927 int MacroAssembler::load_signed_byte(Register dst, Address src) { 1928 int off = offset(); 1929 ldrsb(dst, src); 1930 return off; 1931 } 1932 1933 int MacroAssembler::load_signed_short32(Register dst, Address src) { 1934 int off = offset(); 1935 ldrshw(dst, src); 1936 return off; 1937 } 1938 1939 int MacroAssembler::load_signed_byte32(Register dst, Address src) { 1940 int off = offset(); 1941 ldrsbw(dst, src); 1942 return off; 1943 } 1944 1945 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) { 1946 switch (size_in_bytes) { 1947 case 8: ldr(dst, src); break; 1948 case 4: ldrw(dst, src); break; 1949 case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break; 1950 case 1: is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break; 1951 default: ShouldNotReachHere(); 1952 } 1953 } 1954 1955 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) { 1956 switch (size_in_bytes) { 1957 case 8: str(src, dst); break; 1958 case 4: strw(src, dst); break; 1959 case 2: strh(src, dst); break; 1960 case 1: strb(src, dst); break; 1961 default: ShouldNotReachHere(); 1962 } 1963 } 1964 1965 void MacroAssembler::decrementw(Register reg, int value) 1966 { 1967 if (value < 0) { incrementw(reg, -value); return; } 1968 if (value == 0) { return; } 1969 if (value < (1 << 12)) { subw(reg, reg, value); return; } 1970 /* else */ { 1971 guarantee(reg != rscratch2, "invalid dst for register decrement"); 1972 movw(rscratch2, (unsigned)value); 1973 subw(reg, reg, rscratch2); 1974 } 1975 } 1976 1977 void MacroAssembler::decrement(Register reg, int value) 1978 { 1979 if (value < 0) { increment(reg, -value); return; } 1980 if (value == 0) { return; } 1981 if (value < (1 << 12)) { sub(reg, reg, value); return; } 1982 /* else */ { 1983 assert(reg != rscratch2, "invalid dst for register decrement"); 1984 mov(rscratch2, (unsigned long)value); 1985 sub(reg, reg, rscratch2); 1986 } 1987 } 1988 1989 void MacroAssembler::decrementw(Address dst, int value) 1990 { 1991 assert(!dst.uses(rscratch1), "invalid dst for address decrement"); 1992 if (dst.getMode() == Address::literal) { 1993 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 1994 lea(rscratch2, dst); 1995 dst = Address(rscratch2); 1996 } 1997 ldrw(rscratch1, dst); 1998 decrementw(rscratch1, value); 1999 strw(rscratch1, dst); 2000 } 2001 2002 void MacroAssembler::decrement(Address dst, int value) 2003 { 2004 assert(!dst.uses(rscratch1), "invalid address for decrement"); 2005 if (dst.getMode() == Address::literal) { 2006 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 2007 lea(rscratch2, dst); 2008 dst = Address(rscratch2); 2009 } 2010 ldr(rscratch1, dst); 2011 decrement(rscratch1, value); 2012 str(rscratch1, dst); 2013 } 2014 2015 void MacroAssembler::incrementw(Register reg, int value) 2016 { 2017 if (value < 0) { decrementw(reg, -value); return; } 2018 if (value == 0) { return; } 2019 if (value < (1 << 12)) { 
addw(reg, reg, value); return; }
2020 /* else */ {
2021 assert(reg != rscratch2, "invalid dst for register increment");
2022 movw(rscratch2, (unsigned)value);
2023 addw(reg, reg, rscratch2);
2024 }
2025 }
2026
2027 void MacroAssembler::increment(Register reg, int value)
2028 {
2029 if (value < 0) { decrement(reg, -value); return; }
2030 if (value == 0) { return; }
2031 if (value < (1 << 12)) { add(reg, reg, value); return; }
2032 /* else */ {
2033 assert(reg != rscratch2, "invalid dst for register increment");
2034 movw(rscratch2, (unsigned)value);
2035 add(reg, reg, rscratch2);
2036 }
2037 }
2038
2039 void MacroAssembler::incrementw(Address dst, int value)
2040 {
2041 assert(!dst.uses(rscratch1), "invalid dst for address increment");
2042 if (dst.getMode() == Address::literal) {
2043 assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2044 lea(rscratch2, dst);
2045 dst = Address(rscratch2);
2046 }
2047 ldrw(rscratch1, dst);
2048 incrementw(rscratch1, value);
2049 strw(rscratch1, dst);
2050 }
2051
2052 void MacroAssembler::increment(Address dst, int value)
2053 {
2054 assert(!dst.uses(rscratch1), "invalid dst for address increment");
2055 if (dst.getMode() == Address::literal) {
2056 assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2057 lea(rscratch2, dst);
2058 dst = Address(rscratch2);
2059 }
2060 ldr(rscratch1, dst);
2061 increment(rscratch1, value);
2062 str(rscratch1, dst);
2063 }
2064
2065
2066 void MacroAssembler::pusha() {
2067 push(0x7fffffff, sp);
2068 }
2069
2070 void MacroAssembler::popa() {
2071 pop(0x7fffffff, sp);
2072 }
2073
2074 // Push lots of registers in the bit set supplied. Don't push sp.
2075 // Return the number of words pushed
2076 int MacroAssembler::push(unsigned int bitset, Register stack) {
2077 int words_pushed = 0;
2078
2079 // Scan bitset to accumulate register pairs
2080 unsigned char regs[32];
2081 int count = 0;
2082 for (int reg = 0; reg <= 30; reg++) {
2083 if (1 & bitset)
2084 regs[count++] = reg;
2085 bitset >>= 1;
2086 }
2087 regs[count++] = zr->encoding_nocheck();
2088 count &= ~1; // Only push an even number of regs
2089
2090 if (count) {
2091 stp(as_Register(regs[0]), as_Register(regs[1]),
2092 Address(pre(stack, -count * wordSize)));
2093 words_pushed += 2;
2094 }
2095 for (int i = 2; i < count; i += 2) {
2096 stp(as_Register(regs[i]), as_Register(regs[i+1]),
2097 Address(stack, i * wordSize));
2098 words_pushed += 2;
2099 }
2100
2101 assert(words_pushed == count, "oops, pushed != count");
2102
2103 return count;
2104 }
2105
2106 int MacroAssembler::pop(unsigned int bitset, Register stack) {
2107 int words_pushed = 0;
2108
2109 // Scan bitset to accumulate register pairs
2110 unsigned char regs[32];
2111 int count = 0;
2112 for (int reg = 0; reg <= 30; reg++) {
2113 if (1 & bitset)
2114 regs[count++] = reg;
2115 bitset >>= 1;
2116 }
2117 regs[count++] = zr->encoding_nocheck();
2118 count &= ~1;
2119
2120 for (int i = 2; i < count; i += 2) {
2121 ldp(as_Register(regs[i]), as_Register(regs[i+1]),
2122 Address(stack, i * wordSize));
2123 words_pushed += 2;
2124 }
2125 if (count) {
2126 ldp(as_Register(regs[0]), as_Register(regs[1]),
2127 Address(post(stack, count * wordSize)));
2128 words_pushed += 2;
2129 }
2130
2131 assert(words_pushed == count, "oops, pushed != count");
2132
2133 return count;
2134 }
2135
2136 // Push lots of registers in the bit set supplied. Don't push sp.
2137 // Return the number of words pushed 2138 int MacroAssembler::push_fp(unsigned int bitset, Register stack) { 2139 int words_pushed = 0; 2140 2141 // Scan bitset to accumulate register pairs 2142 unsigned char regs[32]; 2143 int count = 0; 2144 for (int reg = 0; reg <= 31; reg++) { 2145 if (1 & bitset) 2146 regs[count++] = reg; 2147 bitset >>= 1; 2148 } 2149 regs[count++] = zr->encoding_nocheck(); 2150 count &= ~1; // Only push an even number of regs 2151 2152 // Always pushing full 128 bit registers. 2153 if (count) { 2154 stpq(as_FloatRegister(regs[0]), as_FloatRegister(regs[1]), Address(pre(stack, -count * wordSize * 2))); 2155 words_pushed += 2; 2156 } 2157 for (int i = 2; i < count; i += 2) { 2158 stpq(as_FloatRegister(regs[i]), as_FloatRegister(regs[i+1]), Address(stack, i * wordSize * 2)); 2159 words_pushed += 2; 2160 } 2161 2162 assert(words_pushed == count, "oops, pushed != count"); 2163 return count; 2164 } 2165 2166 int MacroAssembler::pop_fp(unsigned int bitset, Register stack) { 2167 int words_pushed = 0; 2168 2169 // Scan bitset to accumulate register pairs 2170 unsigned char regs[32]; 2171 int count = 0; 2172 for (int reg = 0; reg <= 31; reg++) { 2173 if (1 & bitset) 2174 regs[count++] = reg; 2175 bitset >>= 1; 2176 } 2177 regs[count++] = zr->encoding_nocheck(); 2178 count &= ~1; 2179 2180 for (int i = 2; i < count; i += 2) { 2181 ldpq(as_FloatRegister(regs[i]), as_FloatRegister(regs[i+1]), Address(stack, i * wordSize * 2)); 2182 words_pushed += 2; 2183 } 2184 if (count) { 2185 ldpq(as_FloatRegister(regs[0]), as_FloatRegister(regs[1]), Address(post(stack, count * wordSize * 2))); 2186 words_pushed += 2; 2187 } 2188 2189 assert(words_pushed == count, "oops, pushed != count"); 2190 2191 return count; 2192 } 2193 2194 #ifdef ASSERT 2195 void MacroAssembler::verify_heapbase(const char* msg) { 2196 #if 0 2197 assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed"); 2198 assert (Universe::heap() != NULL, "java heap should be initialized"); 2199 if (CheckCompressedOops) { 2200 Label ok; 2201 push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1 2202 cmpptr(rheapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr())); 2203 br(Assembler::EQ, ok); 2204 stop(msg); 2205 bind(ok); 2206 pop(1 << rscratch1->encoding(), sp); 2207 } 2208 #endif 2209 } 2210 #endif 2211 2212 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) { 2213 Label done, not_weak; 2214 cbz(value, done); // Use NULL as-is. 2215 2216 STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u); 2217 tbz(r0, 0, not_weak); // Test for jweak tag. 2218 2219 // Resolve jweak. 2220 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value, 2221 Address(value, -JNIHandles::weak_tag_value), tmp, thread); 2222 verify_oop(value); 2223 b(done); 2224 2225 bind(not_weak); 2226 // Resolve (untagged) jobject. 
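// An untagged (strong) jobject is a direct pointer to the oop slot in the
// handle, hence the zero offset in the load below.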
2227 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
2228 verify_oop(value);
2229 bind(done);
2230 }
2231
2232 void MacroAssembler::stop(const char* msg) {
2233 address ip = pc();
2234 pusha();
2235 mov(c_rarg0, (address)msg);
2236 mov(c_rarg1, (address)ip);
2237 mov(c_rarg2, sp);
2238 mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
2239 blr(c_rarg3);
2240 hlt(0);
2241 }
2242
2243 void MacroAssembler::warn(const char* msg) {
2244 pusha();
2245 mov(c_rarg0, (address)msg);
2246 mov(lr, CAST_FROM_FN_PTR(address, warning));
2247 blr(lr);
2248 popa();
2249 }
2250
2251 void MacroAssembler::unimplemented(const char* what) {
2252 const char* buf = NULL;
2253 {
2254 ResourceMark rm;
2255 stringStream ss;
2256 ss.print("unimplemented: %s", what);
2257 buf = code_string(ss.as_string());
2258 }
2259 stop(buf);
2260 }
2261
2262 // If a constant does not fit in an immediate field, generate some
2263 // number of MOV instructions and then perform the operation.
2264 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
2265 add_sub_imm_insn insn1,
2266 add_sub_reg_insn insn2) {
2267 assert(Rd != zr, "Rd = zr and not setting flags?");
2268 if (operand_valid_for_add_sub_immediate((int)imm)) {
2269 (this->*insn1)(Rd, Rn, imm);
2270 } else {
2271 if (uabs(imm) < (1 << 24)) {
2272 (this->*insn1)(Rd, Rn, imm & -(1 << 12));
2273 (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
2274 } else {
2275 assert_different_registers(Rd, Rn);
2276 mov(Rd, (uint64_t)imm);
2277 (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2278 }
2279 }
2280 }
2281
2282 // Separate version which sets the flags. Optimisations are more restricted
2283 // because we must set the flags correctly.
2284 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
2285 add_sub_imm_insn insn1,
2286 add_sub_reg_insn insn2) {
2287 if (operand_valid_for_add_sub_immediate((int)imm)) {
2288 (this->*insn1)(Rd, Rn, imm);
2289 } else {
2290 assert_different_registers(Rd, Rn);
2291 assert(Rd != zr, "overflow in immediate operand");
2292 mov(Rd, (uint64_t)imm);
2293 (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2294 }
2295 }
2296
2297
2298 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2299 if (increment.is_register()) {
2300 add(Rd, Rn, increment.as_register());
2301 } else {
2302 add(Rd, Rn, increment.as_constant());
2303 }
2304 }
2305
2306 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2307 if (increment.is_register()) {
2308 addw(Rd, Rn, increment.as_register());
2309 } else {
2310 addw(Rd, Rn, increment.as_constant());
2311 }
2312 }
2313
2314 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2315 if (decrement.is_register()) {
2316 sub(Rd, Rn, decrement.as_register());
2317 } else {
2318 sub(Rd, Rn, decrement.as_constant());
2319 }
2320 }
2321
2322 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2323 if (decrement.is_register()) {
2324 subw(Rd, Rn, decrement.as_register());
2325 } else {
2326 subw(Rd, Rn, decrement.as_constant());
2327 }
2328 }
2329
2330 void MacroAssembler::reinit_heapbase()
2331 {
2332 if (UseCompressedOops) {
2333 if (Universe::is_fully_initialized()) {
2334 mov(rheapbase, CompressedOops::ptrs_base());
2335 } else {
2336 lea(rheapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
2337 ldr(rheapbase, Address(rheapbase));
2338 }
2339 }
2340 }
2341
2342 // this simulates the behaviour of the x86 cmpxchg instruction using a
2343 // load linked/store conditional pair. we use the acquire/release
2344 // versions of these instructions so that we flush pending writes as
2345 // per Java semantics.
2346
2347 // n.b. the x86 version assumes the old value to be compared against is
2348 // in rax and updates rax with the value located in memory if the
2349 // cmpxchg fails. we supply a register for the old value explicitly
2350
2351 // the aarch64 load linked/store conditional instructions do not
2352 // accept an offset. so, unlike x86, we must provide a plain register
2353 // to identify the memory word to be compared/exchanged rather than a
2354 // register+offset Address.
2355
2356 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2357 Label &succeed, Label *fail) {
2358 // oldv holds comparison value
2359 // newv holds value to write in exchange
2360 // addr identifies memory word to compare against/update
2361 if (UseLSE) {
2362 mov(tmp, oldv);
2363 casal(Assembler::xword, oldv, newv, addr);
2364 cmp(tmp, oldv);
2365 br(Assembler::EQ, succeed);
2366 membar(AnyAny);
2367 } else {
2368 Label retry_load, nope;
2369 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2370 prfm(Address(addr), PSTL1STRM);
2371 bind(retry_load);
2372 // flush and load exclusive from the memory location
2373 // and fail if it is not what we expect
2374 ldaxr(tmp, addr);
2375 cmp(tmp, oldv);
2376 br(Assembler::NE, nope);
2377 // if we store+flush with no intervening write tmp will be zero
2378 stlxr(tmp, newv, addr);
2379 cbzw(tmp, succeed);
2380 // retry so we only ever return after a load fails to compare
2381 // ensures we don't return a stale value after a failed write.
2382 b(retry_load);
2383 // if the memory word differs we return it in oldv and signal a fail
2384 bind(nope);
2385 membar(AnyAny);
2386 mov(oldv, tmp);
2387 }
2388 if (fail)
2389 b(*fail);
2390 }
2391
2392 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2393 Label &succeed, Label *fail) {
2394 assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2395 cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2396 }
2397
2398 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2399 Label &succeed, Label *fail) {
2400 // oldv holds comparison value
2401 // newv holds value to write in exchange
2402 // addr identifies memory word to compare against/update
2403 // tmp returns 0/1 for success/failure
2404 if (UseLSE) {
2405 mov(tmp, oldv);
2406 casal(Assembler::word, oldv, newv, addr);
2407 cmp(tmp, oldv);
2408 br(Assembler::EQ, succeed);
2409 membar(AnyAny);
2410 } else {
2411 Label retry_load, nope;
2412 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2413 prfm(Address(addr), PSTL1STRM);
2414 bind(retry_load);
2415 // flush and load exclusive from the memory location
2416 // and fail if it is not what we expect
2417 ldaxrw(tmp, addr);
2418 cmp(tmp, oldv);
2419 br(Assembler::NE, nope);
2420 // if we store+flush with no intervening write tmp will be zero
2421 stlxrw(tmp, newv, addr);
2422 cbzw(tmp, succeed);
2423 // retry so we only ever return after a load fails to compare
2424 // ensures we don't return a stale value after a failed write.
2425 b(retry_load);
2426 // if the memory word differs we return it in oldv and signal a fail
2427 bind(nope);
2428 membar(AnyAny);
2429 mov(oldv, tmp);
2430 }
2431 if (fail)
2432 b(*fail);
2433 }
2434
2435 // A generic CAS; success or failure is in the EQ flag. A weak CAS
2436 // doesn't retry and may fail spuriously. If the oldval is wanted,
2437 // pass a register for the result, otherwise pass noreg.
2438
2439 // Clobbers rscratch1
2440 void MacroAssembler::cmpxchg(Register addr, Register expected,
2441 Register new_val,
2442 enum operand_size size,
2443 bool acquire, bool release,
2444 bool weak,
2445 Register result) {
2446 if (result == noreg) result = rscratch1;
2447 BLOCK_COMMENT("cmpxchg {");
2448 if (UseLSE) {
2449 mov(result, expected);
2450 lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
2451 compare_eq(result, expected, size);
2452 } else {
2453 Label retry_load, done;
2454 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2455 prfm(Address(addr), PSTL1STRM);
2456 bind(retry_load);
2457 load_exclusive(result, addr, size, acquire);
2458 compare_eq(result, expected, size);
2459 br(Assembler::NE, done);
2460 store_exclusive(rscratch1, new_val, addr, size, release);
2461 if (weak) {
2462 cmpw(rscratch1, 0u); // If the store fails, return NE to our caller.
2463 } else {
2464 cbnzw(rscratch1, retry_load);
2465 }
2466 bind(done);
2467 }
2468 BLOCK_COMMENT("} cmpxchg");
2469 }
2470
2471 // A generic comparison. Only compares for equality, clobbers rscratch1.
2472 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) {
2473 if (size == xword) {
2474 cmp(rm, rn);
2475 } else if (size == word) {
2476 cmpw(rm, rn);
2477 } else if (size == halfword) {
2478 eorw(rscratch1, rm, rn);
2479 ands(zr, rscratch1, 0xffff);
2480 } else if (size == byte) {
2481 eorw(rscratch1, rm, rn);
2482 ands(zr, rscratch1, 0xff);
2483 } else {
2484 ShouldNotReachHere();
2485 }
2486 }
2487
2488
2489 static bool different(Register a, RegisterOrConstant b, Register c) {
2490 if (b.is_constant())
2491 return a != c;
2492 else
2493 return a != b.as_register() && a != c && b.as_register() != c;
2494 }
2495
2496 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz) \
2497 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2498 if (UseLSE) { \
2499 prev = prev->is_valid() ? prev : zr; \
2500 if (incr.is_register()) { \
2501 AOP(sz, incr.as_register(), prev, addr); \
2502 } else { \
2503 mov(rscratch2, incr.as_constant()); \
2504 AOP(sz, rscratch2, prev, addr); \
2505 } \
2506 return; \
2507 } \
2508 Register result = rscratch2; \
2509 if (prev->is_valid()) \
2510 result = different(prev, incr, addr) ? prev : rscratch2; \
2511 \
2512 Label retry_load; \
2513 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) \
2514 prfm(Address(addr), PSTL1STRM); \
2515 bind(retry_load); \
2516 LDXR(result, addr); \
2517 OP(rscratch1, result, incr); \
2518 STXR(rscratch2, rscratch1, addr); \
2519 cbnzw(rscratch2, retry_load); \
2520 if (prev->is_valid() && prev != result) { \
2521 IOP(prev, rscratch1, incr); \
2522 } \
2523 }
2524
2525 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
2526 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
2527 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
2528 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)
2529
2530 #undef ATOMIC_OP
2531
2532 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz) \
2533 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2534 if (UseLSE) { \
2535 prev = prev->is_valid() ? prev : zr; \
2536 AOP(sz, newv, prev, addr); \
2537 return; \
2538 } \
2539 Register result = rscratch2; \
2540 if (prev->is_valid()) \
2541 result = different(prev, newv, addr) ? prev : rscratch2; \
2542 \
2543 Label retry_load; \
2544 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) \
2545 prfm(Address(addr), PSTL1STRM); \
2546 bind(retry_load); \
2547 LDXR(result, addr); \
2548 STXR(rscratch1, newv, addr); \
2549 cbnzw(rscratch1, retry_load); \
2550 if (prev->is_valid() && prev != result) \
2551 mov(prev, result); \
2552 }
2553
2554 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
2555 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
2556 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
2557 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)
2558
2559 #undef ATOMIC_XCHG
2560
2561 #ifndef PRODUCT
2562 extern "C" void findpc(intptr_t x);
2563 #endif
2564
2565 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2566 {
2567 // In order to get locks to work, we need to fake an in_VM state
2568 if (ShowMessageBoxOnError) {
2569 JavaThread* thread = JavaThread::current();
2570 JavaThreadState saved_state = thread->thread_state();
2571 thread->set_thread_state(_thread_in_vm);
2572 #ifndef PRODUCT
2573 if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2574 ttyLocker ttyl;
2575 BytecodeCounter::print();
2576 }
2577 #endif
2578 if (os::message_box(msg, "Execution stopped, print registers?")) {
2579 ttyLocker ttyl;
2580 tty->print_cr(" pc = 0x%016lx", pc);
2581 #ifndef PRODUCT
2582 tty->cr();
2583 findpc(pc);
2584 tty->cr();
2585 #endif
2586 tty->print_cr(" r0 = 0x%016lx", regs[0]);
2587 tty->print_cr(" r1 = 0x%016lx", regs[1]);
2588 tty->print_cr(" r2 = 0x%016lx", regs[2]);
2589 tty->print_cr(" r3 = 0x%016lx", regs[3]);
2590 tty->print_cr(" r4 = 0x%016lx", regs[4]);
2591 tty->print_cr(" r5 = 0x%016lx", regs[5]);
2592 tty->print_cr(" r6 = 0x%016lx", regs[6]);
2593 tty->print_cr(" r7 = 0x%016lx", regs[7]);
2594 tty->print_cr(" r8 = 0x%016lx", regs[8]);
2595 tty->print_cr(" r9 = 0x%016lx", regs[9]);
2596 tty->print_cr("r10 = 0x%016lx", regs[10]);
2597 tty->print_cr("r11 = 0x%016lx", regs[11]);
2598 tty->print_cr("r12 = 0x%016lx", regs[12]);
2599 tty->print_cr("r13 = 0x%016lx", regs[13]);
2600 tty->print_cr("r14 = 0x%016lx", regs[14]);
2601 tty->print_cr("r15 = 0x%016lx", regs[15]);
2602 tty->print_cr("r16 = 0x%016lx", regs[16]);
2603 tty->print_cr("r17 = 0x%016lx", regs[17]);
2604 tty->print_cr("r18 = 0x%016lx", regs[18]);
2605 tty->print_cr("r19 = 0x%016lx", regs[19]);
2606 tty->print_cr("r20 = 0x%016lx", regs[20]);
2607 tty->print_cr("r21 = 0x%016lx", regs[21]);
2608 tty->print_cr("r22 = 0x%016lx", regs[22]);
2609 tty->print_cr("r23 = 0x%016lx", regs[23]);
2610 tty->print_cr("r24 = 0x%016lx", regs[24]);
2611 tty->print_cr("r25 = 0x%016lx", regs[25]);
2612 tty->print_cr("r26 = 0x%016lx", regs[26]);
2613 tty->print_cr("r27 = 0x%016lx", regs[27]);
2614 tty->print_cr("r28 = 0x%016lx", regs[28]);
2615 tty->print_cr("r30 = 0x%016lx", regs[30]);
2616 tty->print_cr("r31 = 0x%016lx", regs[31]);
2617 BREAKPOINT;
2618 }
2619 }
2620 fatal("DEBUG MESSAGE: %s", msg);
2621 }
2622
2623 void MacroAssembler::push_call_clobbered_registers() {
2624 int step = 4 * wordSize;
2625 push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2626 sub(sp, sp, step);
2627 mov(rscratch1, -step);
2628 // Push v0-v7, v16-v31.
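// v8-v15 have callee-saved low halves under the AAPCS64, so any callee will
// preserve them for us; only the call-clobbered vector registers are saved.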
2629 for (int i = 31; i >= 4; i -= 4) {
2630 if (i <= v7->encoding() || i >= v16->encoding())
2631 st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
2632 as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
2633 }
2634 st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
2635 as_FloatRegister(3), T1D, Address(sp));
2636 }
2637
2638 void MacroAssembler::pop_call_clobbered_registers() {
2639 for (int i = 0; i < 32; i += 4) {
2640 if (i <= v7->encoding() || i >= v16->encoding())
2641 ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2642 as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
2643 }
2644
2645 pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2646 }
2647
2648 void MacroAssembler::push_CPU_state(bool save_vectors) {
2649 int step = (save_vectors ? 8 : 4) * wordSize;
2650 push(0x3fffffff, sp); // integer registers except lr & sp
2651 mov(rscratch1, -step);
2652 sub(sp, sp, step);
2653 for (int i = 28; i >= 4; i -= 4) {
2654 st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2655 as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
2656 }
2657 st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
2658 }
2659
2660 void MacroAssembler::pop_CPU_state(bool restore_vectors) {
2661 int step = (restore_vectors ? 8 : 4) * wordSize;
2662 for (int i = 0; i <= 28; i += 4)
2663 ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2664 as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
2665 pop(0x3fffffff, sp); // integer registers except lr & sp
2666 }
2667
2668 /**
2669 * Helpers for multiply_to_len().
2670 */
2671 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
2672 Register src1, Register src2) {
2673 adds(dest_lo, dest_lo, src1);
2674 adc(dest_hi, dest_hi, zr);
2675 adds(dest_lo, dest_lo, src2);
2676 adc(final_dest_hi, dest_hi, zr);
2677 }
2678
2679 // Generate an address from (r + r1 extend offset). "size" is the
2680 // size of the operand. The result may be in rscratch2.
2681 Address MacroAssembler::offsetted_address(Register r, Register r1,
2682 Address::extend ext, int offset, int size) {
2683 if (offset || (ext.shift() % size != 0)) {
2684 lea(rscratch2, Address(r, r1, ext));
2685 return Address(rscratch2, offset);
2686 } else {
2687 return Address(r, r1, ext);
2688 }
2689 }
2690
2691 Address MacroAssembler::spill_address(int size, int offset, Register tmp)
2692 {
2693 assert(offset >= 0, "spill to negative address?");
2694 // Offset reachable ?
2695 // Not aligned - 9 bits signed offset
2696 // Aligned - 12 bits unsigned offset shifted
2697 Register base = sp;
2698 if ((offset & (size-1)) && offset >= (1<<8)) {
2699 add(tmp, base, offset & ((1<<12)-1));
2700 base = tmp;
2701 offset &= -1u<<12;
2702 }
2703
2704 if (offset >= (1<<12) * size) {
2705 add(tmp, base, offset & (((1<<12)-1)<<12));
2706 base = tmp;
2707 offset &= ~(((1<<12)-1)<<12);
2708 }
2709
2710 return Address(base, offset);
2711 }
2712
2713 // Checks whether offset is aligned.
2714 // Returns true if it is, else false.
2715 bool MacroAssembler::merge_alignment_check(Register base,
2716 size_t size,
2717 long cur_offset,
2718 long prev_offset) const {
2719 if (AvoidUnalignedAccesses) {
2720 if (base == sp) {
2721 // Checks whether the low offset is aligned to a pair of registers.
2722 long pair_mask = size * 2 - 1;
2723 long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2724 return (offset & pair_mask) == 0;
2725 } else { // If base is not sp, we can't guarantee the access is aligned.
2726 return false;
2727 }
2728 } else {
2729 long mask = size - 1;
2730 // Load/store pair instruction only supports element size aligned offset.
2731 return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
2732 }
2733 }
2734
2735 // Checks whether current and previous loads/stores can be merged.
2736 // Returns true if it can be merged, else false.
2737 bool MacroAssembler::ldst_can_merge(Register rt,
2738 const Address &adr,
2739 size_t cur_size_in_bytes,
2740 bool is_store) const {
2741 address prev = pc() - NativeInstruction::instruction_size;
2742 address last = code()->last_insn();
2743
2744 if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
2745 return false;
2746 }
2747
2748 if (adr.getMode() != Address::base_plus_offset || prev != last) {
2749 return false;
2750 }
2751
2752 NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2753 size_t prev_size_in_bytes = prev_ldst->size_in_bytes();
2754
2755 assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
2756 assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");
2757
2758 if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
2759 return false;
2760 }
2761
2762 long max_offset = 63 * prev_size_in_bytes;
2763 long min_offset = -64 * prev_size_in_bytes;
2764
2765 assert(prev_ldst->is_not_pre_post_index(), "pre-index or post-index is not supported to be merged.");
2766
2767 // Only same base can be merged.
2768 if (adr.base() != prev_ldst->base()) {
2769 return false;
2770 }
2771
2772 long cur_offset = adr.offset();
2773 long prev_offset = prev_ldst->offset();
2774 size_t diff = abs(cur_offset - prev_offset);
2775 if (diff != prev_size_in_bytes) {
2776 return false;
2777 }
2778
2779 // Following cases cannot be merged:
2780 // ldr x2, [x2, #8]
2781 // ldr x3, [x2, #16]
2782 // or:
2783 // ldr x2, [x3, #8]
2784 // ldr x2, [x3, #16]
2785 // If t1 and t2 are the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
2786 if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
2787 return false;
2788 }
2789
2790 long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2791 // Offset range must be in ldp/stp instruction's range.
2792 if (low_offset > max_offset || low_offset < min_offset) {
2793 return false;
2794 }
2795
2796 if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
2797 return true;
2798 }
2799
2800 return false;
2801 }
2802
2803 // Merge current load/store with previous load/store into ldp/stp.
2804 void MacroAssembler::merge_ldst(Register rt,
2805 const Address &adr,
2806 size_t cur_size_in_bytes,
2807 bool is_store) {
2808
2809 assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store) == true, "cur and prev must be able to be merged.");
2810
2811 Register rt_low, rt_high;
2812 address prev = pc() - NativeInstruction::instruction_size;
2813 NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2814
2815 long offset;
2816
2817 if (adr.offset() < prev_ldst->offset()) {
2818 offset = adr.offset();
2819 rt_low = rt;
2820 rt_high = prev_ldst->target();
2821 } else {
2822 offset = prev_ldst->offset();
2823 rt_low = prev_ldst->target();
2824 rt_high = rt;
2825 }
2826
2827 Address adr_p = Address(prev_ldst->base(), offset);
2828 // Overwrite previous generated binary.
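// i.e. wind the code section back over the previous ldr/str so that the
// ldp/stp emitted below takes its place, covering both accesses.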
2829 code_section()->set_end(prev);
2830
2831 const int sz = prev_ldst->size_in_bytes();
2832 assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
2833 if (!is_store) {
2834 BLOCK_COMMENT("merged ldr pair");
2835 if (sz == 8) {
2836 ldp(rt_low, rt_high, adr_p);
2837 } else {
2838 ldpw(rt_low, rt_high, adr_p);
2839 }
2840 } else {
2841 BLOCK_COMMENT("merged str pair");
2842 if (sz == 8) {
2843 stp(rt_low, rt_high, adr_p);
2844 } else {
2845 stpw(rt_low, rt_high, adr_p);
2846 }
2847 }
2848 }
2849
2850 /**
2851 * Multiply 64 bit by 64 bit first loop.
2852 */
2853 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2854 Register y, Register y_idx, Register z,
2855 Register carry, Register product,
2856 Register idx, Register kdx) {
2857 //
2858 // jlong carry, x[], y[], z[];
2859 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2860 // huge_128 product = y[idx] * x[xstart] + carry;
2861 // z[kdx] = (jlong)product;
2862 // carry = (jlong)(product >>> 64);
2863 // }
2864 // z[xstart] = carry;
2865 //
2866
2867 Label L_first_loop, L_first_loop_exit;
2868 Label L_one_x, L_one_y, L_multiply;
2869
2870 subsw(xstart, xstart, 1);
2871 br(Assembler::MI, L_one_x);
2872
2873 lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
2874 ldr(x_xstart, Address(rscratch1));
2875 ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
2876
2877 bind(L_first_loop);
2878 subsw(idx, idx, 1);
2879 br(Assembler::MI, L_first_loop_exit);
2880 subsw(idx, idx, 1);
2881 br(Assembler::MI, L_one_y);
2882 lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2883 ldr(y_idx, Address(rscratch1));
2884 ror(y_idx, y_idx, 32); // convert big-endian to little-endian
2885 bind(L_multiply);
2886
2887 // AArch64 has a multiply-accumulate instruction that we can't use
2888 // here because it has no way to process carries, so we have to use
2889 // separate add and adc instructions. Bah.
2890 umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
2891 mul(product, x_xstart, y_idx);
2892 adds(product, product, carry);
2893 adc(carry, rscratch1, zr); // x_xstart * y_idx + carry -> carry:product
2894
2895 subw(kdx, kdx, 2);
2896 ror(product, product, 32); // back to big-endian
2897 str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
2898
2899 b(L_first_loop);
2900
2901 bind(L_one_y);
2902 ldrw(y_idx, Address(y, 0));
2903 b(L_multiply);
2904
2905 bind(L_one_x);
2906 ldrw(x_xstart, Address(x, 0));
2907 b(L_first_loop);
2908
2909 bind(L_first_loop_exit);
2910 }
2911
2912 /**
2913 * Multiply 128 bit by 128 bit. Unrolled inner loop.
2914 * 2915 */ 2916 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z, 2917 Register carry, Register carry2, 2918 Register idx, Register jdx, 2919 Register yz_idx1, Register yz_idx2, 2920 Register tmp, Register tmp3, Register tmp4, 2921 Register tmp6, Register product_hi) { 2922 2923 // jlong carry, x[], y[], z[]; 2924 // int kdx = ystart+1; 2925 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 2926 // huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry; 2927 // jlong carry2 = (jlong)(tmp3 >>> 64); 2928 // huge_128 tmp4 = (y[idx] * product_hi) + z[kdx+idx] + carry2; 2929 // carry = (jlong)(tmp4 >>> 64); 2930 // z[kdx+idx+1] = (jlong)tmp3; 2931 // z[kdx+idx] = (jlong)tmp4; 2932 // } 2933 // idx += 2; 2934 // if (idx > 0) { 2935 // yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry; 2936 // z[kdx+idx] = (jlong)yz_idx1; 2937 // carry = (jlong)(yz_idx1 >>> 64); 2938 // } 2939 // 2940 2941 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 2942 2943 lsrw(jdx, idx, 2); 2944 2945 bind(L_third_loop); 2946 2947 subsw(jdx, jdx, 1); 2948 br(Assembler::MI, L_third_loop_exit); 2949 subw(idx, idx, 4); 2950 2951 lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt))); 2952 2953 ldp(yz_idx2, yz_idx1, Address(rscratch1, 0)); 2954 2955 lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt))); 2956 2957 ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian 2958 ror(yz_idx2, yz_idx2, 32); 2959 2960 ldp(rscratch2, rscratch1, Address(tmp6, 0)); 2961 2962 mul(tmp3, product_hi, yz_idx1); // yz_idx1 * product_hi -> tmp4:tmp3 2963 umulh(tmp4, product_hi, yz_idx1); 2964 2965 ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian 2966 ror(rscratch2, rscratch2, 32); 2967 2968 mul(tmp, product_hi, yz_idx2); // yz_idx2 * product_hi -> carry2:tmp 2969 umulh(carry2, product_hi, yz_idx2); 2970 2971 // propagate sum of both multiplications into carry:tmp4:tmp3 2972 adds(tmp3, tmp3, carry); 2973 adc(tmp4, tmp4, zr); 2974 adds(tmp3, tmp3, rscratch1); 2975 adcs(tmp4, tmp4, tmp); 2976 adc(carry, carry2, zr); 2977 adds(tmp4, tmp4, rscratch2); 2978 adc(carry, carry, zr); 2979 2980 ror(tmp3, tmp3, 32); // convert little-endian to big-endian 2981 ror(tmp4, tmp4, 32); 2982 stp(tmp4, tmp3, Address(tmp6, 0)); 2983 2984 b(L_third_loop); 2985 bind (L_third_loop_exit); 2986 2987 andw (idx, idx, 0x3); 2988 cbz(idx, L_post_third_loop_done); 2989 2990 Label L_check_1; 2991 subsw(idx, idx, 2); 2992 br(Assembler::MI, L_check_1); 2993 2994 lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt))); 2995 ldr(yz_idx1, Address(rscratch1, 0)); 2996 ror(yz_idx1, yz_idx1, 32); 2997 mul(tmp3, product_hi, yz_idx1); // yz_idx1 * product_hi -> tmp4:tmp3 2998 umulh(tmp4, product_hi, yz_idx1); 2999 lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt))); 3000 ldr(yz_idx2, Address(rscratch1, 0)); 3001 ror(yz_idx2, yz_idx2, 32); 3002 3003 add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2); 3004 3005 ror(tmp3, tmp3, 32); 3006 str(tmp3, Address(rscratch1, 0)); 3007 3008 bind (L_check_1); 3009 3010 andw (idx, idx, 0x1); 3011 subsw(idx, idx, 1); 3012 br(Assembler::MI, L_post_third_loop_done); 3013 ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt))); 3014 mul(tmp3, tmp4, product_hi); // tmp4 * product_hi -> carry2:tmp3 3015 umulh(carry2, tmp4, product_hi); 3016 ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt))); 3017 3018 add2_with_carry(carry2, tmp3, tmp4, carry); 3019 3020 strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt))); 3021 
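// carry = (carry2:tmp3) >> 32, i.e. the low 32 bits of carry2 joined with
// the high 32 bits of tmp3.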
extr(carry, carry2, tmp3, 32);
3022
3023 bind(L_post_third_loop_done);
3024 }
3025
3026 /**
3027 * Code for BigInteger::multiplyToLen() intrinsic.
3028 *
3029 * r0: x
3030 * r1: xlen
3031 * r2: y
3032 * r3: ylen
3033 * r4: z
3034 * r5: zlen
3035 * r10: tmp1
3036 * r11: tmp2
3037 * r12: tmp3
3038 * r13: tmp4
3039 * r14: tmp5
3040 * r15: tmp6
3041 * r16: tmp7
3042 *
3043 */
3044 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
3045 Register z, Register zlen,
3046 Register tmp1, Register tmp2, Register tmp3, Register tmp4,
3047 Register tmp5, Register tmp6, Register product_hi) {
3048
3049 assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3050
3051 const Register idx = tmp1;
3052 const Register kdx = tmp2;
3053 const Register xstart = tmp3;
3054
3055 const Register y_idx = tmp4;
3056 const Register carry = tmp5;
3057 const Register product = xlen;
3058 const Register x_xstart = zlen; // reuse register
3059
3060 // First Loop.
3061 //
3062 // final static long LONG_MASK = 0xffffffffL;
3063 // int xstart = xlen - 1;
3064 // int ystart = ylen - 1;
3065 // long carry = 0;
3066 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3067 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
3068 // z[kdx] = (int)product;
3069 // carry = product >>> 32;
3070 // }
3071 // z[xstart] = (int)carry;
3072 //
3073
3074 movw(idx, ylen); // idx = ylen;
3075 movw(kdx, zlen); // kdx = xlen+ylen;
3076 mov(carry, zr); // carry = 0;
3077
3078 Label L_done;
3079
3080 movw(xstart, xlen);
3081 subsw(xstart, xstart, 1);
3082 br(Assembler::MI, L_done);
3083
3084 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3085
3086 Label L_second_loop;
3087 cbzw(kdx, L_second_loop);
3088
3089 Label L_carry;
3090 subw(kdx, kdx, 1);
3091 cbzw(kdx, L_carry);
3092
3093 strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3094 lsr(carry, carry, 32);
3095 subw(kdx, kdx, 1);
3096
3097 bind(L_carry);
3098 strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3099
3100 // Second and third (nested) loops.
3101 //
3102 // for (int i = xstart-1; i >= 0; i--) { // Second loop
3103 // carry = 0;
3104 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3105 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3106 // (z[k] & LONG_MASK) + carry;
3107 // z[k] = (int)product;
3108 // carry = product >>> 32;
3109 // }
3110 // z[i] = (int)carry;
3111 // }
3112 //
3113 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
3114
3115 const Register jdx = tmp1;
3116
3117 bind(L_second_loop);
3118 mov(carry, zr); // carry = 0;
3119 movw(jdx, ylen); // j = ystart+1
3120
3121 subsw(xstart, xstart, 1); // i = xstart-1;
3122 br(Assembler::MI, L_done);
3123
3124 str(z, Address(pre(sp, -4 * wordSize)));
3125
3126 Label L_last_x;
3127 lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
3128 subsw(xstart, xstart, 1); // i = xstart-1;
3129 br(Assembler::MI, L_last_x);
3130
3131 lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
3132 ldr(product_hi, Address(rscratch1));
3133 ror(product_hi, product_hi, 32); // convert big-endian to little-endian
3134
3135 Label L_third_loop_prologue;
3136 bind(L_third_loop_prologue);
3137
3138 str(ylen, Address(sp, wordSize));
3139 stp(x, xstart, Address(sp, 2 * wordSize));
3140 multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
3141 tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
3142 ldp(z, ylen, Address(post(sp, 2 * wordSize)));
3143 ldp(x, xlen, Address(post(sp, 2 * wordSize))); // copy old xstart -> xlen
3144
3145 addw(tmp3, xlen, 1);
3146 strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3147 subsw(tmp3, tmp3, 1);
3148 br(Assembler::MI, L_done);
3149
3150 lsr(carry, carry, 32);
3151 strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3152 b(L_second_loop);
3153
3154 // Next infrequent code is moved outside loops.
3155 bind(L_last_x);
3156 ldrw(product_hi, Address(x, 0));
3157 b(L_third_loop_prologue);
3158
3159 bind(L_done);
3160 }
3161
3162 // Code for BigInteger::mulAdd intrinsic
3163 // out = r0
3164 // in = r1
3165 // offset = r2 (already out.length-offset)
3166 // len = r3
3167 // k = r4
3168 //
3169 // pseudo code from java implementation:
3170 // carry = 0;
3171 // offset = out.length-offset - 1;
3172 // for (int j=len-1; j >= 0; j--) {
3173 // product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
3174 // out[offset--] = (int)product;
3175 // carry = product >>> 32;
3176 // }
3177 // return (int)carry;
3178 void MacroAssembler::mul_add(Register out, Register in, Register offset,
3179 Register len, Register k) {
3180 Label LOOP, END;
3181 // pre-loop
3182 cmp(len, zr); // cmp, not cbz/cbnz: to use condition twice => fewer branches
3183 csel(out, zr, out, Assembler::EQ);
3184 br(Assembler::EQ, END);
3185 add(in, in, len, LSL, 2); // in[j+1] address
3186 add(offset, out, offset, LSL, 2); // out[offset + 1] address
3187 mov(out, zr); // used to keep carry now
3188 BIND(LOOP);
3189 ldrw(rscratch1, Address(pre(in, -4)));
3190 madd(rscratch1, rscratch1, k, out);
3191 ldrw(rscratch2, Address(pre(offset, -4)));
3192 add(rscratch1, rscratch1, rscratch2);
3193 strw(rscratch1, Address(offset));
3194 lsr(out, rscratch1, 32);
3195 subs(len, len, 1);
3196 br(Assembler::NE, LOOP);
3197 BIND(END);
3198 }
3199
3200 /**
3201 * Emits code to update CRC-32 with a byte value according to constants in table
3202 *
3203 * @param [in,out]crc Register containing the crc.
3204 * @param [in]val Register containing the byte to fold into the CRC.
3205 * @param [in]table Register containing the table of crc constants.
3206 *
3207 * uint32_t crc;
3208 * val = crc_table[(val ^ crc) & 0xFF];
3209 * crc = val ^ (crc >> 8);
3210 *
3211 */
3212 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3213 eor(val, val, crc);
3214 andr(val, val, 0xff);
3215 ldrw(val, Address(table, val, Address::lsl(2)));
3216 eor(crc, val, crc, Assembler::LSR, 8);
3217 }
3218
3219 /**
3220 * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
3221 *
3222 * @param [in,out]crc Register containing the crc.
3223 * @param [in]v Register containing the 32-bit value to fold into the CRC.
3224 * @param [in]table0 Register containing table 0 of crc constants.
3225 * @param [in]table1 Register containing table 1 of crc constants.
3226 * @param [in]table2 Register containing table 2 of crc constants.
3227 * @param [in]table3 Register containing table 3 of crc constants.
3228 *
3229 * uint32_t crc;
3230 * v = crc ^ v
3231 * crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
3232 *
3233 */
3234 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
3235 Register table0, Register table1, Register table2, Register table3,
3236 bool upper) {
3237 eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
3238 uxtb(tmp, v);
3239 ldrw(crc, Address(table3, tmp, Address::lsl(2)));
3240 ubfx(tmp, v, 8, 8);
3241 ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
3242 eor(crc, crc, tmp);
3243 ubfx(tmp, v, 16, 8);
3244 ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
3245 eor(crc, crc, tmp);
3246 ubfx(tmp, v, 24, 8);
3247 ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
3248 eor(crc, crc, tmp);
3249 }
3250
3251 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
3252 Register len, Register tmp0, Register tmp1, Register tmp2,
3253 Register tmp3) {
3254 Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3255 assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3256
3257 mvnw(crc, crc);
3258
3259 subs(len, len, 128);
3260 br(Assembler::GE, CRC_by64_pre);
3261 BIND(CRC_less64);
3262 adds(len, len, 128-32);
3263 br(Assembler::GE, CRC_by32_loop);
3264 BIND(CRC_less32);
3265 adds(len, len, 32-4);
3266 br(Assembler::GE, CRC_by4_loop);
3267 adds(len, len, 4);
3268 br(Assembler::GT, CRC_by1_loop);
3269 b(L_exit);
3270
3271 BIND(CRC_by32_loop);
3272 ldp(tmp0, tmp1, Address(post(buf, 16)));
3273 subs(len, len, 32);
3274 crc32x(crc, crc, tmp0);
3275 ldr(tmp2, Address(post(buf, 8)));
3276 crc32x(crc, crc, tmp1);
3277 ldr(tmp3, Address(post(buf, 8)));
3278 crc32x(crc, crc, tmp2);
3279 crc32x(crc, crc, tmp3);
3280 br(Assembler::GE, CRC_by32_loop);
3281 cmn(len, 32);
3282 br(Assembler::NE, CRC_less32);
3283 b(L_exit);
3284
3285 BIND(CRC_by4_loop);
3286 ldrw(tmp0, Address(post(buf, 4)));
3287 subs(len, len, 4);
3288 crc32w(crc, crc, tmp0);
3289 br(Assembler::GE, CRC_by4_loop);
3290 adds(len, len, 4);
3291 br(Assembler::LE, L_exit);
3292 BIND(CRC_by1_loop);
3293 ldrb(tmp0, Address(post(buf, 1)));
3294 subs(len, len, 1);
3295 crc32b(crc, crc, tmp0);
3296 br(Assembler::GT, CRC_by1_loop);
3297 b(L_exit);
3298
3299 BIND(CRC_by64_pre);
3300 sub(buf, buf, 8);
3301 ldp(tmp0, tmp1, Address(buf, 8));
3302 crc32x(crc, crc, tmp0);
3303 ldr(tmp2, Address(buf, 24));
3304 crc32x(crc, crc, tmp1);
3305 ldr(tmp3, Address(buf, 32));
3306 crc32x(crc, crc, tmp2);
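// The loads below stay interleaved with the crc32x instructions to hide
// load-to-use latency.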

void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
        Register len, Register tmp0, Register tmp1, Register tmp2,
        Register tmp3) {
  Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
  assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);

  mvnw(crc, crc);

  subs(len, len, 128);
  br(Assembler::GE, CRC_by64_pre);
  BIND(CRC_less64);
  adds(len, len, 128 - 32);
  br(Assembler::GE, CRC_by32_loop);
  BIND(CRC_less32);
  adds(len, len, 32 - 4);
  br(Assembler::GE, CRC_by4_loop);
  adds(len, len, 4);
  br(Assembler::GT, CRC_by1_loop);
  b(L_exit);

  BIND(CRC_by32_loop);
  ldp(tmp0, tmp1, Address(post(buf, 16)));
  subs(len, len, 32);
  crc32x(crc, crc, tmp0);
  ldr(tmp2, Address(post(buf, 8)));
  crc32x(crc, crc, tmp1);
  ldr(tmp3, Address(post(buf, 8)));
  crc32x(crc, crc, tmp2);
  crc32x(crc, crc, tmp3);
  br(Assembler::GE, CRC_by32_loop);
  cmn(len, 32);
  br(Assembler::NE, CRC_less32);
  b(L_exit);

  BIND(CRC_by4_loop);
  ldrw(tmp0, Address(post(buf, 4)));
  subs(len, len, 4);
  crc32w(crc, crc, tmp0);
  br(Assembler::GE, CRC_by4_loop);
  adds(len, len, 4);
  br(Assembler::LE, L_exit);
  BIND(CRC_by1_loop);
  ldrb(tmp0, Address(post(buf, 1)));
  subs(len, len, 1);
  crc32b(crc, crc, tmp0);
  br(Assembler::GT, CRC_by1_loop);
  b(L_exit);

  BIND(CRC_by64_pre);
  sub(buf, buf, 8);
  ldp(tmp0, tmp1, Address(buf, 8));
  crc32x(crc, crc, tmp0);
  ldr(tmp2, Address(buf, 24));
  crc32x(crc, crc, tmp1);
  ldr(tmp3, Address(buf, 32));
  crc32x(crc, crc, tmp2);
  ldr(tmp0, Address(buf, 40));
  crc32x(crc, crc, tmp3);
  ldr(tmp1, Address(buf, 48));
  crc32x(crc, crc, tmp0);
  ldr(tmp2, Address(buf, 56));
  crc32x(crc, crc, tmp1);
  ldr(tmp3, Address(pre(buf, 64)));

  b(CRC_by64_loop);

  align(CodeEntryAlignment);
  BIND(CRC_by64_loop);
  subs(len, len, 64);
  crc32x(crc, crc, tmp2);
  ldr(tmp0, Address(buf, 8));
  crc32x(crc, crc, tmp3);
  ldr(tmp1, Address(buf, 16));
  crc32x(crc, crc, tmp0);
  ldr(tmp2, Address(buf, 24));
  crc32x(crc, crc, tmp1);
  ldr(tmp3, Address(buf, 32));
  crc32x(crc, crc, tmp2);
  ldr(tmp0, Address(buf, 40));
  crc32x(crc, crc, tmp3);
  ldr(tmp1, Address(buf, 48));
  crc32x(crc, crc, tmp0);
  ldr(tmp2, Address(buf, 56));
  crc32x(crc, crc, tmp1);
  ldr(tmp3, Address(pre(buf, 64)));
  br(Assembler::GE, CRC_by64_loop);

  // post-loop
  crc32x(crc, crc, tmp2);
  crc32x(crc, crc, tmp3);

  sub(len, len, 64);
  add(buf, buf, 8);
  cmn(len, 128);
  br(Assembler::NE, CRC_less64);
  BIND(L_exit);
  mvnw(crc, crc);
}
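
// Illustrative sketch (not part of the emitted code): the kernel above is an
// unrolled, software-pipelined version of the following C loop, which uses
// the ACLE CRC32 intrinsics (compile for a target with the CRC extension,
// e.g. -march=armv8-a+crc). The mvnw at entry and exit corresponds to the
// pre-/post-inversion of crc:
//
//   #include <stdint.h>
//   #include <stddef.h>
//   #include <arm_acle.h>
//
//   static uint32_t crc32_hw(uint32_t crc, const unsigned char *buf, size_t len) {
//     crc = ~crc;
//     for (; len >= 8; len -= 8, buf += 8) {
//       uint64_t v;
//       __builtin_memcpy(&v, buf, 8);   // unaligned-safe 64-bit load
//       crc = __crc32d(crc, v);
//     }
//     while (len--) crc = __crc32b(crc, *buf++);
//     return ~crc;
//   }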

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register that will contain address of CRC table
 * @param tmp   scratch register
 */
void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
        Register table0, Register table1, Register table2, Register table3,
        Register tmp, Register tmp2, Register tmp3) {
  Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
  unsigned long offset;

  if (UseCRC32) {
    kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
    return;
  }

  mvnw(crc, crc);

  adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
  if (offset) add(table0, table0, offset);
  add(table1, table0, 1 * 256 * sizeof(juint));
  add(table2, table0, 2 * 256 * sizeof(juint));
  add(table3, table0, 3 * 256 * sizeof(juint));

  if (UseNeon) {
    cmp(len, (u1)64);
    br(Assembler::LT, L_by16);
    eor(v16, T16B, v16, v16);

    Label L_fold;

    add(tmp, table0, 4 * 256 * sizeof(juint)); // Point at the Neon constants

    ld1(v0, v1, T2D, post(buf, 32));
    ld1r(v4, T2D, post(tmp, 8));
    ld1r(v5, T2D, post(tmp, 8));
    ld1r(v6, T2D, post(tmp, 8));
    ld1r(v7, T2D, post(tmp, 8));
    mov(v16, T4S, 0, crc);

    eor(v0, T16B, v0, v16);
    sub(len, len, 64);

    BIND(L_fold);
    pmull(v22, T8H, v0, v5, T8B);
    pmull(v20, T8H, v0, v7, T8B);
    pmull(v23, T8H, v0, v4, T8B);
    pmull(v21, T8H, v0, v6, T8B);

    pmull2(v18, T8H, v0, v5, T16B);
    pmull2(v16, T8H, v0, v7, T16B);
    pmull2(v19, T8H, v0, v4, T16B);
    pmull2(v17, T8H, v0, v6, T16B);

    uzp1(v24, T8H, v20, v22);
    uzp2(v25, T8H, v20, v22);
    eor(v20, T16B, v24, v25);

    uzp1(v26, T8H, v16, v18);
    uzp2(v27, T8H, v16, v18);
    eor(v16, T16B, v26, v27);

    ushll2(v22, T4S, v20, T8H, 8);
    ushll(v20, T4S, v20, T4H, 8);

    ushll2(v18, T4S, v16, T8H, 8);
    ushll(v16, T4S, v16, T4H, 8);

    eor(v22, T16B, v23, v22);
    eor(v18, T16B, v19, v18);
    eor(v20, T16B, v21, v20);
    eor(v16, T16B, v17, v16);

    uzp1(v17, T2D, v16, v20);
    uzp2(v21, T2D, v16, v20);
    eor(v17, T16B, v17, v21);

    ushll2(v20, T2D, v17, T4S, 16);
    ushll(v16, T2D, v17, T2S, 16);

    eor(v20, T16B, v20, v22);
    eor(v16, T16B, v16, v18);

    uzp1(v17, T2D, v20, v16);
    uzp2(v21, T2D, v20, v16);
    eor(v28, T16B, v17, v21);

    pmull(v22, T8H, v1, v5, T8B);
    pmull(v20, T8H, v1, v7, T8B);
    pmull(v23, T8H, v1, v4, T8B);
    pmull(v21, T8H, v1, v6, T8B);

    pmull2(v18, T8H, v1, v5, T16B);
    pmull2(v16, T8H, v1, v7, T16B);
    pmull2(v19, T8H, v1, v4, T16B);
    pmull2(v17, T8H, v1, v6, T16B);

    ld1(v0, v1, T2D, post(buf, 32));

    uzp1(v24, T8H, v20, v22);
    uzp2(v25, T8H, v20, v22);
    eor(v20, T16B, v24, v25);

    uzp1(v26, T8H, v16, v18);
    uzp2(v27, T8H, v16, v18);
    eor(v16, T16B, v26, v27);

    ushll2(v22, T4S, v20, T8H, 8);
    ushll(v20, T4S, v20, T4H, 8);

    ushll2(v18, T4S, v16, T8H, 8);
    ushll(v16, T4S, v16, T4H, 8);

    eor(v22, T16B, v23, v22);
    eor(v18, T16B, v19, v18);
    eor(v20, T16B, v21, v20);
    eor(v16, T16B, v17, v16);

    uzp1(v17, T2D, v16, v20);
    uzp2(v21, T2D, v16, v20);
    eor(v16, T16B, v17, v21);

    ushll2(v20, T2D, v16, T4S, 16);
    ushll(v16, T2D, v16, T2S, 16);

    eor(v20, T16B, v22, v20);
    eor(v16, T16B, v16, v18);

    uzp1(v17, T2D, v20, v16);
    uzp2(v21, T2D, v20, v16);
    eor(v20, T16B, v17, v21);

    shl(v16, T2D, v28, 1);
    shl(v17, T2D, v20, 1);

    eor(v0, T16B, v0, v16);
    eor(v1, T16B, v1, v17);

    subs(len, len, 32);
    br(Assembler::GE, L_fold);

    mov(crc, 0);
    mov(tmp, v0, T1D, 0);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
    mov(tmp, v0, T1D, 1);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
    mov(tmp, v1, T1D, 0);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
    mov(tmp, v1, T1D, 1);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);

    add(len, len, 32);
  }

  BIND(L_by16);
  subs(len, len, 16);
  br(Assembler::GE, L_by16_loop);
  adds(len, len, 16 - 4);
  br(Assembler::GE, L_by4_loop);
  adds(len, len, 4);
  br(Assembler::GT, L_by1_loop);
  b(L_exit);

  BIND(L_by4_loop);
  ldrw(tmp, Address(post(buf, 4)));
  update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
  subs(len, len, 4);
  br(Assembler::GE, L_by4_loop);
  adds(len, len, 4);
  br(Assembler::LE, L_exit);
  BIND(L_by1_loop);
  subs(len, len, 1);
  ldrb(tmp, Address(post(buf, 1)));
  update_byte_crc32(crc, tmp, table0);
  br(Assembler::GT, L_by1_loop);
  b(L_exit);

  align(CodeEntryAlignment);
  BIND(L_by16_loop);
  subs(len, len, 16);
  ldp(tmp, tmp3, Address(post(buf, 16)));
  update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
  update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
  update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
  update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
  br(Assembler::GE, L_by16_loop);
  adds(len, len, 16 - 4);
  br(Assembler::GE, L_by4_loop);
  adds(len, len, 4);
  br(Assembler::GT, L_by1_loop);
  BIND(L_exit);
  mvnw(crc, crc);
}

void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
        Register len, Register tmp0, Register tmp1, Register tmp2,
        Register tmp3) {
  Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
  assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);

  subs(len, len, 128);
  br(Assembler::GE, CRC_by64_pre);
  BIND(CRC_less64);
  adds(len, len, 128 - 32);
  br(Assembler::GE, CRC_by32_loop);
  BIND(CRC_less32);
  adds(len, len, 32 - 4);
  br(Assembler::GE, CRC_by4_loop);
  adds(len, len, 4);
  br(Assembler::GT, CRC_by1_loop);
  b(L_exit);

  BIND(CRC_by32_loop);
  ldp(tmp0, tmp1, Address(post(buf, 16)));
  subs(len, len, 32);
  crc32cx(crc, crc, tmp0);
  ldr(tmp2, Address(post(buf, 8)));
  crc32cx(crc, crc, tmp1);
  ldr(tmp3, Address(post(buf, 8)));
  crc32cx(crc, crc, tmp2);
  crc32cx(crc, crc, tmp3);
  br(Assembler::GE, CRC_by32_loop);
  cmn(len, 32);
  br(Assembler::NE, CRC_less32);
  b(L_exit);

  BIND(CRC_by4_loop);
  ldrw(tmp0, Address(post(buf, 4)));
  subs(len, len, 4);
  crc32cw(crc, crc, tmp0);
  br(Assembler::GE, CRC_by4_loop);
  adds(len, len, 4);
  br(Assembler::LE, L_exit);
  BIND(CRC_by1_loop);
  ldrb(tmp0, Address(post(buf, 1)));
  subs(len, len, 1);
  crc32cb(crc, crc, tmp0);
  br(Assembler::GT, CRC_by1_loop);
  b(L_exit);

  BIND(CRC_by64_pre);
  sub(buf, buf, 8);
  ldp(tmp0, tmp1, Address(buf, 8));
  crc32cx(crc, crc, tmp0);
  ldr(tmp2, Address(buf, 24));
  crc32cx(crc, crc, tmp1);
  ldr(tmp3, Address(buf, 32));
  crc32cx(crc, crc, tmp2);
  ldr(tmp0, Address(buf, 40));
  crc32cx(crc, crc, tmp3);
  ldr(tmp1, Address(buf, 48));
  crc32cx(crc, crc, tmp0);
  ldr(tmp2, Address(buf, 56));
  crc32cx(crc, crc, tmp1);
  ldr(tmp3, Address(pre(buf, 64)));

  b(CRC_by64_loop);

  align(CodeEntryAlignment);
  BIND(CRC_by64_loop);
  subs(len, len, 64);
  crc32cx(crc, crc, tmp2);
  ldr(tmp0, Address(buf, 8));
  crc32cx(crc, crc, tmp3);
  ldr(tmp1, Address(buf, 16));
  crc32cx(crc, crc, tmp0);
  ldr(tmp2, Address(buf, 24));
  crc32cx(crc, crc, tmp1);
  ldr(tmp3, Address(buf, 32));
  crc32cx(crc, crc, tmp2);
  ldr(tmp0, Address(buf, 40));
  crc32cx(crc, crc, tmp3);
  ldr(tmp1, Address(buf, 48));
  crc32cx(crc, crc, tmp0);
  ldr(tmp2, Address(buf, 56));
  crc32cx(crc, crc, tmp1);
  ldr(tmp3, Address(pre(buf, 64)));
  br(Assembler::GE, CRC_by64_loop);

  // post-loop
  crc32cx(crc, crc, tmp2);
  crc32cx(crc, crc, tmp3);

  sub(len, len, 64);
  add(buf, buf, 8);
  cmn(len, 128);
  br(Assembler::NE, CRC_less64);
  BIND(L_exit);
}

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register that will contain address of CRC table
 * @param tmp   scratch register
 */
void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
        Register table0, Register table1, Register table2, Register table3,
        Register tmp, Register tmp2, Register tmp3) {
  kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
}


SkipIfEqual::SkipIfEqual(
    MacroAssembler* masm, const bool* flag_addr, bool value) {
  _masm = masm;
  unsigned long offset;
  _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
  _masm->ldrb(rscratch1, Address(rscratch1, offset));
  _masm->cbzw(rscratch1, _label);
}

SkipIfEqual::~SkipIfEqual() {
  _masm->bind(_label);
}

void MacroAssembler::addptr(const Address &dst, int32_t src) {
  Address adr;
  switch(dst.getMode()) {
  case Address::base_plus_offset:
    // This is the expected mode, although we allow all the other
    // forms below.
    adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
    break;
  default:
    lea(rscratch2, dst);
    adr = Address(rscratch2);
    break;
  }
  ldr(rscratch1, adr);
  add(rscratch1, rscratch1, src);
  str(rscratch1, adr);
}

void MacroAssembler::cmpptr(Register src1, Address src2) {
  unsigned long offset;
  adrp(rscratch1, src2, offset);
  ldr(rscratch1, Address(rscratch1, offset));
  cmp(src1, rscratch1);
}

void MacroAssembler::cmpoop(Register obj1, Register obj2) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->obj_equals(this, obj1, obj2);
}

void MacroAssembler::load_method_holder(Register holder, Register method) {
  ldr(holder, Address(method, Method::const_offset()));                      // ConstMethod*
  ldr(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
  ldr(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass*
}

void MacroAssembler::load_klass(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
    decode_klass_not_null(dst);
  } else {
    ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
  }
}
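
// Illustrative sketch (not part of the emitted code): with compressed class
// pointers, load_klass is the moral equivalent of the following, where the
// narrowKlass is a 32-bit encoded form of the Klass* (names follow the
// HotSpot runtime; the decode step is spelled out in decode_klass_not_null
// further below):
//
//   Klass* load_klass_c(oopDesc* obj) {
//     if (UseCompressedClassPointers) {
//       narrowKlass nk = *(narrowKlass*)((char*)obj + oopDesc::klass_offset_in_bytes());
//       return CompressedKlassPointers::decode_not_null(nk);
//     } else {
//       return *(Klass**)((char*)obj + oopDesc::klass_offset_in_bytes());
//     }
//   }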

// ((OopHandle)result).resolve();
void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
  // OopHandle::resolve is an indirection.
  access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
}

void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
  const int mirror_offset = in_bytes(Klass::java_mirror_offset());
  ldr(dst, Address(rmethod, Method::const_offset()));
  ldr(dst, Address(dst, ConstMethod::constants_offset()));
  ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
  ldr(dst, Address(dst, mirror_offset));
  resolve_oop_handle(dst, tmp);
}

void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
  if (UseCompressedClassPointers) {
    ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
    if (CompressedKlassPointers::base() == NULL) {
      cmp(trial_klass, tmp, LSL, CompressedKlassPointers::shift());
      return;
    } else if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
               && CompressedKlassPointers::shift() == 0) {
      // Only the bottom 32 bits matter
      cmpw(trial_klass, tmp);
      return;
    }
    decode_klass_not_null(tmp);
  } else {
    ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
  }
  cmp(trial_klass, tmp);
}

void MacroAssembler::load_prototype_header(Register dst, Register src) {
  load_klass(dst, src);
  ldr(dst, Address(dst, Klass::prototype_header_offset()));
}

void MacroAssembler::store_klass(Register dst, Register src) {
  // FIXME: Should this be a store release? Concurrent GCs assume the
  // klass length is valid if the klass field is not null.
  if (UseCompressedClassPointers) {
    encode_klass_not_null(src);
    strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
  } else {
    str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
  }
}

void MacroAssembler::store_klass_gap(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    // Store to the klass gap in the destination
    strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
  }
}
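
// Illustrative sketch (not part of the emitted code): the encode/decode
// routines that follow implement the usual compressed-oops arithmetic. With
// base b (rheapbase) and shift s (LogMinObjAlignmentInBytes), and NULL
// mapping to narrow 0:
//
//   uint32_t  encode(uintptr_t oop)   { return oop == 0 ? 0 : (uint32_t)((oop - b) >> s); }
//   uintptr_t decode(uint32_t narrow) { return narrow == 0 ? 0 : b + ((uintptr_t)narrow << s); }
//
// When b == 0 (zero-based modes) the sub/add disappears, and when s == 0 the
// shift disappears; that is exactly the case analysis in encode_heap_oop and
// decode_heap_oop below.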

// Algorithm must match CompressedOops::encode.
void MacroAssembler::encode_heap_oop(Register d, Register s) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
#endif
  verify_oop(s, "broken oop in encode_heap_oop");
  if (CompressedOops::base() == NULL) {
    if (CompressedOops::shift() != 0) {
      assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
      lsr(d, s, LogMinObjAlignmentInBytes);
    } else {
      mov(d, s);
    }
  } else {
    subs(d, s, rheapbase);
    csel(d, d, zr, Assembler::HS);
    lsr(d, d, LogMinObjAlignmentInBytes);

    /*  Old algorithm: is this any worse?
    Label nonnull;
    cbnz(r, nonnull);
    sub(r, r, rheapbase);
    bind(nonnull);
    lsr(r, r, LogMinObjAlignmentInBytes);
    */
  }
}

void MacroAssembler::encode_heap_oop_not_null(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    cbnz(r, ok);
    stop("null oop passed to encode_heap_oop_not_null");
    bind(ok);
  }
#endif
  verify_oop(r, "broken oop in encode_heap_oop_not_null");
  if (CompressedOops::base() != NULL) {
    sub(r, r, rheapbase);
  }
  if (CompressedOops::shift() != 0) {
    assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    lsr(r, r, LogMinObjAlignmentInBytes);
  }
}

void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    cbnz(src, ok);
    stop("null oop passed to encode_heap_oop_not_null2");
    bind(ok);
  }
#endif
  verify_oop(src, "broken oop in encode_heap_oop_not_null2");

  Register data = src;
  if (CompressedOops::base() != NULL) {
    sub(dst, src, rheapbase);
    data = dst;
  }
  if (CompressedOops::shift() != 0) {
    assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    lsr(dst, data, LogMinObjAlignmentInBytes);
    data = dst;
  }
  if (data == src)
    mov(dst, src);
}

void MacroAssembler::decode_heap_oop(Register d, Register s) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
#endif
  if (CompressedOops::base() == NULL) {
    if (CompressedOops::shift() != 0 || d != s) {
      lsl(d, s, CompressedOops::shift());
    }
  } else {
    Label done;
    if (d != s)
      mov(d, s);
    cbz(s, done);
    add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
    bind(done);
  }
  verify_oop(d, "broken oop in decode_heap_oop");
}

void MacroAssembler::decode_heap_oop_not_null(Register r) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (CompressedOops::shift() != 0) {
    assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    if (CompressedOops::base() != NULL) {
      add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
    } else {
      add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
    }
  } else {
    assert (CompressedOops::base() == NULL, "sanity");
  }
}

void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (CompressedOops::shift() != 0) {
    assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    if (CompressedOops::base() != NULL) {
      add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
    } else {
      add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
    }
  } else {
    assert (CompressedOops::base() == NULL, "sanity");
    if (dst != src) {
      mov(dst, src);
    }
  }
}

void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
  if (CompressedKlassPointers::base() == NULL) {
    if (CompressedKlassPointers::shift() != 0) {
      assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
      lsr(dst, src, LogKlassAlignmentInBytes);
    } else {
      if (dst != src) mov(dst, src);
    }
    return;
  }

  if (use_XOR_for_compressed_class_base) {
    if (CompressedKlassPointers::shift() != 0) {
      eor(dst, src, (uint64_t)CompressedKlassPointers::base());
      lsr(dst, dst, LogKlassAlignmentInBytes);
    } else {
      eor(dst, src, (uint64_t)CompressedKlassPointers::base());
    }
    return;
  }

  if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
      && CompressedKlassPointers::shift() == 0) {
    movw(dst, src);
    return;
  }

#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
#endif

  Register rbase = dst;
  if (dst == src) rbase = rheapbase;
  mov(rbase, (uint64_t)CompressedKlassPointers::base());
  sub(dst, src, rbase);
  if (CompressedKlassPointers::shift() != 0) {
    assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
    lsr(dst, dst, LogKlassAlignmentInBytes);
  }
  if (dst == src) reinit_heapbase();
}

void MacroAssembler::encode_klass_not_null(Register r) {
  encode_klass_not_null(r, r);
}
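
// Illustrative note (not part of the emitted code): the XOR path above works
// because when the (shifted) klass bits and the base bits occupy disjoint
// bit positions, ptr ^ base == ptr - base and narrow ^ base == narrow + base,
// so no scratch register is needed to materialize the 64-bit base. For
// example, assuming base = 0x8_0000_0000 with shift 0 and all Klass*
// within 4GB above that base:
//
//   uintptr_t base = 0x800000000ull;       // no bits below 2^35
//   uintptr_t k    = base | 0x12345678ull; // klass bits stay below 2^32
//   uint32_t  nk   = (uint32_t)(k ^ base); // == k - base == 0x12345678
//   uintptr_t back = nk ^ base;            // == base + nk == k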

void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
  Register rbase = dst;
  assert (UseCompressedClassPointers, "should only be used for compressed headers");

  if (CompressedKlassPointers::base() == NULL) {
    if (CompressedKlassPointers::shift() != 0) {
      assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
      lsl(dst, src, LogKlassAlignmentInBytes);
    } else {
      if (dst != src) mov(dst, src);
    }
    return;
  }

  if (use_XOR_for_compressed_class_base) {
    if (CompressedKlassPointers::shift() != 0) {
      lsl(dst, src, LogKlassAlignmentInBytes);
      eor(dst, dst, (uint64_t)CompressedKlassPointers::base());
    } else {
      eor(dst, src, (uint64_t)CompressedKlassPointers::base());
    }
    return;
  }

  if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
      && CompressedKlassPointers::shift() == 0) {
    if (dst != src)
      movw(dst, src);
    movk(dst, (uint64_t)CompressedKlassPointers::base() >> 32, 32);
    return;
  }

  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (dst == src) rbase = rheapbase;
  mov(rbase, (uint64_t)CompressedKlassPointers::base());
  if (CompressedKlassPointers::shift() != 0) {
    assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
    add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
  } else {
    add(dst, rbase, src);
  }
  if (dst == src) reinit_heapbase();
}

void MacroAssembler::decode_klass_not_null(Register r) {
  decode_klass_not_null(r, r);
}

void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
#ifdef ASSERT
  {
    ThreadInVMfromUnknown tiv;
    assert (UseCompressedOops, "should only be used for compressed oops");
    assert (Universe::heap() != NULL, "java heap should be initialized");
    assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
    assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
  }
#endif
  int oop_index = oop_recorder()->find_index(obj);
  InstructionMark im(this);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  code_section()->relocate(inst_mark(), rspec);
  movz(dst, 0xDEAD, 16);
  movk(dst, 0xBEEF);
}

void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int index = oop_recorder()->find_index(k);
  assert(! Universe::heap()->is_in(k), "should not be an oop");

  InstructionMark im(this);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  code_section()->relocate(inst_mark(), rspec);
  narrowKlass nk = CompressedKlassPointers::encode(k);
  movz(dst, (nk >> 16), 16);
  movk(dst, nk & 0xffff);
}

void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
                                    Register dst, Address src,
                                    Register tmp1, Register thread_tmp) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  decorators = AccessInternal::decorator_fixup(decorators);
  bool as_raw = (decorators & AS_RAW) != 0;
  if (as_raw) {
    bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  } else {
    bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  }
}

void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
                                     Address dst, Register src,
                                     Register tmp1, Register thread_tmp) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  decorators = AccessInternal::decorator_fixup(decorators);
  bool as_raw = (decorators & AS_RAW) != 0;
  if (as_raw) {
    bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  } else {
    bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  }
}
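
// Illustrative note (not part of the emitted code): set_narrow_oop and
// set_narrow_klass materialize a 32-bit constant with a fixed two-instruction
// movz/movk pair so that patching code always finds the same instruction
// shape (0xDEADBEEF is just a recognizable placeholder that relocation later
// overwrites). The pattern for an arbitrary 32-bit value v is:
//
//   movz dst, #(v >> 16), lsl #16   // set bits 31..16, clear everything else
//   movk dst, #(v & 0xffff)         // keep other bits, insert bits 15..0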

void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
  // Use stronger ACCESS_WRITE|ACCESS_READ by default.
  if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
    decorators |= ACCESS_READ | ACCESS_WRITE;
  }
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  return bs->resolve(this, decorators, obj);
}

void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
                                   Register thread_tmp, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
}

void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
                                            Register thread_tmp, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
}

void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
                                    Register thread_tmp, DecoratorSet decorators) {
  access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
}

// Used for storing NULLs.
void MacroAssembler::store_heap_oop_null(Address dst) {
  access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
}

Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->allocate_metadata_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return Address((address)obj, rspec);
}

// Move an oop into a register. immediate is true if we want
// immediate instructions, i.e. we are not going to patch this
// instruction while the code is being executed by another thread. In
// that case we can use move immediates rather than the constant pool.
void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
  int oop_index;
  if (obj == NULL) {
    oop_index = oop_recorder()->allocate_oop_index(obj);
  } else {
#ifdef ASSERT
    {
      ThreadInVMfromUnknown tiv;
      assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
    }
#endif
    oop_index = oop_recorder()->find_index(obj);
  }
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  if (! immediate) {
    address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
    ldr_constant(dst, Address(dummy, rspec));
  } else
    mov(dst, Address((address)obj, rspec));
}

// Move a metadata address into a register.
void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  int oop_index;
  if (obj == NULL) {
    oop_index = oop_recorder()->allocate_metadata_index(obj);
  } else {
    oop_index = oop_recorder()->find_index(obj);
  }
  RelocationHolder rspec = metadata_Relocation::spec(oop_index);
  mov(dst, Address((address)obj, rspec));
}

Address MacroAssembler::constant_oop_address(jobject obj) {
#ifdef ASSERT
  {
    ThreadInVMfromUnknown tiv;
    assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
    assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "not an oop");
  }
#endif
  int oop_index = oop_recorder()->find_index(obj);
  return Address((address)obj, oop_Relocation::spec(oop_index));
}

// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
void MacroAssembler::tlab_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Register t2,
                                   Label& slow_case) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
}

// Defines obj, preserves var_size_in_bytes
void MacroAssembler::eden_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Label& slow_case) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
}

// Zero words; len is in bytes
// Destroys all registers except addr
// len must be a nonzero multiple of wordSize
void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
  assert_different_registers(addr, len, t1, rscratch1, rscratch2);

#ifdef ASSERT
  { Label L;
    tst(len, BytesPerWord - 1);
    br(Assembler::EQ, L);
    stop("len is not a multiple of BytesPerWord");
    bind(L);
  }
#endif

#ifndef PRODUCT
  block_comment("zero memory");
#endif

  Label loop;
  Label entry;

  // Algorithm:
  //
  //  scratch1 = cnt & 7;
  //  cnt -= scratch1;
  //  p += scratch1;
  //  switch (scratch1) {
  //    do {
  //      cnt -= 8;
  //        p[-8] = 0;
  //      case 7:
  //        p[-7] = 0;
  //      case 6:
  //        p[-6] = 0;
  //        // ...
  //      case 1:
  //        p[-1] = 0;
  //      case 0:
  //        p += 8;
  //    } while (cnt);
  //  }

  const int unroll = 8; // Number of str(zr) instructions we'll unroll

  lsr(len, len, LogBytesPerWord);
  andr(rscratch1, len, unroll - 1);  // rscratch1 = cnt % unroll
  sub(len, len, rscratch1);          // cnt -= cnt % unroll
  // t1 always points to the end of the region we're about to zero
  add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
  br(rscratch2);
  bind(loop);
  sub(len, len, unroll);
  for (int i = -unroll; i < 0; i++)
    Assembler::str(zr, Address(t1, i * wordSize));
  bind(entry);
  add(t1, t1, unroll * wordSize);
  cbnz(len, loop);
}
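
// Illustrative note (not part of the emitted code): the adr/sub/br sequence
// in zero_memory is a computed branch into the unrolled loop, playing the
// role of the pseudocode's switch. Every AArch64 instruction is 4 bytes, so
// with cnt % 8 == r the branch target is
//
//   entry - 4 * r
//
// i.e. the first pass skips the first (8 - r) of the 8 unrolled str(zr)
// instructions and stores exactly r words; all later iterations fall through
// the full unrolled body.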

void MacroAssembler::verify_tlab() {
#ifdef ASSERT
  if (UseTLAB && VerifyOops) {
    Label next, ok;

    stp(rscratch2, rscratch1, Address(pre(sp, -16)));

    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
    cmp(rscratch2, rscratch1);
    br(Assembler::HS, next);
    STOP("assert(top >= start)");
    should_not_reach_here();

    bind(next);
    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
    cmp(rscratch2, rscratch1);
    br(Assembler::HS, ok);
    STOP("assert(top <= end)");
    should_not_reach_here();

    bind(ok);
    ldp(rscratch2, rscratch1, Address(post(sp, 16)));
  }
#endif
}

// Writes to stack successive pages until offset reached to check for
// stack overflow + shadow pages. This clobbers tmp.
void MacroAssembler::bang_stack_size(Register size, Register tmp) {
  assert_different_registers(tmp, size, rscratch1);
  mov(tmp, sp);
  // Bang stack for total size given plus shadow page size.
  // Bang one page at a time because large size can bang beyond yellow and
  // red zones.
  Label loop;
  mov(rscratch1, os::vm_page_size());
  bind(loop);
  lea(tmp, Address(tmp, -os::vm_page_size()));
  subsw(size, size, rscratch1);
  str(size, Address(tmp));
  br(Assembler::GT, loop);

  // Bang down shadow pages too.
  // At this point, (tmp-0) is the last address touched, so don't
  // touch it again. (It was touched as (tmp-pagesize) but then tmp
  // was post-decremented.) Skip this address by starting at i=1, and
  // touch a few more pages below. N.B. It is important to touch all
  // the way down to and including i=StackShadowPages.
  for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
    // this could be any sized move but this can be a debugging crumb
    // so the bigger the better.
    lea(tmp, Address(tmp, -os::vm_page_size()));
    str(size, Address(tmp));
  }
}


// Move the address of the polling page into dest.
void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ldr(dest, Address(rthread, Thread::polling_page_offset()));
  } else {
    unsigned long off;
    adrp(dest, Address(page, rtype), off);
    assert(off == 0, "polling page must be page aligned");
  }
}

// Move the address of the polling page into r, then read the polling
// page.
address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
  get_polling_page(r, page, rtype);
  return read_polling_page(r, rtype);
}

// Read the polling page. The address of the polling page must
// already be in r.
address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
  InstructionMark im(this);
  code_section()->relocate(inst_mark(), rtype);
  ldrw(zr, Address(r, 0));
  return inst_mark();
}

void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
  relocInfo::relocType rtype = dest.rspec().reloc()->type();
  unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
  unsigned long high_page = (unsigned long)(CodeCache::high_bound() - 1) >> 12;
  unsigned long dest_page = (unsigned long)dest.target() >> 12;
  long offset_low = dest_page - low_page;
  long offset_high = dest_page - high_page;

  assert(is_valid_AArch64_address(dest.target()), "bad address");
  assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");

  InstructionMark im(this);
  code_section()->relocate(inst_mark(), dest.rspec());
  // 8143067: Ensure that the adrp can reach the dest from anywhere within
  // the code cache so that if it is relocated we know it will still reach
  if (offset_high >= -(1 << 20) && offset_low < (1 << 20)) {
    _adrp(reg1, dest.target());
  } else {
    unsigned long target = (unsigned long)dest.target();
    unsigned long adrp_target
      = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);

    _adrp(reg1, (address)adrp_target);
    movk(reg1, target >> 32, 32);
  }
  byte_offset = (unsigned long)dest.target() & 0xfff;
}
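
// Illustrative note (not part of the emitted code): adrp materializes the
// 4K-page base of a target using a signed 21-bit page offset, so it reaches
// roughly +/-4GB from the current pc. The range check above is done in units
// of pages (1 << 20 pages == 4GB) against both ends of the code cache, so
// the instruction stays valid wherever the code is later relocated:
//
//   // what a single adrp computes
//   uintptr_t adrp_result(uintptr_t pc, intptr_t imm21) {
//     return (pc & ~0xfffull) + ((uintptr_t)imm21 << 12);
//   }
//
// When the target may be out of range, the fallback synthesizes a 48-bit
// address: the adrp supplies the low 32 bits (pc-relative, always reachable
// because adrp_target shares the pc's bits 47..32) and the movk installs
// bits 47..32 directly; the caller then adds byte_offset, the low 12 bits
// of the target.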

void MacroAssembler::load_byte_map_base(Register reg) {
  CardTable::CardValue* byte_map_base =
    ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();

  if (is_valid_AArch64_address((address)byte_map_base)) {
    // Strictly speaking the byte_map_base isn't an address at all,
    // and it might even be negative.
    unsigned long offset;
    adrp(reg, ExternalAddress((address)byte_map_base), offset);
    // We expect offset to be zero with most collectors.
    if (offset != 0) {
      add(reg, reg, offset);
    }
  } else {
    mov(reg, (uint64_t)byte_map_base);
  }
}

void MacroAssembler::build_frame(int framesize) {
  assert(framesize > 0, "framesize must be > 0");
  if (framesize < ((1 << 9) + 2 * wordSize)) {
    sub(sp, sp, framesize);
    stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
    if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
  } else {
    stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
    if (PreserveFramePointer) mov(rfp, sp);
    if (framesize < ((1 << 12) + 2 * wordSize))
      sub(sp, sp, framesize - 2 * wordSize);
    else {
      mov(rscratch1, framesize - 2 * wordSize);
      sub(sp, sp, rscratch1);
    }
  }
}

void MacroAssembler::remove_frame(int framesize) {
  assert(framesize > 0, "framesize must be > 0");
  if (framesize < ((1 << 9) + 2 * wordSize)) {
    ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
    add(sp, sp, framesize);
  } else {
    if (framesize < ((1 << 12) + 2 * wordSize))
      add(sp, sp, framesize - 2 * wordSize);
    else {
      mov(rscratch1, framesize - 2 * wordSize);
      add(sp, sp, rscratch1);
    }
    ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
  }
}
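
// Illustrative note (not part of the emitted code): the two size thresholds
// in build_frame/remove_frame come from AArch64 immediate encodings. An stp
// of two 64-bit registers uses a signed, 8-byte-scaled 7-bit offset, so the
// saved fp/lr pair can sit at most 504 bytes above sp; hence the 1 << 9
// regime, where one sub plus one stp suffice:
//
//   sub  sp, sp, #framesize
//   stp  rfp, lr, [sp, #framesize - 16]   // needs framesize - 16 <= 504
//
// A plain add/sub immediate carries 12 bits (up to 4095), giving the second
// regime; anything larger has to go through a scratch register.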

#ifdef COMPILER2
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

// Search for str1 in str2 and return index or -1
void MacroAssembler::string_indexof(Register str2, Register str1,
                                    Register cnt2, Register cnt1,
                                    Register tmp1, Register tmp2,
                                    Register tmp3, Register tmp4,
                                    Register tmp5, Register tmp6,
                                    int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For a larger pattern and source we use a simplified Boyer-Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1);          // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

  // The Boyer-Moore algorithm is based on the description here:-
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules: the 'Bad Character' rule
  // and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:-
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // This particular implementation has few java-specific optimizations.
  //
  // #define ASIZE 256
  //
  // int bm(unsigned char *x, int m, unsigned char *y, int n) {
  //   int i, j;
  //   unsigned c;
  //   unsigned char bc[ASIZE];
  //
  //   /* Preprocessing */
  //   for (i = 0; i < ASIZE; ++i)
  //     bc[i] = m;
  //   for (i = 0; i < m - 1; ) {
  //     c = x[i];
  //     ++i;
  //     // c < 256 for Latin1 string, so, no need for branch
  //     #ifdef PATTERN_STRING_IS_LATIN1
  //     bc[c] = m - i;
  //     #else
  //     if (c < ASIZE) bc[c] = m - i;
  //     #endif
  //   }
  //
  //   /* Searching */
  //   j = 0;
  //   while (j <= n - m) {
  //     c = y[i+j];
  //     if (x[m-1] == c)
  //       for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
  //     if (i < 0) return j;
  //     // c < 256 for Latin1 string, so, no need for branch
  //     #ifdef SOURCE_STRING_IS_LATIN1
  //     // LL case: (c< 256) always true. Remove branch
  //     j += bc[y[j+m-1]];
  //     #endif
  //     #ifndef PATTERN_STRING_IS_UTF
  //     // UU case: need if (c<ASIZE) check. Skip 1 character if not.
  //     if (c < ASIZE)
  //       j += bc[y[j+m-1]];
  //     else
  //       j += 1
  //     #endif
  //     #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
  //     // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
  //     if (c < ASIZE)
  //       j += bc[y[j+m-1]];
  //     else
  //       j += m
  //     #endif
  //   }
  // }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >=8, so, we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
    // UL case. We'll re-read last character in inner pre-loop code to have
    // single outer pre-loop load
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // amount of bytes stored per instruction
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
    stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
    subs(tmp5, tmp5, 1);
    br(GT, BM_INIT_LOOP);

    sub(cnt1tmp, cnt1, 1);
    mov(tmp5, str2);
    add(str2end, str2, result_tmp, LSL, str2_chr_shift);
    sub(ch2, cnt1, 1);
    mov(tmp3, str1);
    BIND(BCLOOP);
    (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
    if (!str1_isL) {
      subs(zr, ch1, ASIZE);
      br(HS, BCSKIP);
    }
    strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
    subs(ch2, ch2, 1);
    br(GT, BCLOOP);

    add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
    if (str1_isL == str2_isL) {
      // load last 8 bytes (8LL/4UU symbols)
      ldr(tmp6, Address(tmp6, -wordSize));
    } else {
      ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
      // convert Latin1 to UTF. We'll have to wait until load completed, but
      // it's still faster than per-character loads+checks
      lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
      ubfx(ch1, tmp6, 8, 8); // str1[N-2]
      ubfx(ch2, tmp6, 16, 8); // str1[N-3]
      andr(tmp6, tmp6, 0xFF); // str1[N-4]
      orr(ch2, ch1, ch2, LSL, 16);
      orr(tmp6, tmp6, tmp3, LSL, 48);
      orr(tmp6, tmp6, ch2, LSL, 16);
    }
    BIND(BMLOOPSTR2);
    (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
    if (str1_isL == str2_isL) {
      // re-init tmp3. It's for free because it's executed in parallel with
      // load above. Alternative is to initialize it before loop, but it'll
      // affect performance on in-order systems with 2 or more ld/st pipelines
      lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
    }
    if (!isL) { // UU/UL case
      lsl(ch2, cnt1tmp, 1); // offset in bytes
    }
    cmp(tmp3, skipch);
    br(NE, BMSKIP);
    ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
    mov(ch1, tmp6);
    if (isL) {
      b(BMLOOPSTR1_AFTER_LOAD);
    } else {
      sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
      b(BMLOOPSTR1_CMP);
    }
    BIND(BMLOOPSTR1);
    (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
    (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
    subs(cnt1tmp, cnt1tmp, 1);
    br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
    cmp(ch1, ch2);
    br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
    if (!isL) {
      // if we've met UTF symbol while searching Latin1 pattern, then we can
      // skip cnt1 symbols
      if (str1_isL != str2_isL) {
        mov(result_tmp, cnt1);
      } else {
        mov(result_tmp, 1);
      }
      subs(zr, skipch, ASIZE);
      br(HS, BMADV);
    }
    ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
    sub(cnt1tmp, cnt1, 1);
    add(str2, str2, result_tmp, LSL, str2_chr_shift);
    cmp(str2, str2end);
    br(LE, BMLOOPSTR2);
    add(sp, sp, ASIZE);
    b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
    cmp(ch1, ch2);
    br(NE, BMSKIP);
    BIND(BMMATCH);
    sub(result, str2, tmp5);
    if (!str2_isL) lsr(result, result, 1);
    add(sp, sp, ASIZE);
    b(DONE);

    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = NULL;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
    }
    trampoline_call(stub);
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
      Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

      cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
      br(LT, DOSHORT);
      BIND(LINEAR_MEDIUM);
      (this->*str1_load_1chr)(first, Address(str1));
      lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
      sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(FIRST_LOOP);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
      cmp(first, ch2);
      br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, FIRST_LOOP);
      b(NOMATCH);

      BIND(STR1_LOOP);
      adds(cnt1tmp, cnt1_neg, str1_chr_size);
      add(cnt2tmp, cnt2_neg, str2_chr_size);
      br(GE, MATCH);

      BIND(STR1_NEXT);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
      cmp(ch1, ch2);
      br(NE, STR2_NEXT);
      adds(cnt1tmp, cnt1tmp, str1_chr_size);
      add(cnt2tmp, cnt2tmp, str2_chr_size);
      br(LT, STR1_NEXT);
      b(MATCH);

      BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      Label CH1_LOOP;

      (this->*load_4chr)(ch1, str1);
      sub(result_tmp, cnt2, 4);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(CH1_LOOP);
      (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
      cmp(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, CH1_LOOP);
      b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      Label CH1_LOOP;

      BIND(DO2);
      (this->*load_2chr)(ch1, str1);
      if (icnt1 == 2) {
        sub(result_tmp, cnt2, 2);
      }
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(CH1_LOOP);
      (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
      cmp(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, CH1_LOOP);
      b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
      (this->*load_2chr)(first, str1);
      (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
      if (icnt1 == 3) {
        sub(result_tmp, cnt2, 3);
      }
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(FIRST_LOOP);
      (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
      cmpw(first, ch2);
      br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, FIRST_LOOP);
      b(NOMATCH);

      BIND(STR1_LOOP);
      add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
      cmp(ch1, ch2);
      br(NE, STR2_NEXT);
      b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
      (this->*str1_load_1chr)(ch1, str1);
      cmp(cnt2, (u1)8);
      br(LT, DO1_SHORT);

      sub(result_tmp, cnt2, 8/str2_chr_size);
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

      if (str2_isL) {
        orr(ch1, ch1, ch1, LSL, 8);
      }
      orr(ch1, ch1, ch1, LSL, 16);
      orr(ch1, ch1, ch1, LSL, 32);
      BIND(CH1_LOOP);
      ldr(ch2, Address(str2, cnt2_neg));
      eor(ch2, ch1, ch2);
      sub(tmp1, ch2, tmp3);
      orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
      bics(tmp1, tmp1, tmp2);
      br(NE, HAS_ZERO);
      adds(cnt2_neg, cnt2_neg, 8);
      br(LT, CH1_LOOP);

      cmp(cnt2_neg, (u1)8);
      mov(cnt2_neg, 0);
      br(LT, CH1_LOOP);
      b(NOMATCH);

      BIND(HAS_ZERO);
      rev(tmp1, tmp1);
      clz(tmp1, tmp1);
      add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
      b(MATCH);

      BIND(DO1_SHORT);
      mov(result_tmp, cnt2);
      lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
      cmpw(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
  mov(result, -1);
  b(DONE);
  BIND(MATCH);
  add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                         Register ch, Register result,
                                         Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
  ldr(ch1, Address(str1, cnt1_neg));
  eor(ch1, ch, ch1);
  sub(tmp1, ch1, tmp3);
  orr(tmp2, ch1, 0x7fff7fff7fff7fff);
  bics(tmp1, tmp1, tmp2);
  br(NE, HAS_ZERO);
  adds(cnt1_neg, cnt1_neg, 8);
  br(LT, CH1_LOOP);

  cmp(cnt1_neg, (u1)8);
  mov(cnt1_neg, 0);
  br(LT, CH1_LOOP);
  b(NOMATCH);

  BIND(HAS_ZERO);
  rev(tmp1, tmp1);
  clz(tmp1, tmp1);
  add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
  b(MATCH);

  BIND(DO1_SHORT);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
  ldrh(ch1, Address(str1, cnt1_neg));
  cmpw(ch, ch1);
  br(EQ, MATCH);
  adds(cnt1_neg, cnt1_neg, 2);
  br(LT, DO1_LOOP);
  BIND(NOMATCH);
  mov(result, -1);
  b(DONE);
  BIND(MATCH);
  add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}
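
// Illustrative sketch (not part of the emitted code): the eor/sub/orr/bics
// sequences above are the classic SWAR "find a zero lane" trick. After
// XOR-ing with the broadcast search character, a lane is zero exactly where
// the character matched, and for 16-bit lanes
//
//   (v - 0x0001000100010001) & ~v & 0x8000800080008000
//
// is nonzero iff some lane of v is zero. A C sketch of the 16-bit case:
//
//   #include <stdint.h>
//
//   static int has_zero_u16(uint64_t v) {
//     return ((v - 0x0001000100010001ull) & ~v & 0x8000800080008000ull) != 0;
//   }
//
// (bics computes tmp1 & ~tmp2 and sets flags; orr-ing v with 0x7fff... first
// is an equivalent way of masking everything but each lane's top bit.)
// rev + clz then locate the first match: after the byte reverse, the leading
// zero count divided by 8 is the byte index of the first flagged lane.
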
// Compare strings.
void MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  const u1 STUB_THRESHOLD = 64 + 8;
  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, STUB_THRESHOLD);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, STUB_THRESHOLD);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldrs(vtmp, Address(str2));
      cmp(cnt2, STUB_THRESHOLD);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFFERENCE);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFFERENCE);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFFERENCE);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFFERENCE);
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
    RuntimeAddress stub = NULL;
    switch(ae) {
      case StrIntrinsicNode::LL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
        break;
      case StrIntrinsicNode::UU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
        break;
      case StrIntrinsicNode::LU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
        break;
      case StrIntrinsicNode::UL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
        break;
      default:
        ShouldNotReachHere();
    }
    assert(stub.target() != NULL, "compare_long_string stub has not been generated");
    trampoline_call(stub);
    b(DONE);

  bind(SHORT_STRING);
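  // In the pipelined loop below, one character of str1 is always held
  // in tmp1 or tmp2 and one of str2 in cnt1 or rscratch1, loaded one
  // iteration ahead of the compare that consumes it; cnt1 can be
  // reused as a data register because the remaining count lives in
  // cnt2.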
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // arrange code to do most branches while loading and loading next characters
  // while comparing previous
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}
#endif // COMPILER2

// This method checks whether the provided byte array contains a byte
// with its highest bit set.
void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
  // The simple and most common case, a small aligned array that is not
  // at the end of a memory page, is handled here; all other cases are
  // in the stub.
  Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
  const uint64_t UPPER_BIT_MASK=0x8080808080808080;
  assert_different_registers(ary1, len, result);

  cmpw(len, 0);
  br(LE, SET_RESULT);
  cmpw(len, 4 * wordSize);
  br(GE, STUB_LONG); // if size > 32, go to the stub

  int shift = 64 - exact_log2(os::vm_page_size());
  lsl(rscratch1, ary1, shift);
  mov(rscratch2, (size_t)(4 * wordSize) << shift);
  adds(rscratch2, rscratch1, rscratch2);  // At end of page?
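  // The lsl above moved the in-page offset of ary1 into the top bits
  // of rscratch1, so the adds() sets C exactly when offset + 32 would
  // wrap past 2^64, i.e. past the page.  Worked example, assuming a 4K
  // page (shift == 52): an in-page offset of 0xfe0 gives
  // (0xfe0 + 0x20) << 52 == 2^64, so C is set and the exact-fit case
  // is conservatively sent to the stub; an offset of 0xfc0 stays below
  // and falls through to the inline loop.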
  br(CS, STUB); // if at the end of the page, go to the stub
  subs(len, len, wordSize);
  br(LT, END);

  BIND(LOOP);
    ldr(rscratch1, Address(post(ary1, wordSize)));
    tst(rscratch1, UPPER_BIT_MASK);
    br(NE, SET_RESULT);
    subs(len, len, wordSize);
    br(GE, LOOP);
    cmpw(len, -wordSize);
    br(EQ, SET_RESULT);

  BIND(END);
    ldr(result, Address(ary1));
    sub(len, zr, len, LSL, 3); // LSL by 3 converts bytes to bits
    lslv(result, result, len);
    tst(result, UPPER_BIT_MASK);
    b(SET_RESULT);

  BIND(STUB);
    RuntimeAddress has_neg = RuntimeAddress(StubRoutines::aarch64::has_negatives());
    assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
    trampoline_call(has_neg);
    b(DONE);

  BIND(STUB_LONG);
    RuntimeAddress has_neg_long = RuntimeAddress(
        StubRoutines::aarch64::has_negatives_long());
    assert(has_neg_long.target() != NULL, "has_negatives_long stub has not been generated");
    trampoline_call(has_neg_long);
    b(DONE);

  BIND(SET_RESULT);
    cset(result, NE); // set true or false

  BIND(DONE);
}

void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
                                   Register tmp4, Register tmp5, Register result,
                                   Register cnt1, int elem_size) {
  Label DONE, SAME;
  Register tmp1 = rscratch1;
  Register tmp2 = rscratch2;
  Register cnt2 = tmp2;  // cnt2 only used in array length compare
  int elem_per_word = wordSize/elem_size;
  int log_elem_size = exact_log2(elem_size);
  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset
    = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
  int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);

  assert(elem_size == 1 || elem_size == 2, "must be char or byte");
  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);

#ifndef PRODUCT
  {
    const char kind = (elem_size == 2) ? 'U' : 'L';
    char comment[64];
    snprintf(comment, sizeof comment, "array_equals%c{", kind);
    BLOCK_COMMENT(comment);
  }
#endif

  // if (a1 == a2)
  //     return true;
  cmpoop(a1, a2); // May have read barriers for a1 and a2.
  br(EQ, SAME);

  if (UseSimpleArrayEquals) {
    Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
    // if (a1 == null || a2 == null)
    //     return false;
    // a1 & a2 == 0 means either one of the pointers is null or the
    // two pointer values share no set bits (very rare, probably
    // impossible in practice), so we can save one branch in most cases.
    tst(a1, a2);
    mov(result, false);
    br(EQ, A_MIGHT_BE_NULL);
    // if (a1.length != a2.length)
    //     return false;
    bind(A_IS_NOT_NULL);
    ldrw(cnt1, Address(a1, length_offset));
    ldrw(cnt2, Address(a2, length_offset));
    eorw(tmp5, cnt1, cnt2);
    cbnzw(tmp5, DONE);
    lea(a1, Address(a1, base_offset));
    lea(a2, Address(a2, base_offset));
    // Check for short strings, i.e. smaller than wordSize.
    subs(cnt1, cnt1, elem_per_word);
    br(Assembler::LT, SHORT);
    // Main 8 byte comparison loop.
    bind(NEXT_WORD); {
      ldr(tmp1, Address(post(a1, wordSize)));
      ldr(tmp2, Address(post(a2, wordSize)));
      subs(cnt1, cnt1, elem_per_word);
      eor(tmp5, tmp1, tmp2);
      cbnz(tmp5, DONE);
    } br(GT, NEXT_WORD);
    // Last longword.
    // In the case where length == 4 we compare the same longword
    // twice, but that's still faster than another conditional branch.
    // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
    // length == 4.
    if (log_elem_size > 0)
      lsl(cnt1, cnt1, log_elem_size);
    ldr(tmp3, Address(a1, cnt1));
    ldr(tmp4, Address(a2, cnt1));
    eor(tmp5, tmp3, tmp4);
    cbnz(tmp5, DONE);
    b(SAME);
    bind(A_MIGHT_BE_NULL);
    // If both a1 and a2 are non-null, go back and load the lengths.
    cbz(a1, DONE);
    cbz(a2, DONE);
    b(A_IS_NOT_NULL);
    bind(SHORT);

    tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
    {
      ldrw(tmp1, Address(post(a1, 4)));
      ldrw(tmp2, Address(post(a2, 4)));
      eorw(tmp5, tmp1, tmp2);
      cbnzw(tmp5, DONE);
    }
    bind(TAIL03);
    tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
    {
      ldrh(tmp3, Address(post(a1, 2)));
      ldrh(tmp4, Address(post(a2, 2)));
      eorw(tmp5, tmp3, tmp4);
      cbnzw(tmp5, DONE);
    }
    bind(TAIL01);
    if (elem_size == 1) { // Only needed when comparing byte arrays.
      tbz(cnt1, 0, SAME); // 0-1 bytes left.
      {
        ldrb(tmp1, a1);
        ldrb(tmp2, a2);
        eorw(tmp5, tmp1, tmp2);
        cbnzw(tmp5, DONE);
      }
    }
  } else {
    Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
        CSET_EQ, LAST_CHECK;
    mov(result, false);
    cbz(a1, DONE);
    ldrw(cnt1, Address(a1, length_offset));
    cbz(a2, DONE);
    ldrw(cnt2, Address(a2, length_offset));
    // On most CPUs a2 is still "locked" (surprisingly) by the ldrw,
    // and it's faster to perform another branch before comparing
    // a1 and a2.
    cmp(cnt1, (u1)elem_per_word);
    br(LE, SHORT); // short or same
    ldr(tmp3, Address(pre(a1, base_offset)));
    subs(zr, cnt1, stubBytesThreshold);
    br(GE, STUB);
    ldr(tmp4, Address(pre(a2, base_offset)));
    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
    cmp(cnt2, cnt1);
    br(NE, DONE);

    // Main 16 byte comparison loop with 2 exits
    bind(NEXT_DWORD); {
      ldr(tmp1, Address(pre(a1, wordSize)));
      ldr(tmp2, Address(pre(a2, wordSize)));
      subs(cnt1, cnt1, 2 * elem_per_word);
      br(LE, TAIL);
      eor(tmp4, tmp3, tmp4);
      cbnz(tmp4, DONE);
      ldr(tmp3, Address(pre(a1, wordSize)));
      ldr(tmp4, Address(pre(a2, wordSize)));
      cmp(cnt1, (u1)elem_per_word);
      br(LE, TAIL2);
      cmp(tmp1, tmp2);
    } br(EQ, NEXT_DWORD);
    b(DONE);

    bind(TAIL);
    eor(tmp4, tmp3, tmp4);
    eor(tmp2, tmp1, tmp2);
    lslv(tmp2, tmp2, tmp5);
    orr(tmp5, tmp4, tmp2);
    cmp(tmp5, zr);
    b(CSET_EQ);

    bind(TAIL2);
    eor(tmp2, tmp1, tmp2);
    cbnz(tmp2, DONE);
    b(LAST_CHECK);

    bind(STUB);
    ldr(tmp4, Address(pre(a2, base_offset)));
    cmp(cnt2, cnt1);
    br(NE, DONE);
    if (elem_size == 2) { // convert to byte counter
      lsl(cnt1, cnt1, 1);
    }
    eor(tmp5, tmp3, tmp4);
    cbnz(tmp5, DONE);
    RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
    assert(stub.target() != NULL, "array_equals_long stub has not been generated");
    trampoline_call(stub);
    b(DONE);

    bind(EARLY_OUT);
    // We get here when (a1 != null && a2 == null) ||
    // (a1 != null && a2 != null && a1 == a2), so if a2 == null we must
    // return false (0) and otherwise true; returning a2 itself covers
    // both cases.
    mov(result, a2);
    b(DONE);
    bind(SHORT);
    cmp(cnt2, cnt1);
    br(NE, DONE);
    cbz(cnt1, SAME);
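    // tmp5 below is the negated bit length of the arrays,
    // -(cnt1 << (3 + log_elem_size)).  lslv uses only the low six bits
    // of its shift count, so this is 64 - cnt1 * bits_per_element
    // (mod 64), and shifting the XOR of the two loaded words left by
    // it discards the garbage bytes that lie past the end of the
    // arrays before the final equality test.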
    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
    ldr(tmp3, Address(a1, base_offset));
    ldr(tmp4, Address(a2, base_offset));
    bind(LAST_CHECK);
    eor(tmp4, tmp3, tmp4);
    lslv(tmp5, tmp4, tmp5);
    cmp(tmp5, zr);
    bind(CSET_EQ);
    cset(result, EQ);
    b(DONE);
  }

  bind(SAME);
  mov(result, true);
  // That's it.
  bind(DONE);

  BLOCK_COMMENT("} array_equals");
}

// Compare Strings

// For Strings we're passed the address of the first characters in a1
// and a2 and the length in cnt1.
// elem_size is the element size in bytes: either 1 or 2.
// There are two implementations.  For arrays >= 8 bytes, all
// comparisons (including the final one, which may overlap) are
// performed 8 bytes at a time.  For strings < 8 bytes, we compare a
// halfword, then a short, and then a byte.

void MacroAssembler::string_equals(Register a1, Register a2,
                                   Register result, Register cnt1, int elem_size)
{
  Label SAME, DONE, SHORT, NEXT_WORD;
  Register tmp1 = rscratch1;
  Register tmp2 = rscratch2;
  Register cnt2 = tmp2;  // cnt2 only used in array length compare

  assert(elem_size == 1 || elem_size == 2, "must be 1 or 2 bytes");
  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);

#ifndef PRODUCT
  {
    const char kind = (elem_size == 2) ? 'U' : 'L';
    char comment[64];
    snprintf(comment, sizeof comment, "{string_equals%c", kind);
    BLOCK_COMMENT(comment);
  }
#endif

  mov(result, false);

  // Check for short strings, i.e. smaller than wordSize.
  subs(cnt1, cnt1, wordSize);
  br(Assembler::LT, SHORT);
  // Main 8 byte comparison loop.
  bind(NEXT_WORD); {
    ldr(tmp1, Address(post(a1, wordSize)));
    ldr(tmp2, Address(post(a2, wordSize)));
    subs(cnt1, cnt1, wordSize);
    eor(tmp1, tmp1, tmp2);
    cbnz(tmp1, DONE);
  } br(GT, NEXT_WORD);
  // Last longword.  In the case where length == 4 we compare the
  // same longword twice, but that's still faster than another
  // conditional branch.
  // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
  // length == 4.
  ldr(tmp1, Address(a1, cnt1));
  ldr(tmp2, Address(a2, cnt1));
  eor(tmp2, tmp1, tmp2);
  cbnz(tmp2, DONE);
  b(SAME);

  bind(SHORT);
  Label TAIL03, TAIL01;

  tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
  {
    ldrw(tmp1, Address(post(a1, 4)));
    ldrw(tmp2, Address(post(a2, 4)));
    eorw(tmp1, tmp1, tmp2);
    cbnzw(tmp1, DONE);
  }
  bind(TAIL03);
  tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
  {
    ldrh(tmp1, Address(post(a1, 2)));
    ldrh(tmp2, Address(post(a2, 2)));
    eorw(tmp1, tmp1, tmp2);
    cbnzw(tmp1, DONE);
  }
  bind(TAIL01);
  if (elem_size == 1) { // Only needed when comparing 1-byte elements
    tbz(cnt1, 0, SAME); // 0-1 bytes left.
    {
      ldrb(tmp1, a1);
      ldrb(tmp2, a2);
      eorw(tmp1, tmp1, tmp2);
      cbnzw(tmp1, DONE);
    }
  }
  // Arrays are equal.
  bind(SAME);
  mov(result, true);

  // That's it.
  bind(DONE);
  BLOCK_COMMENT("} string_equals");
}


// The size of the blocks erased by the zero_blocks stub.  We must
// handle anything smaller than this ourselves in zero_words().
const int MacroAssembler::zero_words_block_size = 8;
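// Anything below zero_words_block_size is cleared inline by the bit
// decomposition at the tail of zero_words(): one tbz-guarded group of
// stores per set bit of cnt.  Illustrative count, assuming cnt == 7:
// two stp pairs for bit 2, one stp for bit 1, and one str for bit 0,
// clearing 4 + 2 + 1 words.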
// zero_words() is used by C2 ClearArray patterns.  It is as small as
// possible, handling small word counts locally and delegating
// anything larger to the zero_blocks stub.  It is expanded many times
// in compiled code, so it is important to keep it short.

// ptr:   Address of a buffer to be zeroed.
// cnt:   Count in HeapWords.
//
// ptr, cnt, rscratch1, and rscratch2 are clobbered.
void MacroAssembler::zero_words(Register ptr, Register cnt)
{
  assert(is_power_of_2(zero_words_block_size), "adjust this");
  assert(ptr == r10 && cnt == r11, "mismatch in register usage");

  BLOCK_COMMENT("zero_words {");
  cmp(cnt, (u1)zero_words_block_size);
  Label around;
  br(LO, around);
  {
    RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks());
    assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
    if (StubRoutines::aarch64::complete()) {
      trampoline_call(zero_blocks);
    } else {
      bl(zero_blocks);
    }
  }
  bind(around);
  for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
    Label l;
    tbz(cnt, exact_log2(i), l);
    for (int j = 0; j < i; j += 2) {
      stp(zr, zr, post(ptr, 16));
    }
    bind(l);
  }
  {
    Label l;
    tbz(cnt, 0, l);
    str(zr, Address(ptr));
    bind(l);
  }
  BLOCK_COMMENT("} zero_words");
}

// base:  Address of a buffer to be zeroed, 8 bytes aligned.
// cnt:   Immediate count in HeapWords.
#define SmallArraySize (18 * BytesPerLong)
void MacroAssembler::zero_words(Register base, u_int64_t cnt)
{
  BLOCK_COMMENT("zero_words {");
  int i = cnt & 1;  // store any odd word to start
  if (i) str(zr, Address(base));

  if (cnt <= SmallArraySize / BytesPerLong) {
    for (; i < (int)cnt; i += 2)
      stp(zr, zr, Address(base, i * wordSize));
  } else {
    const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
    int remainder = cnt % (2 * unroll);
    for (; i < remainder; i += 2)
      stp(zr, zr, Address(base, i * wordSize));

    Label loop;
    Register cnt_reg = rscratch1;
    Register loop_base = rscratch2;
    cnt = cnt - remainder;
    mov(cnt_reg, cnt);
    // adjust base and prebias by -2 * wordSize so we can pre-increment
    add(loop_base, base, (remainder - 2) * wordSize);
    bind(loop);
    sub(cnt_reg, cnt_reg, 2 * unroll);
    for (i = 1; i < unroll; i++)
      stp(zr, zr, Address(loop_base, 2 * i * wordSize));
    stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
    cbnz(cnt_reg, loop);
  }
  BLOCK_COMMENT("} zero_words");
}

// Zero blocks of memory by using DC ZVA.
//
// Aligns the base address first sufficiently for DC ZVA, then uses
// DC ZVA repeatedly for every full block.  cnt is the size to be
// zeroed in HeapWords.  Returns the count of words left to be zeroed
// in cnt.
//
// NOTE: This is intended to be used in the zero_blocks() stub.  If
// you want to use it elsewhere, note that cnt must be >= 2*zva_length.
void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
  Register tmp = rscratch1;
  Register tmp2 = rscratch2;
  int zva_length = VM_Version::zva_length();
  Label initial_table_end, loop_zva;
  Label fini;

  // Base must be 16 byte aligned.  If not, just return and let the
  // caller handle it.
  tst(base, 0x0f);
  br(Assembler::NE, fini);
  // Align base with ZVA length.
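  // (-base) & (zva_length - 1) is the distance from base up to the
  // next zva_length boundary.  For example, assuming zva_length == 64
  // and a base ending in 0x30, tmp becomes 0x10 and base + tmp is the
  // next 64-byte-aligned address.  zva_length is a power of two, so
  // the mask is well defined.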
  neg(tmp, base);
  andr(tmp, tmp, zva_length - 1);

  // tmp: the number of bytes to be filled to align the base with ZVA length.
  add(base, base, tmp);
  sub(cnt, cnt, tmp, Assembler::ASR, 3);
  adr(tmp2, initial_table_end);
  sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
  br(tmp2);

  for (int i = -zva_length + 16; i < 0; i += 16)
    stp(zr, zr, Address(base, i));
  bind(initial_table_end);

  sub(cnt, cnt, zva_length >> 3);
  bind(loop_zva);
  dc(Assembler::ZVA, base);
  subs(cnt, cnt, zva_length >> 3);
  add(base, base, zva_length);
  br(Assembler::GE, loop_zva);
  add(cnt, cnt, zva_length >> 3); // count of words not zeroed by DC ZVA
  bind(fini);
}

// base:   Address of a buffer to be filled, 8 bytes aligned.
// cnt:    Count in 8-byte units.
// value:  Value to be filled with.
// base will point to the end of the buffer after filling.
void MacroAssembler::fill_words(Register base, Register cnt, Register value)
{
//  Algorithm:
//
//    scratch1 = cnt & 7;
//    cnt -= scratch1;
//    p += scratch1;
//    switch (scratch1) {
//      do {
//        cnt -= 8;
//          p[-8] = v;
//        case 7:
//          p[-7] = v;
//        case 6:
//          p[-6] = v;
//          // ...
//        case 1:
//          p[-1] = v;
//        case 0:
//          p += 8;
//      } while (cnt);
//    }

  assert_different_registers(base, cnt, value, rscratch1, rscratch2);

  Label fini, skip, entry, loop;
  const int unroll = 8; // Number of stp instructions we'll unroll

  cbz(cnt, fini);
  tbz(base, 3, skip);
  str(value, Address(post(base, 8)));
  sub(cnt, cnt, 1);
  bind(skip);

  andr(rscratch1, cnt, (unroll-1) * 2);
  sub(cnt, cnt, rscratch1);
  add(base, base, rscratch1, Assembler::LSL, 3);
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
  br(rscratch2);

  bind(loop);
  add(base, base, unroll * 16);
  for (int i = -unroll; i < 0; i++)
    stp(value, value, Address(base, i * 16));
  bind(entry);
  subs(cnt, cnt, unroll * 2);
  br(Assembler::GE, loop);

  tbz(cnt, 0, fini);
  str(value, Address(post(base, 8)));
  bind(fini);
}

// Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
// java/lang/StringUTF16.compress.
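//
// The generated code is intended to behave like this reference loop
// (an illustrative sketch only; result receives the number of chars
// actually encoded, so result == len means every char fit in one
// byte):
//
//   int encode_iso_array(const jchar *src, jbyte *dst, int len) {
//     int i = 0;
//     for (; i < len && src[i] <= 0xff; i++) {
//       dst[i] = (jbyte)src[i];
//     }
//     return i; // index where we stopped
//   }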
void MacroAssembler::encode_iso_array(Register src, Register dst,
                                      Register len, Register result,
                                      FloatRegister Vtmp1, FloatRegister Vtmp2,
                                      FloatRegister Vtmp3, FloatRegister Vtmp4)
{
  Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
      NEXT_32_START, NEXT_32_PRFM_START;
  Register tmp1 = rscratch1, tmp2 = rscratch2;

  mov(result, len); // Save initial len

  cmp(len, (u1)8); // handle shortest strings first
  br(LT, LOOP_1);
  cmp(len, (u1)32);
  br(LT, NEXT_8);
  // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
  // to convert chars to bytes
  if (SoftwarePrefetchHintDistance >= 0) {
    ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
    subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
    br(LE, NEXT_32_START);
    b(NEXT_32_PRFM_START);
    BIND(NEXT_32_PRFM);
      ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
    BIND(NEXT_32_PRFM_START);
      prfm(Address(src, SoftwarePrefetchHintDistance));
      orr(v4, T16B, Vtmp1, Vtmp2);
      orr(v5, T16B, Vtmp3, Vtmp4);
      uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
      uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
      uzp2(v5, T16B, v4, v5); // high bytes
      umov(tmp2, v5, D, 1);
      fmovd(tmp1, v5);
      orr(tmp1, tmp1, tmp2);
      cbnz(tmp1, LOOP_8);
      stpq(Vtmp1, Vtmp3, dst);
      sub(len, len, 32);
      add(dst, dst, 32);
      add(src, src, 64);
      subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
      br(GE, NEXT_32_PRFM);
      cmp(len, (u1)32);
      br(LT, LOOP_8);
    BIND(NEXT_32);
      ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
    BIND(NEXT_32_START);
  } else {
    BIND(NEXT_32);
      ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
  }
  prfm(Address(src, SoftwarePrefetchHintDistance));
  uzp1(v4, T16B, Vtmp1, Vtmp2);
  uzp1(v5, T16B, Vtmp3, Vtmp4);
  orr(Vtmp1, T16B, Vtmp1, Vtmp2);
  orr(Vtmp3, T16B, Vtmp3, Vtmp4);
  uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
  umov(tmp2, Vtmp1, D, 1);
  fmovd(tmp1, Vtmp1);
  orr(tmp1, tmp1, tmp2);
  cbnz(tmp1, LOOP_8);
  stpq(v4, v5, dst);
  sub(len, len, 32);
  add(dst, dst, 32);
  add(src, src, 64);
  cmp(len, (u1)32);
  br(GE, NEXT_32);
  cbz(len, DONE);

  BIND(LOOP_8);
    cmp(len, (u1)8);
    br(LT, LOOP_1);
  BIND(NEXT_8);
    ld1(Vtmp1, T8H, src);
    uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
    uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
    fmovd(tmp1, Vtmp3);
    cbnz(tmp1, NEXT_1);
    strd(Vtmp2, dst);

    sub(len, len, 8);
    add(dst, dst, 8);
    add(src, src, 16);
    cmp(len, (u1)8);
    br(GE, NEXT_8);

  BIND(LOOP_1);

    cbz(len, DONE);
  BIND(NEXT_1);
    ldrh(tmp1, Address(post(src, 2)));
    tst(tmp1, 0xff00);
    br(NE, SET_RESULT);
    strb(tmp1, Address(post(dst, 1)));
    subs(len, len, 1);
    br(GT, NEXT_1);

  BIND(SET_RESULT);
    sub(result, result, len); // Return the index where we stopped,
                              // which equals the original len if we
                              // processed all characters.
  BIND(DONE);
}


// Inflate byte[] array to char[].
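//
// Roughly the inverse of encode_iso_array (illustrative sketch only):
//
//   void inflate(const jbyte *src, jchar *dst, int len) {
//     for (int i = 0; i < len; i++) {
//       dst[i] = (jchar)(src[i] & 0xff);
//     }
//   }
//
// The SIMD paths implement the zero-extension with zip1 against a
// zeroed vector (vtmp1), which interleaves each source byte with a
// zero byte to form a little-endian 16-bit char.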
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
                                        FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                        Register tmp4) {
  Label big, done, after_init, to_stub;

  assert_different_registers(src, dst, len, tmp4, rscratch1);

  fmovd(vtmp1, zr);
  lsrw(tmp4, len, 3);
  bind(after_init);
  cbnzw(tmp4, big);
  // Short string: less than 8 bytes.
  {
    Label loop, tiny;

    cmpw(len, 4);
    br(LT, tiny);
    // Use SIMD to do 4 bytes.
    ldrs(vtmp2, post(src, 4));
    zip1(vtmp3, T8B, vtmp2, vtmp1);
    subw(len, len, 4);
    strd(vtmp3, post(dst, 8));

    cbzw(len, done);

    // Do the remaining bytes by steam.
    bind(loop);
    ldrb(tmp4, post(src, 1));
    strh(tmp4, post(dst, 2));
    subw(len, len, 1);

    bind(tiny);
    cbnz(len, loop);

    b(done);
  }

  if (SoftwarePrefetchHintDistance >= 0) {
    bind(to_stub);
      RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
      assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
      trampoline_call(stub);
      b(after_init);
  }

  // Unpack the bytes 8 at a time.
  bind(big);
  {
    Label loop, around, loop_last, loop_start;

    if (SoftwarePrefetchHintDistance >= 0) {
      const int large_loop_threshold = (64 + 16)/8;
      ldrd(vtmp2, post(src, 8));
      andw(len, len, 7);
      cmp(tmp4, (u1)large_loop_threshold);
      br(GE, to_stub);
      b(loop_start);

      bind(loop);
      ldrd(vtmp2, post(src, 8));
      bind(loop_start);
      subs(tmp4, tmp4, 1);
      br(EQ, loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      ldrd(vtmp3, post(src, 8));
      st1(vtmp2, T8H, post(dst, 16));
      subs(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp3, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      br(NE, loop);
      b(around);
      bind(loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      st1(vtmp2, T8H, post(dst, 16));
      bind(around);
      cbz(len, done);
    } else {
      andw(len, len, 7);
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      sub(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp2, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      cbnz(tmp4, loop);
    }
  }

  // Do the tail of up to 8 bytes.
  add(src, src, len);
  ldrd(vtmp3, Address(src, -8));
  add(dst, dst, len, ext::uxtw, 1);
  zip1(vtmp3, T16B, vtmp3, vtmp1);
  strq(vtmp3, Address(dst, -16));

  bind(done);
}

// Compress char[] array to byte[].
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
                                         FloatRegister tmp1Reg, FloatRegister tmp2Reg,
                                         FloatRegister tmp3Reg, FloatRegister tmp4Reg,
                                         Register result) {
  encode_iso_array(src, dst, len, result,
                   tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
  cmp(len, zr);
  csel(result, result, zr, EQ);
}

// get_thread() can be called anywhere inside generated code so we
// need to save whatever non-callee save context might get clobbered
// by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
// the call setup code.
//
// aarch64_get_thread_helper() clobbers only r0, r1, and flags.
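//
// dst is deliberately excluded from saved_regs below: if it were
// pushed and popped it would be restored to its old value, destroying
// the result we are about to move into it.  r0 and r1 cover the
// helper's clobbers, and lr is saved because it holds both the
// helper's address and, after blr, the return address.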
//
void MacroAssembler::get_thread(Register dst) {
  RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
  push(saved_regs, sp);

  mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
  blr(lr);
  if (dst != c_rarg0) {
    mov(dst, c_rarg0);
  }

  pop(saved_regs, sp);
}

void MacroAssembler::cache_wb(Address line) {
  assert(line.getMode() == Address::base_plus_offset, "mode should be base_plus_offset");
  assert(line.index() == noreg, "index should be noreg");
  assert(line.offset() == 0, "offset should be 0");
  // would like to assert this
  // assert(line._ext.shift == 0, "shift should be zero");
  if (VM_Version::supports_dcpop()) {
    // writeback using clear virtual address to point of persistence
    dc(Assembler::CVAP, line.base());
  } else {
    // no need to generate anything as Unsafe.writebackMemory should
    // never invoke this stub
  }
}

void MacroAssembler::cache_wbsync(bool is_pre) {
  // we only need a barrier post sync
  if (!is_pre) {
    membar(Assembler::AnyAny);
  }
}