/*
 * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "jvm.h"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/cardTableBarrierSet.hpp"
#include "interpreter/interpreter.hpp"
#include "compiler/disassembler.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/accessDecorators.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/jniHandles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/thread.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER1
#include "c1/c1_LIRAssembler.hpp"
#endif
#ifdef COMPILER2
#include "oops/oop.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/node.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
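// A worked example (addresses purely illustrative): patching an
// unconditional branch at 0x10000 to reach 0x20000 stores the word-scaled
// delta (0x20000 - 0x10000) >> 2 == 0x4000 in the branch's signed 26-bit
// immediate field (bits 25..0), since AArch64 branch offsets count 4-byte
// instructions rather than bytes.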
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target-branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      u_int64_t dest = (u_int64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
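      // Worked example (hypothetical addresses): for an adrp at 0x1010
      // addressing a target at 0x3020, pc_page == 1 and adr_page == 3, so
      // the adrp immediate becomes 2 (pages) while offset_lo == 0x20 is
      // folded into the following ldr/str or add.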
      //
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
          Instruction_aarch64::extract(insn, 4, 0) ==
          Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
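  // (So the patched sequence is two instructions carrying the high and low
  // 16-bit halves of the 32-bit narrow OOP, or three instructions carrying
  // a 48-bit address 16 bits at a time, matching the movz/movk sequences
  // emitted by movptr() below.)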
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = CompressedOops::encode((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
          Instruction_aarch64::extract(insn, 4, 0) ==
          Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
            Instruction_aarch64::extract(insn, 4, 0) ==
            Instruction_aarch64::extract(insn2, 4, 0)) {
          target_page = (target_page & 0xffffffff) |
                        ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    u_int32_t *insns = (u_int32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    return 0;
  } else {
    ShouldNotReachHere();
  }
  return address(((uint64_t)insn_addr + (offset << 2)));
}

void MacroAssembler::safepoint_poll(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    unsigned long offset;
    adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
    ldrw(rscratch1, Address(rscratch1, offset));
    assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
    cbnz(rscratch1, slow_path);
  }
}

// Just like safepoint_poll, but use an acquiring load for thread-
// local polling.
//
// We need an acquire here to ensure that any subsequent load of the
// global SafepointSynchronize::_state flag is ordered after this load
// of the local Thread::_polling_page.  We don't want this poll to
// return false (i.e. not safepointing) and a later poll of the global
// SafepointSynchronize::_state spuriously to return true.
//
// This is to avoid a race when we're in a native->Java transition
// racing the code which wakes up from a safepoint.
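// Abstractly, the required ordering is (illustrative pseudocode only):
//
//   poll  = load_acquire(thread->_polling_page);
//   state = load(SafepointSynchronize::_state);  // must not float above the poll
//
// The ldar below supplies that acquire.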
//
void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
    ldar(rscratch1, rscratch1);
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    safepoint_poll(slow_path);
  }
}

void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp & sp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
    str(last_java_pc, Address(rthread,
                              JavaThread::frame_anchor_offset()
                              + JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  assert(last_java_pc != NULL, "must provide a valid PC");

  adr(scratch, last_java_pc);
  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, scratch);
  }
}

void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb.
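    // (adrp reaches any 4KB page within +-4GB of the PC and the add
    // supplies the low 12 bits, whereas a plain bl only reaches +-128MB;
    // far_branches() decides which form we need.)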
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    blr(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}

void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    br(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}

void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;

  ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
  cmp(sp, rscratch1);
  br(Assembler::LO, no_reserved_zone_enabling);

  enter();   // LR and FP are live.
  lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
  mov(c_rarg0, rthread);
  blr(rscratch1);
  leave();

  // We have already removed our own frame.
  // throw_delayed_StackOverflowError will think that it's been
  // called by our caller.
  lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
  br(rscratch1);
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markWord::age_shift == markWord::lock_bits + markWord::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markWord::biased_lock_mask_in_place);
  cmp(tmp_reg, (u1)markWord::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
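  // (For context: with biased locking the 64-bit mark word is laid out
  // roughly as [JavaThread*:54 | epoch:2 | age:4 | biased_lock:1 | lock:2],
  // per markWord.hpp, which is what lets the eor/andr sequence below compare
  // owner, epoch and pattern bits in one pass.)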
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  andr(tmp_reg, tmp_reg, ~((int) markWord::age_mask_in_place));
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markWord::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markWord::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markWord::biased_lock_mask_in_place | markWord::age_mask_in_place | markWord::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1, rscratch2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markWord::biased_lock_mask_in_place);
  cmp(temp_reg, (u1)markWord::biased_lock_pattern);
  br(Assembler::EQ, done);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result,   "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline.  If the code cache is small
// trampolines won't be emitted.
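// (A bl instruction encodes a signed 26-bit word offset, i.e. about +-128MB
// of reach; a code cache larger than that needs the indirect trampoline
// emitted below, which is the case far_branches() tests for.)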
address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  // We need a trampoline if branches are far.
  if (far_branches()) {
    bool in_scratch_emit_size = false;
#ifdef COMPILER2
    // We don't want to emit a trampoline if C2 is generating dummy
    // code during its branch shortening phase.
    CompileTask* task = ciEnv::current()->task();
    in_scratch_emit_size =
      (task != NULL && is_c2_compile(task->comp_level()) &&
       Compile::current()->in_scratch_emit_size());
#endif
    if (!in_scratch_emit_size) {
      address stub = emit_trampoline_stub(offset(), entry.target());
      if (stub == NULL) {
        return NULL; // CodeCache is full
      }
    }
  }

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
  if (!far_branches()) {
    bl(entry.target());
  } else {
    bl(pc());
  }
  // just need to return a non-null address
  return pc();
}


// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)

address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  // Max stub size: alignment nop, TrampolineStub.
  address stub = start_a_stub(NativeInstruction::instruction_size
                              + NativeCallTrampolineStub::instruction_size);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call
  // - call
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}

void MacroAssembler::emit_static_call_stub() {
  // CompiledDirectStaticCall::set_to_interpreted knows the
  // exact layout of this stub.

  isb();
  mov_metadata(rmethod, (Metadata*)NULL);

  // Jump to the entry point of the i2c stub.
  movptr(rscratch1, 0);
  br(rscratch1);
}

void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
  tst(x, 0xff);
  cset(x, Assembler::NE);
}

address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step = itableOffsetEntry::size() * wordSize;
  int vte_size = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
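    // (Sketch of the layout being walked: the itable is a NULL-terminated
    // sequence of (interface, offset) itableOffsetEntry pairs, followed by
    // the itableMethodEntry blocks those offsets point at; see the loop
    // pseudocode just below.)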
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  if (return_method) {
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result,
        form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
  }
}

void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);
  cmp(super_klass, rscratch1); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    subs(zr, super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// scans count pointer sized words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

// scans count 4 byte words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
                                 Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldrw(scratch, post(addr, wordSize));
  cmpw(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  BLOCK_COMMENT("check_klass_subtype_slow_path");

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
  assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)

  RegSet pushed_registers;
  if (!IS_A_TEMP(r2))    pushed_registers += r2;
  if (!IS_A_TEMP(r5))    pushed_registers += r5;

  if (super_klass != r0 || UseCompressedOops) {
    if (!IS_A_TEMP(r0))   pushed_registers += r0;
  }

  push(pushed_registers, sp);

  // Get super_klass value into r0 (even if it was in r5 or r2).
  if (super_klass != r0) {
    mov(r0, super_klass);
  }

#ifndef PRODUCT
  mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
  Address pst_counter_addr(rscratch2);
  ldr(rscratch1, pst_counter_addr);
  add(rscratch1, rscratch1, 1);
  str(rscratch1, pst_counter_addr);
#endif //PRODUCT

  // We will consult the secondary-super array.
  ldr(r5, secondary_supers_addr);
  // Load the array length.
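  // (An Array<Klass*> stores its length word first, with the elements
  // starting at base_offset_in_bytes(); hence the ldrw of the length and
  // the skip-to-data add that follow.)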
  ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  add(r5, r5, Array<Klass*>::base_offset_in_bytes());

  cmp(sp, zr); // Clear Z flag; SP is never zero
  // Scan R2 words at [R5] for an occurrence of R0.
  // Set NZ/Z based on last compare.
  repne_scan(r5, r0, r2, rscratch1);

  // Unspill the temp. registers:
  pop(pushed_registers, sp);

  br(Assembler::NE, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  str(super_klass, super_cache_addr);

  if (L_success != &L_fallthrough) {
    b(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}

void MacroAssembler::clinit_barrier(Register klass, Register scratch, Label* L_fast_path, Label* L_slow_path) {
  assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
  assert_different_registers(klass, rthread, scratch);

  Label L_fallthrough, L_tmp;
  if (L_fast_path == NULL) {
    L_fast_path = &L_fallthrough;
  } else if (L_slow_path == NULL) {
    L_slow_path = &L_fallthrough;
  }
  // Fast path check: class is fully initialized
  ldrb(scratch, Address(klass, InstanceKlass::init_state_offset()));
  subs(zr, scratch, InstanceKlass::fully_initialized);
  br(Assembler::EQ, *L_fast_path);

  // Fast path check: current thread is initializer thread
  ldr(scratch, Address(klass, InstanceKlass::init_thread_offset()));
  cmp(rthread, scratch);

  if (L_slow_path == &L_fallthrough) {
    br(Assembler::EQ, *L_fast_path);
    bind(*L_slow_path);
  } else if (L_fast_path == &L_fallthrough) {
    br(Assembler::NE, *L_slow_path);
    bind(*L_fast_path);
  } else {
    Unimplemented();
  }
}

void MacroAssembler::verify_oop(Register reg, const char* s) {
  if (!VerifyOops) return;

  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s", reg->name(), s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  mov(r0, reg);
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop");
}

void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops) return;

  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s", s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop_addr {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  // addr may contain sp so we will have to adjust it based on the
  // pushes that we just did.
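  // (The two stp instructions above lowered sp by four words, so an
  // sp-relative addr now needs a 4 * wordSize correction, which is the
  // adjustment applied in the sp-relative case below.)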
  if (addr.uses(sp)) {
    lea(r0, addr);
    ldr(r0, Address(r0, 4 * wordSize));
  } else {
    ldr(r0, addr);
  }
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop_addr");
}

Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
                                         int extra_slot_offset) {
  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
  int stackElementSize = Interpreter::stackElementSize;
  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
#ifdef ASSERT
  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
  assert(offset1 - offset == stackElementSize, "correct arithmetic");
#endif
  if (arg_slot.is_constant()) {
    return Address(esp, arg_slot.as_constant() * stackElementSize
                   + offset);
  } else {
    add(rscratch1, esp, arg_slot.as_register(),
        ext::uxtx, exact_log2(stackElementSize));
    return Address(rscratch1, offset);
  }
}

void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments,
                                       Label *retaddr) {
  Label E, L;

  stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));

  mov(rscratch1, entry_point);
  blr(rscratch1);
  if (retaddr)
    bind(*retaddr);

  ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
  maybe_isb();
}

void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  call_VM_leaf_base(entry_point, number_of_arguments);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
                                  Register arg_1, Register arg_2) {
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  pass_arg2(this, arg_2);
  call_VM_leaf_base(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
  assert(arg_0 != c_rarg3, "smashed arg");
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 4);
}

void MacroAssembler::null_check(Register reg, int offset) {
  if (needs_explicit_null_check(offset)) {
    // provoke OS NULL exception if reg = NULL by
    // accessing M[reg] w/o changing any registers
    // NOTE: this is plenty to provoke a segv
    ldr(zr, Address(reg));
  } else {
    // nothing to do, (later) access of M[reg + offset]
    // will provoke OS NULL exception if reg = NULL
  }
}

// MacroAssembler protected routines needed to implement
// public methods

void MacroAssembler::mov(Register r, Address dest) {
  code_section()->relocate(pc(), dest.rspec());
  u_int64_t imm64 = (u_int64_t)dest.target();
  movptr(r, imm64);
}

// Move a constant pointer into r.  In AArch64 mode the virtual
// address space is 48 bits in size, so we only need three
// instructions to create a patchable instruction sequence that can
// reach anywhere.
void MacroAssembler::movptr(Register r, uintptr_t imm64) {
#ifndef PRODUCT
  {
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
    block_comment(buffer);
  }
#endif
  assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
  movz(r, imm64 & 0xffff);
  imm64 >>= 16;
  movk(r, imm64 & 0xffff, 16);
  imm64 >>= 16;
  movk(r, imm64 & 0xffff, 32);
}

// Macro to mov replicated immediate to vector register.
1533 // Vd will get the following values for different arrangements in T 1534 // imm32 == hex 000000gh T8B: Vd = ghghghghghghghgh 1535 // imm32 == hex 000000gh T16B: Vd = ghghghghghghghghghghghghghghghgh 1536 // imm32 == hex 0000efgh T4H: Vd = efghefghefghefgh 1537 // imm32 == hex 0000efgh T8H: Vd = efghefghefghefghefghefghefghefgh 1538 // imm32 == hex abcdefgh T2S: Vd = abcdefghabcdefgh 1539 // imm32 == hex abcdefgh T4S: Vd = abcdefghabcdefghabcdefghabcdefgh 1540 // T1D/T2D: invalid 1541 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) { 1542 assert(T != T1D && T != T2D, "invalid arrangement"); 1543 if (T == T8B || T == T16B) { 1544 assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)"); 1545 movi(Vd, T, imm32 & 0xff, 0); 1546 return; 1547 } 1548 u_int32_t nimm32 = ~imm32; 1549 if (T == T4H || T == T8H) { 1550 assert((imm32 & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)"); 1551 imm32 &= 0xffff; 1552 nimm32 &= 0xffff; 1553 } 1554 u_int32_t x = imm32; 1555 int movi_cnt = 0; 1556 int movn_cnt = 0; 1557 while (x) { if (x & 0xff) movi_cnt++; x >>= 8; } 1558 x = nimm32; 1559 while (x) { if (x & 0xff) movn_cnt++; x >>= 8; } 1560 if (movn_cnt < movi_cnt) imm32 = nimm32; 1561 unsigned lsl = 0; 1562 while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; } 1563 if (movn_cnt < movi_cnt) 1564 mvni(Vd, T, imm32 & 0xff, lsl); 1565 else 1566 movi(Vd, T, imm32 & 0xff, lsl); 1567 imm32 >>= 8; lsl += 8; 1568 while (imm32) { 1569 while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; } 1570 if (movn_cnt < movi_cnt) 1571 bici(Vd, T, imm32 & 0xff, lsl); 1572 else 1573 orri(Vd, T, imm32 & 0xff, lsl); 1574 lsl += 8; imm32 >>= 8; 1575 } 1576 } 1577 1578 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64) 1579 { 1580 #ifndef PRODUCT 1581 { 1582 char buffer[64]; 1583 snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64); 1584 block_comment(buffer); 1585 } 1586 #endif 1587 if (operand_valid_for_logical_immediate(false, imm64)) { 1588 orr(dst, zr, imm64); 1589 } else { 1590 // we can use a combination of MOVZ or MOVN with 1591 // MOVK to build up the constant 1592 u_int64_t imm_h[4]; 1593 int zero_count = 0; 1594 int neg_count = 0; 1595 int i; 1596 for (i = 0; i < 4; i++) { 1597 imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL); 1598 if (imm_h[i] == 0) { 1599 zero_count++; 1600 } else if (imm_h[i] == 0xffffL) { 1601 neg_count++; 1602 } 1603 } 1604 if (zero_count == 4) { 1605 // one MOVZ will do 1606 movz(dst, 0); 1607 } else if (neg_count == 4) { 1608 // one MOVN will do 1609 movn(dst, 0); 1610 } else if (zero_count == 3) { 1611 for (i = 0; i < 4; i++) { 1612 if (imm_h[i] != 0L) { 1613 movz(dst, (u_int32_t)imm_h[i], (i << 4)); 1614 break; 1615 } 1616 } 1617 } else if (neg_count == 3) { 1618 // one MOVN will do 1619 for (int i = 0; i < 4; i++) { 1620 if (imm_h[i] != 0xffffL) { 1621 movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4)); 1622 break; 1623 } 1624 } 1625 } else if (zero_count == 2) { 1626 // one MOVZ and one MOVK will do 1627 for (i = 0; i < 3; i++) { 1628 if (imm_h[i] != 0L) { 1629 movz(dst, (u_int32_t)imm_h[i], (i << 4)); 1630 i++; 1631 break; 1632 } 1633 } 1634 for (;i < 4; i++) { 1635 if (imm_h[i] != 0L) { 1636 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1637 } 1638 } 1639 } else if (neg_count == 2) { 1640 // one MOVN and one MOVK will do 1641 for (i = 0; i < 4; i++) { 1642 if (imm_h[i] != 0xffffL) { 1643 movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4)); 1644 i++; 1645 break; 1646 } 1647 } 1648 for (;i < 4; i++) { 1649 if 
(imm_h[i] != 0xffffL) { 1650 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1651 } 1652 } 1653 } else if (zero_count == 1) { 1654 // one MOVZ and two MOVKs will do 1655 for (i = 0; i < 4; i++) { 1656 if (imm_h[i] != 0L) { 1657 movz(dst, (u_int32_t)imm_h[i], (i << 4)); 1658 i++; 1659 break; 1660 } 1661 } 1662 for (;i < 4; i++) { 1663 if (imm_h[i] != 0x0L) { 1664 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1665 } 1666 } 1667 } else if (neg_count == 1) { 1668 // one MOVN and two MOVKs will do 1669 for (i = 0; i < 4; i++) { 1670 if (imm_h[i] != 0xffffL) { 1671 movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4)); 1672 i++; 1673 break; 1674 } 1675 } 1676 for (;i < 4; i++) { 1677 if (imm_h[i] != 0xffffL) { 1678 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1679 } 1680 } 1681 } else { 1682 // use a MOVZ and 3 MOVKs (makes it easier to debug) 1683 movz(dst, (u_int32_t)imm_h[0], 0); 1684 for (i = 1; i < 4; i++) { 1685 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1686 } 1687 } 1688 } 1689 } 1690 1691 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32) 1692 { 1693 #ifndef PRODUCT 1694 { 1695 char buffer[64]; 1696 snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32); 1697 block_comment(buffer); 1698 } 1699 #endif 1700 if (operand_valid_for_logical_immediate(true, imm32)) { 1701 orrw(dst, zr, imm32); 1702 } else { 1703 // we can use MOVZ, MOVN or two calls to MOVK to build up the 1704 // constant 1705 u_int32_t imm_h[2]; 1706 imm_h[0] = imm32 & 0xffff; 1707 imm_h[1] = ((imm32 >> 16) & 0xffff); 1708 if (imm_h[0] == 0) { 1709 movzw(dst, imm_h[1], 16); 1710 } else if (imm_h[0] == 0xffff) { 1711 movnw(dst, imm_h[1] ^ 0xffff, 16); 1712 } else if (imm_h[1] == 0) { 1713 movzw(dst, imm_h[0], 0); 1714 } else if (imm_h[1] == 0xffff) { 1715 movnw(dst, imm_h[0] ^ 0xffff, 0); 1716 } else { 1717 // use a MOVZ and MOVK (makes it easier to debug) 1718 movzw(dst, imm_h[0], 0); 1719 movkw(dst, imm_h[1], 16); 1720 } 1721 } 1722 } 1723 1724 // Form an address from base + offset in Rd. Rd may or may 1725 // not actually be used: you must use the Address that is returned. 1726 // It is up to you to ensure that the shift provided matches the size 1727 // of your data. 
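// For example (a sketch, with shift == 3 for an 8-byte datum; r10 is
// just an illustrative destination register):
//
//   form_address(r10, r1, 24, 3)       => Address(r1, 24)    // fits the scaled immediate
//   form_address(r10, r1, 1 << 20, 3)  => add(r10, r1, 1 << 20); Address(r10, 0)
//   form_address(r10, r1, -8, 3)       => mov(r10, -8); add(r10, r1, r10); Address(r10)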
Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
  if (Address::offset_ok_for_immed(byte_offset, shift))
    // It fits; no need for any heroics
    return Address(base, byte_offset);

  // Don't do anything clever with negative or misaligned offsets
  unsigned mask = (1 << shift) - 1;
  if (byte_offset < 0 || byte_offset & mask) {
    mov(Rd, byte_offset);
    add(Rd, base, Rd);
    return Address(Rd);
  }

  // See if we can do this with two 12-bit offsets
  {
    unsigned long word_offset = byte_offset >> shift;
    unsigned long masked_offset = word_offset & 0xfff000;
    if (Address::offset_ok_for_immed(word_offset - masked_offset, 0)
        && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
      add(Rd, base, masked_offset << shift);
      word_offset -= masked_offset;
      return Address(Rd, word_offset << shift);
    }
  }

  // Do it the hard way
  mov(Rd, byte_offset);
  add(Rd, base, Rd);
  return Address(Rd);
}

void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
  if (UseLSE) {
    mov(tmp, 1);
    ldadd(Assembler::word, tmp, zr, counter_addr);
    return;
  }
  Label retry_load;
  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
    prfm(Address(counter_addr), PSTL1STRM);
  bind(retry_load);
  // flush and load exclusive from the memory location
  ldxrw(tmp, counter_addr);
  addw(tmp, tmp, 1);
  // if we store+flush with no intervening write tmp will be zero
  stxrw(tmp2, tmp, counter_addr);
  cbnzw(tmp2, retry_load);
}


int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
                                    bool want_remainder, Register scratch)
{
  // Full implementation of Java idiv and irem.  The function
  // returns the (pc) offset of the div instruction - may be needed
  // for implicit exceptions.
  //
  // constraint : ra/rb =/= scratch
  //         normal case
  //
  // input : ra: dividend
  //         rb: divisor
  //
  // result: either
  //         quotient  (= ra idiv rb)
  //         remainder (= ra irem rb)

  assert(ra != scratch && rb != scratch, "reg cannot be scratch");

  int idivl_offset = offset();
  if (! want_remainder) {
    sdivw(result, ra, rb);
  } else {
    sdivw(scratch, ra, rb);
    Assembler::msubw(result, scratch, rb, ra);
  }

  return idivl_offset;
}

int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
                                    bool want_remainder, Register scratch)
{
  // Full implementation of Java ldiv and lrem.  The function
  // returns the (pc) offset of the div instruction - may be needed
  // for implicit exceptions.
  //
  // constraint : ra/rb =/= scratch
  //         normal case
  //
  // input : ra: dividend
  //         rb: divisor
  //
  // result: either
  //         quotient  (= ra idiv rb)
  //         remainder (= ra irem rb)

  assert(ra != scratch && rb != scratch, "reg cannot be scratch");

  int idivq_offset = offset();
  if (!
want_remainder) { 1829 sdiv(result, ra, rb); 1830 } else { 1831 sdiv(scratch, ra, rb); 1832 Assembler::msub(result, scratch, rb, ra); 1833 } 1834 1835 return idivq_offset; 1836 } 1837 1838 void MacroAssembler::membar(Membar_mask_bits order_constraint) { 1839 address prev = pc() - NativeMembar::instruction_size; 1840 address last = code()->last_insn(); 1841 if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) { 1842 NativeMembar *bar = NativeMembar_at(prev); 1843 // We are merging two memory barrier instructions. On AArch64 we 1844 // can do this simply by ORing them together. 1845 bar->set_kind(bar->get_kind() | order_constraint); 1846 BLOCK_COMMENT("merged membar"); 1847 } else { 1848 code()->set_last_insn(pc()); 1849 dmb(Assembler::barrier(order_constraint)); 1850 } 1851 } 1852 1853 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) { 1854 if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) { 1855 merge_ldst(rt, adr, size_in_bytes, is_store); 1856 code()->clear_last_insn(); 1857 return true; 1858 } else { 1859 assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported."); 1860 const unsigned mask = size_in_bytes - 1; 1861 if (adr.getMode() == Address::base_plus_offset && 1862 (adr.offset() & mask) == 0) { // only supports base_plus_offset. 1863 code()->set_last_insn(pc()); 1864 } 1865 return false; 1866 } 1867 } 1868 1869 void MacroAssembler::ldr(Register Rx, const Address &adr) { 1870 // We always try to merge two adjacent loads into one ldp. 1871 if (!try_merge_ldst(Rx, adr, 8, false)) { 1872 Assembler::ldr(Rx, adr); 1873 } 1874 } 1875 1876 void MacroAssembler::ldrw(Register Rw, const Address &adr) { 1877 // We always try to merge two adjacent loads into one ldp. 1878 if (!try_merge_ldst(Rw, adr, 4, false)) { 1879 Assembler::ldrw(Rw, adr); 1880 } 1881 } 1882 1883 void MacroAssembler::str(Register Rx, const Address &adr) { 1884 // We always try to merge two adjacent stores into one stp. 1885 if (!try_merge_ldst(Rx, adr, 8, true)) { 1886 Assembler::str(Rx, adr); 1887 } 1888 } 1889 1890 void MacroAssembler::strw(Register Rw, const Address &adr) { 1891 // We always try to merge two adjacent stores into one stp. 1892 if (!try_merge_ldst(Rw, adr, 4, true)) { 1893 Assembler::strw(Rw, adr); 1894 } 1895 } 1896 1897 // MacroAssembler routines found actually to be needed 1898 1899 void MacroAssembler::push(Register src) 1900 { 1901 str(src, Address(pre(esp, -1 * wordSize))); 1902 } 1903 1904 void MacroAssembler::pop(Register dst) 1905 { 1906 ldr(dst, Address(post(esp, 1 * wordSize))); 1907 } 1908 1909 // Note: load_unsigned_short used to be called load_unsigned_word. 
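// Each of the following loads returns the code-buffer offset of the
// load instruction itself, so a caller that needs the exact PC of the
// memory access (e.g. to record an implicit null-check site) can
// capture it, roughly:
//
//   int off = load_unsigned_short(r0, Address(robj, field_offset));
//
// (robj and field_offset are illustrative names, not fixed registers.)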
1910 int MacroAssembler::load_unsigned_short(Register dst, Address src) { 1911 int off = offset(); 1912 ldrh(dst, src); 1913 return off; 1914 } 1915 1916 int MacroAssembler::load_unsigned_byte(Register dst, Address src) { 1917 int off = offset(); 1918 ldrb(dst, src); 1919 return off; 1920 } 1921 1922 int MacroAssembler::load_signed_short(Register dst, Address src) { 1923 int off = offset(); 1924 ldrsh(dst, src); 1925 return off; 1926 } 1927 1928 int MacroAssembler::load_signed_byte(Register dst, Address src) { 1929 int off = offset(); 1930 ldrsb(dst, src); 1931 return off; 1932 } 1933 1934 int MacroAssembler::load_signed_short32(Register dst, Address src) { 1935 int off = offset(); 1936 ldrshw(dst, src); 1937 return off; 1938 } 1939 1940 int MacroAssembler::load_signed_byte32(Register dst, Address src) { 1941 int off = offset(); 1942 ldrsbw(dst, src); 1943 return off; 1944 } 1945 1946 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) { 1947 switch (size_in_bytes) { 1948 case 8: ldr(dst, src); break; 1949 case 4: ldrw(dst, src); break; 1950 case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break; 1951 case 1: is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break; 1952 default: ShouldNotReachHere(); 1953 } 1954 } 1955 1956 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) { 1957 switch (size_in_bytes) { 1958 case 8: str(src, dst); break; 1959 case 4: strw(src, dst); break; 1960 case 2: strh(src, dst); break; 1961 case 1: strb(src, dst); break; 1962 default: ShouldNotReachHere(); 1963 } 1964 } 1965 1966 void MacroAssembler::decrementw(Register reg, int value) 1967 { 1968 if (value < 0) { incrementw(reg, -value); return; } 1969 if (value == 0) { return; } 1970 if (value < (1 << 12)) { subw(reg, reg, value); return; } 1971 /* else */ { 1972 guarantee(reg != rscratch2, "invalid dst for register decrement"); 1973 movw(rscratch2, (unsigned)value); 1974 subw(reg, reg, rscratch2); 1975 } 1976 } 1977 1978 void MacroAssembler::decrement(Register reg, int value) 1979 { 1980 if (value < 0) { increment(reg, -value); return; } 1981 if (value == 0) { return; } 1982 if (value < (1 << 12)) { sub(reg, reg, value); return; } 1983 /* else */ { 1984 assert(reg != rscratch2, "invalid dst for register decrement"); 1985 mov(rscratch2, (unsigned long)value); 1986 sub(reg, reg, rscratch2); 1987 } 1988 } 1989 1990 void MacroAssembler::decrementw(Address dst, int value) 1991 { 1992 assert(!dst.uses(rscratch1), "invalid dst for address decrement"); 1993 if (dst.getMode() == Address::literal) { 1994 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 1995 lea(rscratch2, dst); 1996 dst = Address(rscratch2); 1997 } 1998 ldrw(rscratch1, dst); 1999 decrementw(rscratch1, value); 2000 strw(rscratch1, dst); 2001 } 2002 2003 void MacroAssembler::decrement(Address dst, int value) 2004 { 2005 assert(!dst.uses(rscratch1), "invalid address for decrement"); 2006 if (dst.getMode() == Address::literal) { 2007 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 2008 lea(rscratch2, dst); 2009 dst = Address(rscratch2); 2010 } 2011 ldr(rscratch1, dst); 2012 decrement(rscratch1, value); 2013 str(rscratch1, dst); 2014 } 2015 2016 void MacroAssembler::incrementw(Register reg, int value) 2017 { 2018 if (value < 0) { decrementw(reg, -value); return; } 2019 if (value == 0) { return; } 2020 if (value < (1 << 12)) { 
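  // value fits the 12-bit unsigned immediate form of add/sub, so a
  // single instruction suffices: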
addw(reg, reg, value); return; }
  /* else */ {
    assert(reg != rscratch2, "invalid dst for register increment");
    movw(rscratch2, (unsigned)value);
    addw(reg, reg, rscratch2);
  }
}

void MacroAssembler::increment(Register reg, int value)
{
  if (value < 0) { decrement(reg, -value); return; }
  if (value == 0) { return; }
  if (value < (1 << 12)) { add(reg, reg, value); return; }
  /* else */ {
    assert(reg != rscratch2, "invalid dst for register increment");
    movw(rscratch2, (unsigned)value);
    add(reg, reg, rscratch2);
  }
}

void MacroAssembler::incrementw(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid dst for address increment");
  if (dst.getMode() == Address::literal) {
    assert(abs(value) < (1 << 12), "invalid value and address mode combination");
    lea(rscratch2, dst);
    dst = Address(rscratch2);
  }
  ldrw(rscratch1, dst);
  incrementw(rscratch1, value);
  strw(rscratch1, dst);
}

void MacroAssembler::increment(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid dst for address increment");
  if (dst.getMode() == Address::literal) {
    assert(abs(value) < (1 << 12), "invalid value and address mode combination");
    lea(rscratch2, dst);
    dst = Address(rscratch2);
  }
  ldr(rscratch1, dst);
  increment(rscratch1, value);
  str(rscratch1, dst);
}


void MacroAssembler::pusha() {
  push(0x7fffffff, sp);
}

void MacroAssembler::popa() {
  pop(0x7fffffff, sp);
}

// Push lots of registers in the bit set supplied.  Don't push sp.
// Return the number of words pushed
int MacroAssembler::push(unsigned int bitset, Register stack) {
  int words_pushed = 0;

  // Scan bitset to accumulate register pairs
  unsigned char regs[32];
  int count = 0;
  for (int reg = 0; reg <= 30; reg++) {
    if (1 & bitset)
      regs[count++] = reg;
    bitset >>= 1;
  }
  regs[count++] = zr->encoding_nocheck();
  count &= ~1;  // Only push an even number of regs

  if (count) {
    stp(as_Register(regs[0]), as_Register(regs[1]),
        Address(pre(stack, -count * wordSize)));
    words_pushed += 2;
  }
  for (int i = 2; i < count; i += 2) {
    stp(as_Register(regs[i]), as_Register(regs[i+1]),
        Address(stack, i * wordSize));
    words_pushed += 2;
  }

  assert(words_pushed == count, "oops, pushed != count");

  return count;
}

int MacroAssembler::pop(unsigned int bitset, Register stack) {
  int words_pushed = 0;

  // Scan bitset to accumulate register pairs
  unsigned char regs[32];
  int count = 0;
  for (int reg = 0; reg <= 30; reg++) {
    if (1 & bitset)
      regs[count++] = reg;
    bitset >>= 1;
  }
  regs[count++] = zr->encoding_nocheck();
  count &= ~1;

  for (int i = 2; i < count; i += 2) {
    ldp(as_Register(regs[i]), as_Register(regs[i+1]),
        Address(stack, i * wordSize));
    words_pushed += 2;
  }
  if (count) {
    ldp(as_Register(regs[0]), as_Register(regs[1]),
        Address(post(stack, count * wordSize)));
    words_pushed += 2;
  }

  assert(words_pushed == count, "oops, pushed != count");

  return count;
}

// Push lots of registers in the bit set supplied.  Don't push sp.
2138 // Return the number of words pushed 2139 int MacroAssembler::push_fp(unsigned int bitset, Register stack) { 2140 int words_pushed = 0; 2141 2142 // Scan bitset to accumulate register pairs 2143 unsigned char regs[32]; 2144 int count = 0; 2145 for (int reg = 0; reg <= 31; reg++) { 2146 if (1 & bitset) 2147 regs[count++] = reg; 2148 bitset >>= 1; 2149 } 2150 regs[count++] = zr->encoding_nocheck(); 2151 count &= ~1; // Only push an even number of regs 2152 2153 // Always pushing full 128 bit registers. 2154 if (count) { 2155 stpq(as_FloatRegister(regs[0]), as_FloatRegister(regs[1]), Address(pre(stack, -count * wordSize * 2))); 2156 words_pushed += 2; 2157 } 2158 for (int i = 2; i < count; i += 2) { 2159 stpq(as_FloatRegister(regs[i]), as_FloatRegister(regs[i+1]), Address(stack, i * wordSize * 2)); 2160 words_pushed += 2; 2161 } 2162 2163 assert(words_pushed == count, "oops, pushed != count"); 2164 return count; 2165 } 2166 2167 int MacroAssembler::pop_fp(unsigned int bitset, Register stack) { 2168 int words_pushed = 0; 2169 2170 // Scan bitset to accumulate register pairs 2171 unsigned char regs[32]; 2172 int count = 0; 2173 for (int reg = 0; reg <= 31; reg++) { 2174 if (1 & bitset) 2175 regs[count++] = reg; 2176 bitset >>= 1; 2177 } 2178 regs[count++] = zr->encoding_nocheck(); 2179 count &= ~1; 2180 2181 for (int i = 2; i < count; i += 2) { 2182 ldpq(as_FloatRegister(regs[i]), as_FloatRegister(regs[i+1]), Address(stack, i * wordSize * 2)); 2183 words_pushed += 2; 2184 } 2185 if (count) { 2186 ldpq(as_FloatRegister(regs[0]), as_FloatRegister(regs[1]), Address(post(stack, count * wordSize * 2))); 2187 words_pushed += 2; 2188 } 2189 2190 assert(words_pushed == count, "oops, pushed != count"); 2191 2192 return count; 2193 } 2194 2195 #ifdef ASSERT 2196 void MacroAssembler::verify_heapbase(const char* msg) { 2197 #if 0 2198 assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed"); 2199 assert (Universe::heap() != NULL, "java heap should be initialized"); 2200 if (CheckCompressedOops) { 2201 Label ok; 2202 push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1 2203 cmpptr(rheapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr())); 2204 br(Assembler::EQ, ok); 2205 stop(msg); 2206 bind(ok); 2207 pop(1 << rscratch1->encoding(), sp); 2208 } 2209 #endif 2210 } 2211 #endif 2212 2213 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) { 2214 Label done, not_weak; 2215 cbz(value, done); // Use NULL as-is. 2216 2217 STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u); 2218 tbz(r0, 0, not_weak); // Test for jweak tag. 2219 2220 // Resolve jweak. 2221 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value, 2222 Address(value, -JNIHandles::weak_tag_value), tmp, thread); 2223 verify_oop(value); 2224 b(done); 2225 2226 bind(not_weak); 2227 // Resolve (untagged) jobject. 
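  // n.b. a jweak is distinguished from a regular jobject by the low
  // tag bit (JNIHandles::weak_tag_mask == 1); the weak path above
  // folds the -weak_tag_value displacement into the address to strip
  // that bit, so here the handle is dereferenced untagged at offset 0.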
  access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
  verify_oop(value);
  bind(done);
}

void MacroAssembler::stop(const char* msg) {
  address ip = pc();
  pusha();
  mov(c_rarg0, (address)msg);
  mov(c_rarg1, (address)ip);
  mov(c_rarg2, sp);
  mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
  blr(c_rarg3);
  hlt(0);
}

void MacroAssembler::warn(const char* msg) {
  pusha();
  mov(c_rarg0, (address)msg);
  mov(lr, CAST_FROM_FN_PTR(address, warning));
  blr(lr);
  popa();
}

void MacroAssembler::unimplemented(const char* what) {
  const char* buf = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("unimplemented: %s", what);
    buf = code_string(ss.as_string());
  }
  stop(buf);
}

// If a constant does not fit in an immediate field, generate some
// number of MOV instructions and then perform the operation.
void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
                                           add_sub_imm_insn insn1,
                                           add_sub_reg_insn insn2) {
  assert(Rd != zr, "Rd = zr and not setting flags?");
  if (operand_valid_for_add_sub_immediate((int)imm)) {
    (this->*insn1)(Rd, Rn, imm);
  } else {
    if (uabs(imm) < (1 << 24)) {
      (this->*insn1)(Rd, Rn, imm & -(1 << 12));
      (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
    } else {
      assert_different_registers(Rd, Rn);
      mov(Rd, (uint64_t)imm);
      (this->*insn2)(Rd, Rn, Rd, LSL, 0);
    }
  }
}

// Separate version which sets the flags. Optimisations are more restricted
// because we must set the flags correctly.
void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
                                             add_sub_imm_insn insn1,
                                             add_sub_reg_insn insn2) {
  if (operand_valid_for_add_sub_immediate((int)imm)) {
    (this->*insn1)(Rd, Rn, imm);
  } else {
    assert_different_registers(Rd, Rn);
    assert(Rd != zr, "overflow in immediate operand");
    mov(Rd, (uint64_t)imm);
    (this->*insn2)(Rd, Rn, Rd, LSL, 0);
  }
}


void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
  if (increment.is_register()) {
    add(Rd, Rn, increment.as_register());
  } else {
    add(Rd, Rn, increment.as_constant());
  }
}

void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
  if (increment.is_register()) {
    addw(Rd, Rn, increment.as_register());
  } else {
    addw(Rd, Rn, increment.as_constant());
  }
}

void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
  if (decrement.is_register()) {
    sub(Rd, Rn, decrement.as_register());
  } else {
    sub(Rd, Rn, decrement.as_constant());
  }
}

void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
  if (decrement.is_register()) {
    subw(Rd, Rn, decrement.as_register());
  } else {
    subw(Rd, Rn, decrement.as_constant());
  }
}

void MacroAssembler::reinit_heapbase()
{
  if (UseCompressedOops) {
    if (Universe::is_fully_initialized()) {
      mov(rheapbase, CompressedOops::ptrs_base());
    } else {
      lea(rheapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
      ldr(rheapbase, Address(rheapbase));
    }
  }
}

// this simulates the behaviour of the x86 cmpxchg instruction using a
// load linked/store conditional pair. we use the acquire/release
// versions of these instructions so that we flush pending writes as
// per Java semantics.

// n.b. the x86 version assumes the old value to be compared against is
// in rax and updates rax with the value located in memory if the
// cmpxchg fails. we supply a register for the old value explicitly

// the aarch64 load linked/store conditional instructions do not
// accept an offset. so, unlike x86, we must provide a plain register
// to identify the memory word to be compared/exchanged rather than a
// register+offset Address.

void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
                                Label &succeed, Label *fail) {
  // oldv holds comparison value
  // newv holds value to write in exchange
  // addr identifies memory word to compare against/update
  if (UseLSE) {
    mov(tmp, oldv);
    casal(Assembler::xword, oldv, newv, addr);
    cmp(tmp, oldv);
    br(Assembler::EQ, succeed);
    membar(AnyAny);
  } else {
    Label retry_load, nope;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    // flush and load exclusive from the memory location
    // and fail if it is not what we expect
    ldaxr(tmp, addr);
    cmp(tmp, oldv);
    br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
    stlxr(tmp, newv, addr);
    cbzw(tmp, succeed);
    // retry so we only ever return after a load fails to compare
    // ensures we don't return a stale value after a failed write.
    b(retry_load);
    // if the memory word differs we return it in oldv and signal a fail
    bind(nope);
    membar(AnyAny);
    mov(oldv, tmp);
  }
  if (fail)
    b(*fail);
}

void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
                                        Label &succeed, Label *fail) {
  assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
  cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
}

void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
                              Label &succeed, Label *fail) {
  // oldv holds comparison value
  // newv holds value to write in exchange
  // addr identifies memory word to compare against/update
  // tmp returns 0/1 for success/failure
  if (UseLSE) {
    mov(tmp, oldv);
    casal(Assembler::word, oldv, newv, addr);
    cmp(tmp, oldv);
    br(Assembler::EQ, succeed);
    membar(AnyAny);
  } else {
    Label retry_load, nope;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    // flush and load exclusive from the memory location
    // and fail if it is not what we expect
    ldaxrw(tmp, addr);
    cmp(tmp, oldv);
    br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
    stlxrw(tmp, newv, addr);
    cbzw(tmp, succeed);
    // retry so we only ever return after a load fails to compare
    // ensures we don't return a stale value after a failed write.
    b(retry_load);
    // if the memory word differs we return it in oldv and signal a fail
    bind(nope);
    membar(AnyAny);
    mov(oldv, tmp);
  }
  if (fail)
    b(*fail);
}

// A generic CAS; success or failure is in the EQ flag.
A weak CAS 2437 // doesn't retry and may fail spuriously. If the oldval is wanted, 2438 // Pass a register for the result, otherwise pass noreg. 2439 2440 // Clobbers rscratch1 2441 void MacroAssembler::cmpxchg(Register addr, Register expected, 2442 Register new_val, 2443 enum operand_size size, 2444 bool acquire, bool release, 2445 bool weak, 2446 Register result) { 2447 if (result == noreg) result = rscratch1; 2448 BLOCK_COMMENT("cmpxchg {"); 2449 if (UseLSE) { 2450 mov(result, expected); 2451 lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true); 2452 compare_eq(result, expected, size); 2453 } else { 2454 Label retry_load, done; 2455 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) 2456 prfm(Address(addr), PSTL1STRM); 2457 bind(retry_load); 2458 load_exclusive(result, addr, size, acquire); 2459 compare_eq(result, expected, size); 2460 br(Assembler::NE, done); 2461 store_exclusive(rscratch1, new_val, addr, size, release); 2462 if (weak) { 2463 cmpw(rscratch1, 0u); // If the store fails, return NE to our caller. 2464 } else { 2465 cbnzw(rscratch1, retry_load); 2466 } 2467 bind(done); 2468 } 2469 BLOCK_COMMENT("} cmpxchg"); 2470 } 2471 2472 // A generic comparison. Only compares for equality, clobbers rscratch1. 2473 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) { 2474 if (size == xword) { 2475 cmp(rm, rn); 2476 } else if (size == word) { 2477 cmpw(rm, rn); 2478 } else if (size == halfword) { 2479 eorw(rscratch1, rm, rn); 2480 ands(zr, rscratch1, 0xffff); 2481 } else if (size == byte) { 2482 eorw(rscratch1, rm, rn); 2483 ands(zr, rscratch1, 0xff); 2484 } else { 2485 ShouldNotReachHere(); 2486 } 2487 } 2488 2489 2490 static bool different(Register a, RegisterOrConstant b, Register c) { 2491 if (b.is_constant()) 2492 return a != c; 2493 else 2494 return a != b.as_register() && a != c && b.as_register() != c; 2495 } 2496 2497 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz) \ 2498 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \ 2499 if (UseLSE) { \ 2500 prev = prev->is_valid() ? prev : zr; \ 2501 if (incr.is_register()) { \ 2502 AOP(sz, incr.as_register(), prev, addr); \ 2503 } else { \ 2504 mov(rscratch2, incr.as_constant()); \ 2505 AOP(sz, rscratch2, prev, addr); \ 2506 } \ 2507 return; \ 2508 } \ 2509 Register result = rscratch2; \ 2510 if (prev->is_valid()) \ 2511 result = different(prev, incr, addr) ? prev : rscratch2; \ 2512 \ 2513 Label retry_load; \ 2514 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) \ 2515 prfm(Address(addr), PSTL1STRM); \ 2516 bind(retry_load); \ 2517 LDXR(result, addr); \ 2518 OP(rscratch1, result, incr); \ 2519 STXR(rscratch2, rscratch1, addr); \ 2520 cbnzw(rscratch2, retry_load); \ 2521 if (prev->is_valid() && prev != result) { \ 2522 IOP(prev, rscratch1, incr); \ 2523 } \ 2524 } 2525 2526 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword) 2527 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word) 2528 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword) 2529 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word) 2530 2531 #undef ATOMIC_OP 2532 2533 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz) \ 2534 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \ 2535 if (UseLSE) { \ 2536 prev = prev->is_valid() ? 
prev : zr; \ 2537 AOP(sz, newv, prev, addr); \ 2538 return; \ 2539 } \ 2540 Register result = rscratch2; \ 2541 if (prev->is_valid()) \ 2542 result = different(prev, newv, addr) ? prev : rscratch2; \ 2543 \ 2544 Label retry_load; \ 2545 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) \ 2546 prfm(Address(addr), PSTL1STRM); \ 2547 bind(retry_load); \ 2548 LDXR(result, addr); \ 2549 STXR(rscratch1, newv, addr); \ 2550 cbnzw(rscratch1, retry_load); \ 2551 if (prev->is_valid() && prev != result) \ 2552 mov(prev, result); \ 2553 } 2554 2555 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword) 2556 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word) 2557 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword) 2558 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word) 2559 2560 #undef ATOMIC_XCHG 2561 2562 #ifndef PRODUCT 2563 extern "C" void findpc(intptr_t x); 2564 #endif 2565 2566 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) 2567 { 2568 // In order to get locks to work, we need to fake a in_VM state 2569 if (ShowMessageBoxOnError ) { 2570 JavaThread* thread = JavaThread::current(); 2571 JavaThreadState saved_state = thread->thread_state(); 2572 thread->set_thread_state(_thread_in_vm); 2573 #ifndef PRODUCT 2574 if (CountBytecodes || TraceBytecodes || StopInterpreterAt) { 2575 ttyLocker ttyl; 2576 BytecodeCounter::print(); 2577 } 2578 #endif 2579 if (os::message_box(msg, "Execution stopped, print registers?")) { 2580 ttyLocker ttyl; 2581 tty->print_cr(" pc = 0x%016lx", pc); 2582 #ifndef PRODUCT 2583 tty->cr(); 2584 findpc(pc); 2585 tty->cr(); 2586 #endif 2587 tty->print_cr(" r0 = 0x%016lx", regs[0]); 2588 tty->print_cr(" r1 = 0x%016lx", regs[1]); 2589 tty->print_cr(" r2 = 0x%016lx", regs[2]); 2590 tty->print_cr(" r3 = 0x%016lx", regs[3]); 2591 tty->print_cr(" r4 = 0x%016lx", regs[4]); 2592 tty->print_cr(" r5 = 0x%016lx", regs[5]); 2593 tty->print_cr(" r6 = 0x%016lx", regs[6]); 2594 tty->print_cr(" r7 = 0x%016lx", regs[7]); 2595 tty->print_cr(" r8 = 0x%016lx", regs[8]); 2596 tty->print_cr(" r9 = 0x%016lx", regs[9]); 2597 tty->print_cr("r10 = 0x%016lx", regs[10]); 2598 tty->print_cr("r11 = 0x%016lx", regs[11]); 2599 tty->print_cr("r12 = 0x%016lx", regs[12]); 2600 tty->print_cr("r13 = 0x%016lx", regs[13]); 2601 tty->print_cr("r14 = 0x%016lx", regs[14]); 2602 tty->print_cr("r15 = 0x%016lx", regs[15]); 2603 tty->print_cr("r16 = 0x%016lx", regs[16]); 2604 tty->print_cr("r17 = 0x%016lx", regs[17]); 2605 tty->print_cr("r18 = 0x%016lx", regs[18]); 2606 tty->print_cr("r19 = 0x%016lx", regs[19]); 2607 tty->print_cr("r20 = 0x%016lx", regs[20]); 2608 tty->print_cr("r21 = 0x%016lx", regs[21]); 2609 tty->print_cr("r22 = 0x%016lx", regs[22]); 2610 tty->print_cr("r23 = 0x%016lx", regs[23]); 2611 tty->print_cr("r24 = 0x%016lx", regs[24]); 2612 tty->print_cr("r25 = 0x%016lx", regs[25]); 2613 tty->print_cr("r26 = 0x%016lx", regs[26]); 2614 tty->print_cr("r27 = 0x%016lx", regs[27]); 2615 tty->print_cr("r28 = 0x%016lx", regs[28]); 2616 tty->print_cr("r30 = 0x%016lx", regs[30]); 2617 tty->print_cr("r31 = 0x%016lx", regs[31]); 2618 BREAKPOINT; 2619 } 2620 } 2621 fatal("DEBUG MESSAGE: %s", msg); 2622 } 2623 2624 void MacroAssembler::push_call_clobbered_registers() { 2625 int step = 4 * wordSize; 2626 push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp); 2627 sub(sp, sp, step); 2628 mov(rscratch1, -step); 2629 // Push v0-v7, v16-v31. 
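  // Each st1 in the loop below stores a quad of registers at [sp] and
  // then steps sp downwards by the negative post-index in rscratch1,
  // so v0..v3 are stored last, by the trailing st1, at the lowest
  // address.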
2630 for (int i = 31; i>= 4; i -= 4) { 2631 if (i <= v7->encoding() || i >= v16->encoding()) 2632 st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1), 2633 as_FloatRegister(i), T1D, Address(post(sp, rscratch1))); 2634 } 2635 st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2), 2636 as_FloatRegister(3), T1D, Address(sp)); 2637 } 2638 2639 void MacroAssembler::pop_call_clobbered_registers() { 2640 for (int i = 0; i < 32; i += 4) { 2641 if (i <= v7->encoding() || i >= v16->encoding()) 2642 ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), 2643 as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize))); 2644 } 2645 2646 pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp); 2647 } 2648 2649 void MacroAssembler::push_CPU_state(bool save_vectors) { 2650 int step = (save_vectors ? 8 : 4) * wordSize; 2651 push(0x3fffffff, sp); // integer registers except lr & sp 2652 mov(rscratch1, -step); 2653 sub(sp, sp, step); 2654 for (int i = 28; i >= 4; i -= 4) { 2655 st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), 2656 as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1))); 2657 } 2658 st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp); 2659 } 2660 2661 void MacroAssembler::pop_CPU_state(bool restore_vectors) { 2662 int step = (restore_vectors ? 8 : 4) * wordSize; 2663 for (int i = 0; i <= 28; i += 4) 2664 ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), 2665 as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step))); 2666 pop(0x3fffffff, sp); // integer registers except lr & sp 2667 } 2668 2669 /** 2670 * Helpers for multiply_to_len(). 2671 */ 2672 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo, 2673 Register src1, Register src2) { 2674 adds(dest_lo, dest_lo, src1); 2675 adc(dest_hi, dest_hi, zr); 2676 adds(dest_lo, dest_lo, src2); 2677 adc(final_dest_hi, dest_hi, zr); 2678 } 2679 2680 // Generate an address from (r + r1 extend offset). "size" is the 2681 // size of the operand. The result may be in rscratch2. 2682 Address MacroAssembler::offsetted_address(Register r, Register r1, 2683 Address::extend ext, int offset, int size) { 2684 if (offset || (ext.shift() % size != 0)) { 2685 lea(rscratch2, Address(r, r1, ext)); 2686 return Address(rscratch2, offset); 2687 } else { 2688 return Address(r, r1, ext); 2689 } 2690 } 2691 2692 Address MacroAssembler::spill_address(int size, int offset, Register tmp) 2693 { 2694 assert(offset >= 0, "spill to negative address?"); 2695 // Offset reachable ? 2696 // Not aligned - 9 bits signed offset 2697 // Aligned - 12 bits unsigned offset shifted 2698 Register base = sp; 2699 if ((offset & (size-1)) && offset >= (1<<8)) { 2700 add(tmp, base, offset & ((1<<12)-1)); 2701 base = tmp; 2702 offset &= -1u<<12; 2703 } 2704 2705 if (offset >= (1<<12) * size) { 2706 add(tmp, base, offset & (((1<<12)-1)<<12)); 2707 base = tmp; 2708 offset &= ~(((1<<12)-1)<<12); 2709 } 2710 2711 return Address(base, offset); 2712 } 2713 2714 // Checks whether offset is aligned. 2715 // Returns true if it is, else false. 2716 bool MacroAssembler::merge_alignment_check(Register base, 2717 size_t size, 2718 long cur_offset, 2719 long prev_offset) const { 2720 if (AvoidUnalignedAccesses) { 2721 if (base == sp) { 2722 // Checks whether low offset if aligned to pair of registers. 2723 long pair_mask = size * 2 - 1; 2724 long offset = prev_offset > cur_offset ? 
cur_offset : prev_offset; 2725 return (offset & pair_mask) == 0; 2726 } else { // If base is not sp, we can't guarantee the access is aligned. 2727 return false; 2728 } 2729 } else { 2730 long mask = size - 1; 2731 // Load/store pair instruction only supports element size aligned offset. 2732 return (cur_offset & mask) == 0 && (prev_offset & mask) == 0; 2733 } 2734 } 2735 2736 // Checks whether current and previous loads/stores can be merged. 2737 // Returns true if it can be merged, else false. 2738 bool MacroAssembler::ldst_can_merge(Register rt, 2739 const Address &adr, 2740 size_t cur_size_in_bytes, 2741 bool is_store) const { 2742 address prev = pc() - NativeInstruction::instruction_size; 2743 address last = code()->last_insn(); 2744 2745 if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) { 2746 return false; 2747 } 2748 2749 if (adr.getMode() != Address::base_plus_offset || prev != last) { 2750 return false; 2751 } 2752 2753 NativeLdSt* prev_ldst = NativeLdSt_at(prev); 2754 size_t prev_size_in_bytes = prev_ldst->size_in_bytes(); 2755 2756 assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging."); 2757 assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging."); 2758 2759 if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) { 2760 return false; 2761 } 2762 2763 long max_offset = 63 * prev_size_in_bytes; 2764 long min_offset = -64 * prev_size_in_bytes; 2765 2766 assert(prev_ldst->is_not_pre_post_index(), "pre-index or post-index is not supported to be merged."); 2767 2768 // Only same base can be merged. 2769 if (adr.base() != prev_ldst->base()) { 2770 return false; 2771 } 2772 2773 long cur_offset = adr.offset(); 2774 long prev_offset = prev_ldst->offset(); 2775 size_t diff = abs(cur_offset - prev_offset); 2776 if (diff != prev_size_in_bytes) { 2777 return false; 2778 } 2779 2780 // Following cases can not be merged: 2781 // ldr x2, [x2, #8] 2782 // ldr x3, [x2, #16] 2783 // or: 2784 // ldr x2, [x3, #8] 2785 // ldr x2, [x3, #16] 2786 // If t1 and t2 is the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL. 2787 if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) { 2788 return false; 2789 } 2790 2791 long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset; 2792 // Offset range must be in ldp/stp instruction's range. 2793 if (low_offset > max_offset || low_offset < min_offset) { 2794 return false; 2795 } 2796 2797 if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) { 2798 return true; 2799 } 2800 2801 return false; 2802 } 2803 2804 // Merge current load/store with previous load/store into ldp/stp. 2805 void MacroAssembler::merge_ldst(Register rt, 2806 const Address &adr, 2807 size_t cur_size_in_bytes, 2808 bool is_store) { 2809 2810 assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store) == true, "cur and prev must be able to be merged."); 2811 2812 Register rt_low, rt_high; 2813 address prev = pc() - NativeInstruction::instruction_size; 2814 NativeLdSt* prev_ldst = NativeLdSt_at(prev); 2815 2816 long offset; 2817 2818 if (adr.offset() < prev_ldst->offset()) { 2819 offset = adr.offset(); 2820 rt_low = rt; 2821 rt_high = prev_ldst->target(); 2822 } else { 2823 offset = prev_ldst->offset(); 2824 rt_low = prev_ldst->target(); 2825 rt_high = rt; 2826 } 2827 2828 Address adr_p = Address(prev_ldst->base(), offset); 2829 // Overwrite previous generated binary. 
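  // i.e. rewind the end of the code section to just before the
  // previous load/store; the ldp/stp emitted below then overwrites it,
  // and the current (not yet emitted) access is folded into the pair.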
  code_section()->set_end(prev);

  const int sz = prev_ldst->size_in_bytes();
  assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
  if (!is_store) {
    BLOCK_COMMENT("merged ldr pair");
    if (sz == 8) {
      ldp(rt_low, rt_high, adr_p);
    } else {
      ldpw(rt_low, rt_high, adr_p);
    }
  } else {
    BLOCK_COMMENT("merged str pair");
    if (sz == 8) {
      stp(rt_low, rt_high, adr_p);
    } else {
      stpw(rt_low, rt_high, adr_p);
    }
  }
}

/**
 * Multiply 64 bit by 64 bit first loop.
 */
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
                                           Register y, Register y_idx, Register z,
                                           Register carry, Register product,
                                           Register idx, Register kdx) {
  //
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;
  //

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_one_x);

  lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
  ldr(x_xstart, Address(rscratch1));
  ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian

  bind(L_first_loop);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_first_loop_exit);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_one_y);
  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(y_idx, Address(rscratch1));
  ror(y_idx, y_idx, 32); // convert big-endian to little-endian
  bind(L_multiply);

  // AArch64 has a multiply-accumulate instruction that we can't use
  // here because it has no way to process carries, so we have to use
  // separate add and adc instructions.  Bah.
  umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
  mul(product, x_xstart, y_idx);
  adds(product, product, carry);
  adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product

  subw(kdx, kdx, 2);
  ror(product, product, 32); // back to big-endian
  str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));

  b(L_first_loop);

  bind(L_one_y);
  ldrw(y_idx, Address(y, 0));
  b(L_multiply);

  bind(L_one_x);
  ldrw(x_xstart, Address(x, 0));
  b(L_first_loop);

  bind(L_first_loop_exit);
}

/**
 * Multiply 128 bit by 128 bit. Unrolled inner loop.
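 * (processes four 32-bit digits of y, i.e. two 64-bit limbs, per
 * iteration of the unrolled loop)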
2915 * 2916 */ 2917 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z, 2918 Register carry, Register carry2, 2919 Register idx, Register jdx, 2920 Register yz_idx1, Register yz_idx2, 2921 Register tmp, Register tmp3, Register tmp4, 2922 Register tmp6, Register product_hi) { 2923 2924 // jlong carry, x[], y[], z[]; 2925 // int kdx = ystart+1; 2926 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 2927 // huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry; 2928 // jlong carry2 = (jlong)(tmp3 >>> 64); 2929 // huge_128 tmp4 = (y[idx] * product_hi) + z[kdx+idx] + carry2; 2930 // carry = (jlong)(tmp4 >>> 64); 2931 // z[kdx+idx+1] = (jlong)tmp3; 2932 // z[kdx+idx] = (jlong)tmp4; 2933 // } 2934 // idx += 2; 2935 // if (idx > 0) { 2936 // yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry; 2937 // z[kdx+idx] = (jlong)yz_idx1; 2938 // carry = (jlong)(yz_idx1 >>> 64); 2939 // } 2940 // 2941 2942 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 2943 2944 lsrw(jdx, idx, 2); 2945 2946 bind(L_third_loop); 2947 2948 subsw(jdx, jdx, 1); 2949 br(Assembler::MI, L_third_loop_exit); 2950 subw(idx, idx, 4); 2951 2952 lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt))); 2953 2954 ldp(yz_idx2, yz_idx1, Address(rscratch1, 0)); 2955 2956 lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt))); 2957 2958 ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian 2959 ror(yz_idx2, yz_idx2, 32); 2960 2961 ldp(rscratch2, rscratch1, Address(tmp6, 0)); 2962 2963 mul(tmp3, product_hi, yz_idx1); // yz_idx1 * product_hi -> tmp4:tmp3 2964 umulh(tmp4, product_hi, yz_idx1); 2965 2966 ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian 2967 ror(rscratch2, rscratch2, 32); 2968 2969 mul(tmp, product_hi, yz_idx2); // yz_idx2 * product_hi -> carry2:tmp 2970 umulh(carry2, product_hi, yz_idx2); 2971 2972 // propagate sum of both multiplications into carry:tmp4:tmp3 2973 adds(tmp3, tmp3, carry); 2974 adc(tmp4, tmp4, zr); 2975 adds(tmp3, tmp3, rscratch1); 2976 adcs(tmp4, tmp4, tmp); 2977 adc(carry, carry2, zr); 2978 adds(tmp4, tmp4, rscratch2); 2979 adc(carry, carry, zr); 2980 2981 ror(tmp3, tmp3, 32); // convert little-endian to big-endian 2982 ror(tmp4, tmp4, 32); 2983 stp(tmp4, tmp3, Address(tmp6, 0)); 2984 2985 b(L_third_loop); 2986 bind (L_third_loop_exit); 2987 2988 andw (idx, idx, 0x3); 2989 cbz(idx, L_post_third_loop_done); 2990 2991 Label L_check_1; 2992 subsw(idx, idx, 2); 2993 br(Assembler::MI, L_check_1); 2994 2995 lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt))); 2996 ldr(yz_idx1, Address(rscratch1, 0)); 2997 ror(yz_idx1, yz_idx1, 32); 2998 mul(tmp3, product_hi, yz_idx1); // yz_idx1 * product_hi -> tmp4:tmp3 2999 umulh(tmp4, product_hi, yz_idx1); 3000 lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt))); 3001 ldr(yz_idx2, Address(rscratch1, 0)); 3002 ror(yz_idx2, yz_idx2, 32); 3003 3004 add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2); 3005 3006 ror(tmp3, tmp3, 32); 3007 str(tmp3, Address(rscratch1, 0)); 3008 3009 bind (L_check_1); 3010 3011 andw (idx, idx, 0x1); 3012 subsw(idx, idx, 1); 3013 br(Assembler::MI, L_post_third_loop_done); 3014 ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt))); 3015 mul(tmp3, tmp4, product_hi); // tmp4 * product_hi -> carry2:tmp3 3016 umulh(carry2, tmp4, product_hi); 3017 ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt))); 3018 3019 add2_with_carry(carry2, tmp3, tmp4, carry); 3020 3021 strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt))); 3022 
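  // The extr below forms the next 64-bit carry from the concatenation
  // carry2:tmp3 shifted right by 32, i.e. everything above the 32 bits
  // just stored.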
extr(carry, carry2, tmp3, 32);

  bind(L_post_third_loop_done);
}

/**
 * Code for BigInteger::multiplyToLen() intrinsic.
 *
 * r0: x
 * r1: xlen
 * r2: y
 * r3: ylen
 * r4: z
 * r5: zlen
 * r10: tmp1
 * r11: tmp2
 * r12: tmp3
 * r13: tmp4
 * r14: tmp5
 * r15: tmp6
 * r16: tmp7
 *
 */
void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6, Register product_hi) {

  assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product  = xlen;
  const Register x_xstart = zlen;  // reuse register

  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;
  //

  movw(idx, ylen);      // idx = ylen;
  movw(kdx, zlen);      // kdx = xlen+ylen;
  mov(carry, zr);       // carry = 0;

  Label L_done;

  movw(xstart, xlen);
  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_done);

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);

  Label L_second_loop;
  cbzw(kdx, L_second_loop);

  Label L_carry;
  subw(kdx, kdx, 1);
  cbzw(kdx, L_carry);

  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
  lsr(carry, carry, 32);
  subw(kdx, kdx, 1);

  bind(L_carry);
  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));

  // Second and third (nested) loops.
  //
  // for (int i = xstart-1; i >= 0; i--) { // Second loop
  //   carry = 0;
  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                    (z[k] & LONG_MASK) + carry;
  //     z[k] = (int)product;
  //     carry = product >>> 32;
  //   }
  //   z[i] = (int)carry;
  // }
  //
  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi

  const Register jdx = tmp1;

  bind(L_second_loop);
  mov(carry, zr);                // carry = 0;
  movw(jdx, ylen);               // j = ystart+1

  subsw(xstart, xstart, 1);      // i = xstart-1;
  br(Assembler::MI, L_done);

  str(z, Address(pre(sp, -4 * wordSize)));

  Label L_last_x;
  lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
  subsw(xstart, xstart, 1);      // i = xstart-1;
  br(Assembler::MI, L_last_x);

  lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
  ldr(product_hi, Address(rscratch1));
  ror(product_hi, product_hi, 32);  // convert big-endian to little-endian

  Label L_third_loop_prologue;
  bind(L_third_loop_prologue);

  str(ylen, Address(sp, wordSize));
  stp(x, xstart, Address(sp, 2 * wordSize));
  multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
                          tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
  ldp(z, ylen, Address(post(sp, 2 * wordSize)));
  ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen

  addw(tmp3, xlen, 1);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  subsw(tmp3, tmp3, 1);
  br(Assembler::MI, L_done);

  lsr(carry, carry, 32);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  b(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  ldrw(product_hi, Address(x, 0));
  b(L_third_loop_prologue);

  bind(L_done);
}

// Code for BigInteger::mulAdd intrinsic
// out     = r0
// in      = r1
// offset  = r2  (already out.length-offset)
// len     = r3
// k       = r4
//
// pseudo code from java implementation:
// carry = 0;
// offset = out.length-offset - 1;
// for (int j=len-1; j >= 0; j--) {
//    product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
//    out[offset--] = (int)product;
//    carry = product >>> 32;
// }
// return (int)carry;
void MacroAssembler::mul_add(Register out, Register in, Register offset,
                             Register len, Register k) {
  Label LOOP, END;
  // pre-loop
  cmp(len, zr); // cmp, not cbz/cbnz: to use condition twice => less branches
  csel(out, zr, out, Assembler::EQ);
  br(Assembler::EQ, END);
  add(in, in, len, LSL, 2); // in[j+1] address
  add(offset, out, offset, LSL, 2); // out[offset + 1] address
  mov(out, zr); // used to keep carry now
  BIND(LOOP);
  ldrw(rscratch1, Address(pre(in, -4)));
  madd(rscratch1, rscratch1, k, out);
  ldrw(rscratch2, Address(pre(offset, -4)));
  add(rscratch1, rscratch1, rscratch2);
  strw(rscratch1, Address(offset));
  lsr(out, rscratch1, 32);
  subs(len, len, 1);
  br(Assembler::NE, LOOP);
  BIND(END);
}

/**
 * Emits code to update CRC-32 with a byte value according to constants in table
 *
 * @param [in,out]crc   Register containing the crc.
3205 * @param [in]val Register containing the byte to fold into the CRC. 3206 * @param [in]table Register containing the table of crc constants. 3207 * 3208 * uint32_t crc; 3209 * val = crc_table[(val ^ crc) & 0xFF]; 3210 * crc = val ^ (crc >> 8); 3211 * 3212 */ 3213 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 3214 eor(val, val, crc); 3215 andr(val, val, 0xff); 3216 ldrw(val, Address(table, val, Address::lsl(2))); 3217 eor(crc, val, crc, Assembler::LSR, 8); 3218 } 3219 3220 /** 3221 * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3 3222 * 3223 * @param [in,out]crc Register containing the crc. 3224 * @param [in]v Register containing the 32-bit to fold into the CRC. 3225 * @param [in]table0 Register containing table 0 of crc constants. 3226 * @param [in]table1 Register containing table 1 of crc constants. 3227 * @param [in]table2 Register containing table 2 of crc constants. 3228 * @param [in]table3 Register containing table 3 of crc constants. 3229 * 3230 * uint32_t crc; 3231 * v = crc ^ v 3232 * crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24] 3233 * 3234 */ 3235 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp, 3236 Register table0, Register table1, Register table2, Register table3, 3237 bool upper) { 3238 eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0); 3239 uxtb(tmp, v); 3240 ldrw(crc, Address(table3, tmp, Address::lsl(2))); 3241 ubfx(tmp, v, 8, 8); 3242 ldrw(tmp, Address(table2, tmp, Address::lsl(2))); 3243 eor(crc, crc, tmp); 3244 ubfx(tmp, v, 16, 8); 3245 ldrw(tmp, Address(table1, tmp, Address::lsl(2))); 3246 eor(crc, crc, tmp); 3247 ubfx(tmp, v, 24, 8); 3248 ldrw(tmp, Address(table0, tmp, Address::lsl(2))); 3249 eor(crc, crc, tmp); 3250 } 3251 3252 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf, 3253 Register len, Register tmp0, Register tmp1, Register tmp2, 3254 Register tmp3) { 3255 Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit; 3256 assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3); 3257 3258 mvnw(crc, crc); 3259 3260 subs(len, len, 128); 3261 br(Assembler::GE, CRC_by64_pre); 3262 BIND(CRC_less64); 3263 adds(len, len, 128-32); 3264 br(Assembler::GE, CRC_by32_loop); 3265 BIND(CRC_less32); 3266 adds(len, len, 32-4); 3267 br(Assembler::GE, CRC_by4_loop); 3268 adds(len, len, 4); 3269 br(Assembler::GT, CRC_by1_loop); 3270 b(L_exit); 3271 3272 BIND(CRC_by32_loop); 3273 ldp(tmp0, tmp1, Address(post(buf, 16))); 3274 subs(len, len, 32); 3275 crc32x(crc, crc, tmp0); 3276 ldr(tmp2, Address(post(buf, 8))); 3277 crc32x(crc, crc, tmp1); 3278 ldr(tmp3, Address(post(buf, 8))); 3279 crc32x(crc, crc, tmp2); 3280 crc32x(crc, crc, tmp3); 3281 br(Assembler::GE, CRC_by32_loop); 3282 cmn(len, 32); 3283 br(Assembler::NE, CRC_less32); 3284 b(L_exit); 3285 3286 BIND(CRC_by4_loop); 3287 ldrw(tmp0, Address(post(buf, 4))); 3288 subs(len, len, 4); 3289 crc32w(crc, crc, tmp0); 3290 br(Assembler::GE, CRC_by4_loop); 3291 adds(len, len, 4); 3292 br(Assembler::LE, L_exit); 3293 BIND(CRC_by1_loop); 3294 ldrb(tmp0, Address(post(buf, 1))); 3295 subs(len, len, 1); 3296 crc32b(crc, crc, tmp0); 3297 br(Assembler::GT, CRC_by1_loop); 3298 b(L_exit); 3299 3300 BIND(CRC_by64_pre); 3301 sub(buf, buf, 8); 3302 ldp(tmp0, tmp1, Address(buf, 8)); 3303 crc32x(crc, crc, tmp0); 3304 ldr(tmp2, Address(buf, 24)); 3305 crc32x(crc, crc, tmp1); 3306 ldr(tmp3, Address(buf, 32)); 3307 crc32x(crc, crc, tmp2); 3308 
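  // n.b. the loads below are interleaved with the crc32x instructions
  // (a small software pipeline): each ldr fetches the next 8-byte
  // chunk while the previous one is folded into crc.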
ldr(tmp0, Address(buf, 40)); 3309 crc32x(crc, crc, tmp3); 3310 ldr(tmp1, Address(buf, 48)); 3311 crc32x(crc, crc, tmp0); 3312 ldr(tmp2, Address(buf, 56)); 3313 crc32x(crc, crc, tmp1); 3314 ldr(tmp3, Address(pre(buf, 64))); 3315 3316 b(CRC_by64_loop); 3317 3318 align(CodeEntryAlignment); 3319 BIND(CRC_by64_loop); 3320 subs(len, len, 64); 3321 crc32x(crc, crc, tmp2); 3322 ldr(tmp0, Address(buf, 8)); 3323 crc32x(crc, crc, tmp3); 3324 ldr(tmp1, Address(buf, 16)); 3325 crc32x(crc, crc, tmp0); 3326 ldr(tmp2, Address(buf, 24)); 3327 crc32x(crc, crc, tmp1); 3328 ldr(tmp3, Address(buf, 32)); 3329 crc32x(crc, crc, tmp2); 3330 ldr(tmp0, Address(buf, 40)); 3331 crc32x(crc, crc, tmp3); 3332 ldr(tmp1, Address(buf, 48)); 3333 crc32x(crc, crc, tmp0); 3334 ldr(tmp2, Address(buf, 56)); 3335 crc32x(crc, crc, tmp1); 3336 ldr(tmp3, Address(pre(buf, 64))); 3337 br(Assembler::GE, CRC_by64_loop); 3338 3339 // post-loop 3340 crc32x(crc, crc, tmp2); 3341 crc32x(crc, crc, tmp3); 3342 3343 sub(len, len, 64); 3344 add(buf, buf, 8); 3345 cmn(len, 128); 3346 br(Assembler::NE, CRC_less64); 3347 BIND(L_exit); 3348 mvnw(crc, crc); 3349 } 3350 3351 /** 3352 * @param crc register containing existing CRC (32-bit) 3353 * @param buf register pointing to input byte buffer (byte*) 3354 * @param len register containing number of bytes 3355 * @param table register that will contain address of CRC table 3356 * @param tmp scratch register 3357 */ 3358 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, 3359 Register table0, Register table1, Register table2, Register table3, 3360 Register tmp, Register tmp2, Register tmp3) { 3361 Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit; 3362 unsigned long offset; 3363 3364 if (UseCRC32) { 3365 kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3); 3366 return; 3367 } 3368 3369 mvnw(crc, crc); 3370 3371 adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset); 3372 if (offset) add(table0, table0, offset); 3373 add(table1, table0, 1*256*sizeof(juint)); 3374 add(table2, table0, 2*256*sizeof(juint)); 3375 add(table3, table0, 3*256*sizeof(juint)); 3376 3377 if (UseNeon) { 3378 cmp(len, (u1)64); 3379 br(Assembler::LT, L_by16); 3380 eor(v16, T16B, v16, v16); 3381 3382 Label L_fold; 3383 3384 add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants 3385 3386 ld1(v0, v1, T2D, post(buf, 32)); 3387 ld1r(v4, T2D, post(tmp, 8)); 3388 ld1r(v5, T2D, post(tmp, 8)); 3389 ld1r(v6, T2D, post(tmp, 8)); 3390 ld1r(v7, T2D, post(tmp, 8)); 3391 mov(v16, T4S, 0, crc); 3392 3393 eor(v0, T16B, v0, v16); 3394 sub(len, len, 64); 3395 3396 BIND(L_fold); 3397 pmull(v22, T8H, v0, v5, T8B); 3398 pmull(v20, T8H, v0, v7, T8B); 3399 pmull(v23, T8H, v0, v4, T8B); 3400 pmull(v21, T8H, v0, v6, T8B); 3401 3402 pmull2(v18, T8H, v0, v5, T16B); 3403 pmull2(v16, T8H, v0, v7, T16B); 3404 pmull2(v19, T8H, v0, v4, T16B); 3405 pmull2(v17, T8H, v0, v6, T16B); 3406 3407 uzp1(v24, T8H, v20, v22); 3408 uzp2(v25, T8H, v20, v22); 3409 eor(v20, T16B, v24, v25); 3410 3411 uzp1(v26, T8H, v16, v18); 3412 uzp2(v27, T8H, v16, v18); 3413 eor(v16, T16B, v26, v27); 3414 3415 ushll2(v22, T4S, v20, T8H, 8); 3416 ushll(v20, T4S, v20, T4H, 8); 3417 3418 ushll2(v18, T4S, v16, T8H, 8); 3419 ushll(v16, T4S, v16, T4H, 8); 3420 3421 eor(v22, T16B, v23, v22); 3422 eor(v18, T16B, v19, v18); 3423 eor(v20, T16B, v21, v20); 3424 eor(v16, T16B, v17, v16); 3425 3426 uzp1(v17, T2D, v16, v20); 3427 uzp2(v21, T2D, v16, v20); 3428 eor(v17, T16B, v17, v21); 3429 3430 ushll2(v20, T2D, 
v17, T4S, 16); 3431 ushll(v16, T2D, v17, T2S, 16); 3432 3433 eor(v20, T16B, v20, v22); 3434 eor(v16, T16B, v16, v18); 3435 3436 uzp1(v17, T2D, v20, v16); 3437 uzp2(v21, T2D, v20, v16); 3438 eor(v28, T16B, v17, v21); 3439 3440 pmull(v22, T8H, v1, v5, T8B); 3441 pmull(v20, T8H, v1, v7, T8B); 3442 pmull(v23, T8H, v1, v4, T8B); 3443 pmull(v21, T8H, v1, v6, T8B); 3444 3445 pmull2(v18, T8H, v1, v5, T16B); 3446 pmull2(v16, T8H, v1, v7, T16B); 3447 pmull2(v19, T8H, v1, v4, T16B); 3448 pmull2(v17, T8H, v1, v6, T16B); 3449 3450 ld1(v0, v1, T2D, post(buf, 32)); 3451 3452 uzp1(v24, T8H, v20, v22); 3453 uzp2(v25, T8H, v20, v22); 3454 eor(v20, T16B, v24, v25); 3455 3456 uzp1(v26, T8H, v16, v18); 3457 uzp2(v27, T8H, v16, v18); 3458 eor(v16, T16B, v26, v27); 3459 3460 ushll2(v22, T4S, v20, T8H, 8); 3461 ushll(v20, T4S, v20, T4H, 8); 3462 3463 ushll2(v18, T4S, v16, T8H, 8); 3464 ushll(v16, T4S, v16, T4H, 8); 3465 3466 eor(v22, T16B, v23, v22); 3467 eor(v18, T16B, v19, v18); 3468 eor(v20, T16B, v21, v20); 3469 eor(v16, T16B, v17, v16); 3470 3471 uzp1(v17, T2D, v16, v20); 3472 uzp2(v21, T2D, v16, v20); 3473 eor(v16, T16B, v17, v21); 3474 3475 ushll2(v20, T2D, v16, T4S, 16); 3476 ushll(v16, T2D, v16, T2S, 16); 3477 3478 eor(v20, T16B, v22, v20); 3479 eor(v16, T16B, v16, v18); 3480 3481 uzp1(v17, T2D, v20, v16); 3482 uzp2(v21, T2D, v20, v16); 3483 eor(v20, T16B, v17, v21); 3484 3485 shl(v16, T2D, v28, 1); 3486 shl(v17, T2D, v20, 1); 3487 3488 eor(v0, T16B, v0, v16); 3489 eor(v1, T16B, v1, v17); 3490 3491 subs(len, len, 32); 3492 br(Assembler::GE, L_fold); 3493 3494 mov(crc, 0); 3495 mov(tmp, v0, T1D, 0); 3496 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3497 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3498 mov(tmp, v0, T1D, 1); 3499 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3500 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3501 mov(tmp, v1, T1D, 0); 3502 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3503 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3504 mov(tmp, v1, T1D, 1); 3505 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3506 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3507 3508 add(len, len, 32); 3509 } 3510 3511 BIND(L_by16); 3512 subs(len, len, 16); 3513 br(Assembler::GE, L_by16_loop); 3514 adds(len, len, 16-4); 3515 br(Assembler::GE, L_by4_loop); 3516 adds(len, len, 4); 3517 br(Assembler::GT, L_by1_loop); 3518 b(L_exit); 3519 3520 BIND(L_by4_loop); 3521 ldrw(tmp, Address(post(buf, 4))); 3522 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3); 3523 subs(len, len, 4); 3524 br(Assembler::GE, L_by4_loop); 3525 adds(len, len, 4); 3526 br(Assembler::LE, L_exit); 3527 BIND(L_by1_loop); 3528 subs(len, len, 1); 3529 ldrb(tmp, Address(post(buf, 1))); 3530 update_byte_crc32(crc, tmp, table0); 3531 br(Assembler::GT, L_by1_loop); 3532 b(L_exit); 3533 3534 align(CodeEntryAlignment); 3535 BIND(L_by16_loop); 3536 subs(len, len, 16); 3537 ldp(tmp, tmp3, Address(post(buf, 16))); 3538 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3539 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3540 update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false); 3541 update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true); 3542 br(Assembler::GE, L_by16_loop); 3543 adds(len, len, 16-4); 3544 br(Assembler::GE, 
L_by4_loop); 3545 adds(len, len, 4); 3546 br(Assembler::GT, L_by1_loop); 3547 BIND(L_exit); 3548 mvnw(crc, crc); 3549 } 3550 3551 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf, 3552 Register len, Register tmp0, Register tmp1, Register tmp2, 3553 Register tmp3) { 3554 Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit; 3555 assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3); 3556 3557 subs(len, len, 128); 3558 br(Assembler::GE, CRC_by64_pre); 3559 BIND(CRC_less64); 3560 adds(len, len, 128-32); 3561 br(Assembler::GE, CRC_by32_loop); 3562 BIND(CRC_less32); 3563 adds(len, len, 32-4); 3564 br(Assembler::GE, CRC_by4_loop); 3565 adds(len, len, 4); 3566 br(Assembler::GT, CRC_by1_loop); 3567 b(L_exit); 3568 3569 BIND(CRC_by32_loop); 3570 ldp(tmp0, tmp1, Address(post(buf, 16))); 3571 subs(len, len, 32); 3572 crc32cx(crc, crc, tmp0); 3573 ldr(tmp2, Address(post(buf, 8))); 3574 crc32cx(crc, crc, tmp1); 3575 ldr(tmp3, Address(post(buf, 8))); 3576 crc32cx(crc, crc, tmp2); 3577 crc32cx(crc, crc, tmp3); 3578 br(Assembler::GE, CRC_by32_loop); 3579 cmn(len, 32); 3580 br(Assembler::NE, CRC_less32); 3581 b(L_exit); 3582 3583 BIND(CRC_by4_loop); 3584 ldrw(tmp0, Address(post(buf, 4))); 3585 subs(len, len, 4); 3586 crc32cw(crc, crc, tmp0); 3587 br(Assembler::GE, CRC_by4_loop); 3588 adds(len, len, 4); 3589 br(Assembler::LE, L_exit); 3590 BIND(CRC_by1_loop); 3591 ldrb(tmp0, Address(post(buf, 1))); 3592 subs(len, len, 1); 3593 crc32cb(crc, crc, tmp0); 3594 br(Assembler::GT, CRC_by1_loop); 3595 b(L_exit); 3596 3597 BIND(CRC_by64_pre); 3598 sub(buf, buf, 8); 3599 ldp(tmp0, tmp1, Address(buf, 8)); 3600 crc32cx(crc, crc, tmp0); 3601 ldr(tmp2, Address(buf, 24)); 3602 crc32cx(crc, crc, tmp1); 3603 ldr(tmp3, Address(buf, 32)); 3604 crc32cx(crc, crc, tmp2); 3605 ldr(tmp0, Address(buf, 40)); 3606 crc32cx(crc, crc, tmp3); 3607 ldr(tmp1, Address(buf, 48)); 3608 crc32cx(crc, crc, tmp0); 3609 ldr(tmp2, Address(buf, 56)); 3610 crc32cx(crc, crc, tmp1); 3611 ldr(tmp3, Address(pre(buf, 64))); 3612 3613 b(CRC_by64_loop); 3614 3615 align(CodeEntryAlignment); 3616 BIND(CRC_by64_loop); 3617 subs(len, len, 64); 3618 crc32cx(crc, crc, tmp2); 3619 ldr(tmp0, Address(buf, 8)); 3620 crc32cx(crc, crc, tmp3); 3621 ldr(tmp1, Address(buf, 16)); 3622 crc32cx(crc, crc, tmp0); 3623 ldr(tmp2, Address(buf, 24)); 3624 crc32cx(crc, crc, tmp1); 3625 ldr(tmp3, Address(buf, 32)); 3626 crc32cx(crc, crc, tmp2); 3627 ldr(tmp0, Address(buf, 40)); 3628 crc32cx(crc, crc, tmp3); 3629 ldr(tmp1, Address(buf, 48)); 3630 crc32cx(crc, crc, tmp0); 3631 ldr(tmp2, Address(buf, 56)); 3632 crc32cx(crc, crc, tmp1); 3633 ldr(tmp3, Address(pre(buf, 64))); 3634 br(Assembler::GE, CRC_by64_loop); 3635 3636 // post-loop 3637 crc32cx(crc, crc, tmp2); 3638 crc32cx(crc, crc, tmp3); 3639 3640 sub(len, len, 64); 3641 add(buf, buf, 8); 3642 cmn(len, 128); 3643 br(Assembler::NE, CRC_less64); 3644 BIND(L_exit); 3645 } 3646 3647 /** 3648 * @param crc register containing existing CRC (32-bit) 3649 * @param buf register pointing to input byte buffer (byte*) 3650 * @param len register containing number of bytes 3651 * @param table register that will contain address of CRC table 3652 * @param tmp scratch register 3653 */ 3654 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len, 3655 Register table0, Register table1, Register table2, Register table3, 3656 Register tmp, Register tmp2, Register tmp3) { 3657 kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, 
table2, table3); 3658 } 3659 3660 3661 SkipIfEqual::SkipIfEqual( 3662 MacroAssembler* masm, const bool* flag_addr, bool value) { 3663 _masm = masm; 3664 unsigned long offset; 3665 _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset); 3666 _masm->ldrb(rscratch1, Address(rscratch1, offset)); 3667 _masm->cbzw(rscratch1, _label); 3668 } 3669 3670 SkipIfEqual::~SkipIfEqual() { 3671 _masm->bind(_label); 3672 } 3673 3674 void MacroAssembler::addptr(const Address &dst, int32_t src) { 3675 Address adr; 3676 switch(dst.getMode()) { 3677 case Address::base_plus_offset: 3678 // This is the expected mode, although we allow all the other 3679 // forms below. 3680 adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord); 3681 break; 3682 default: 3683 lea(rscratch2, dst); 3684 adr = Address(rscratch2); 3685 break; 3686 } 3687 ldr(rscratch1, adr); 3688 add(rscratch1, rscratch1, src); 3689 str(rscratch1, adr); 3690 } 3691 3692 void MacroAssembler::cmpptr(Register src1, Address src2) { 3693 unsigned long offset; 3694 adrp(rscratch1, src2, offset); 3695 ldr(rscratch1, Address(rscratch1, offset)); 3696 cmp(src1, rscratch1); 3697 } 3698 3699 void MacroAssembler::cmpoop(Register obj1, Register obj2) { 3700 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3701 bs->obj_equals(this, obj1, obj2); 3702 } 3703 3704 void MacroAssembler::load_method_holder(Register holder, Register method) { 3705 ldr(holder, Address(method, Method::const_offset())); // ConstMethod* 3706 ldr(holder, Address(holder, ConstMethod::constants_offset())); // ConstantPool* 3707 ldr(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass* 3708 } 3709 3710 void MacroAssembler::load_klass(Register dst, Register src) { 3711 if (UseCompressedClassPointers) { 3712 ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes())); 3713 decode_klass_not_null(dst); 3714 } else { 3715 ldr(dst, Address(src, oopDesc::klass_offset_in_bytes())); 3716 } 3717 } 3718 3719 // ((OopHandle)result).resolve(); 3720 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) { 3721 // OopHandle::resolve is an indirection. 
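  // Conceptually (a sketch, not the emitted code; the load below is routed
  // through the GC barrier set so that a collector can intercept it):
  //   oop resolve(oop* handle) { return *handle; }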
3722   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
3723 }
3724
3725 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
3726   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
3727   ldr(dst, Address(rmethod, Method::const_offset()));
3728   ldr(dst, Address(dst, ConstMethod::constants_offset()));
3729   ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
3730   ldr(dst, Address(dst, mirror_offset));
3731   resolve_oop_handle(dst, tmp);
3732 }
3733
3734 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
3735   if (UseCompressedClassPointers) {
3736     ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3737     if (CompressedKlassPointers::base() == NULL) {
3738       cmp(trial_klass, tmp, LSL, CompressedKlassPointers::shift());
3739       return;
3740     } else if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
3741                && CompressedKlassPointers::shift() == 0) {
3742       // Only the bottom 32 bits matter
3743       cmpw(trial_klass, tmp);
3744       return;
3745     }
3746     decode_klass_not_null(tmp);
3747   } else {
3748     ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3749   }
3750   cmp(trial_klass, tmp);
3751 }
3752
3753 void MacroAssembler::load_prototype_header(Register dst, Register src) {
3754   load_klass(dst, src);
3755   ldr(dst, Address(dst, Klass::prototype_header_offset()));
3756 }
3757
3758 void MacroAssembler::store_klass(Register dst, Register src) {
3759   // FIXME: Should this be a store release? Concurrent GCs assume the
3760   // klass length is valid if the klass field is not null.
3761   if (UseCompressedClassPointers) {
3762     encode_klass_not_null(src);
3763     strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3764   } else {
3765     str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3766   }
3767 }
3768
3769 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3770   if (UseCompressedClassPointers) {
3771     // Store to the klass gap in the destination
3772     strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3773   }
3774 }
3775
3776 // Algorithm must match CompressedOops::encode.
3777 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3778 #ifdef ASSERT
3779   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
3780 #endif
3781   verify_oop(s, "broken oop in encode_heap_oop");
3782   if (CompressedOops::base() == NULL) {
3783     if (CompressedOops::shift() != 0) {
3784       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3785       lsr(d, s, LogMinObjAlignmentInBytes);
3786     } else {
3787       mov(d, s);
3788     }
3789   } else {
3790     subs(d, s, rheapbase);
3791     csel(d, d, zr, Assembler::HS);
3792     lsr(d, d, LogMinObjAlignmentInBytes);
3793
3794     /* Old algorithm: is this any worse?
3795 Label nonnull; 3796 cbnz(r, nonnull); 3797 sub(r, r, rheapbase); 3798 bind(nonnull); 3799 lsr(r, r, LogMinObjAlignmentInBytes); 3800 */ 3801 } 3802 } 3803 3804 void MacroAssembler::encode_heap_oop_not_null(Register r) { 3805 #ifdef ASSERT 3806 verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?"); 3807 if (CheckCompressedOops) { 3808 Label ok; 3809 cbnz(r, ok); 3810 stop("null oop passed to encode_heap_oop_not_null"); 3811 bind(ok); 3812 } 3813 #endif 3814 verify_oop(r, "broken oop in encode_heap_oop_not_null"); 3815 if (CompressedOops::base() != NULL) { 3816 sub(r, r, rheapbase); 3817 } 3818 if (CompressedOops::shift() != 0) { 3819 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 3820 lsr(r, r, LogMinObjAlignmentInBytes); 3821 } 3822 } 3823 3824 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) { 3825 #ifdef ASSERT 3826 verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?"); 3827 if (CheckCompressedOops) { 3828 Label ok; 3829 cbnz(src, ok); 3830 stop("null oop passed to encode_heap_oop_not_null2"); 3831 bind(ok); 3832 } 3833 #endif 3834 verify_oop(src, "broken oop in encode_heap_oop_not_null2"); 3835 3836 Register data = src; 3837 if (CompressedOops::base() != NULL) { 3838 sub(dst, src, rheapbase); 3839 data = dst; 3840 } 3841 if (CompressedOops::shift() != 0) { 3842 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 3843 lsr(dst, data, LogMinObjAlignmentInBytes); 3844 data = dst; 3845 } 3846 if (data == src) 3847 mov(dst, src); 3848 } 3849 3850 void MacroAssembler::decode_heap_oop(Register d, Register s) { 3851 #ifdef ASSERT 3852 verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?"); 3853 #endif 3854 if (CompressedOops::base() == NULL) { 3855 if (CompressedOops::shift() != 0 || d != s) { 3856 lsl(d, s, CompressedOops::shift()); 3857 } 3858 } else { 3859 Label done; 3860 if (d != s) 3861 mov(d, s); 3862 cbz(s, done); 3863 add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes); 3864 bind(done); 3865 } 3866 verify_oop(d, "broken oop in decode_heap_oop"); 3867 } 3868 3869 void MacroAssembler::decode_heap_oop_not_null(Register r) { 3870 assert (UseCompressedOops, "should only be used for compressed headers"); 3871 assert (Universe::heap() != NULL, "java heap should be initialized"); 3872 // Cannot assert, unverified entry point counts instructions (see .ad file) 3873 // vtableStubs also counts instructions in pd_code_size_limit. 3874 // Also do not verify_oop as this is called by verify_oop. 3875 if (CompressedOops::shift() != 0) { 3876 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 3877 if (CompressedOops::base() != NULL) { 3878 add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes); 3879 } else { 3880 add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes); 3881 } 3882 } else { 3883 assert (CompressedOops::base() == NULL, "sanity"); 3884 } 3885 } 3886 3887 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) { 3888 assert (UseCompressedOops, "should only be used for compressed headers"); 3889 assert (Universe::heap() != NULL, "java heap should be initialized"); 3890 // Cannot assert, unverified entry point counts instructions (see .ad file) 3891 // vtableStubs also counts instructions in pd_code_size_limit. 3892 // Also do not verify_oop as this is called by verify_oop. 
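  // What is emitted boils down to (a sketch; base and shift are fixed at
  // VM startup):
  //   dst = (uint64_t)CompressedOops::base() + ((uint64_t)src << CompressedOops::shift());
  // with a NULL base degenerating to a plain shift, or to a register move
  // when the shift is zero as well.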
3893 if (CompressedOops::shift() != 0) { 3894 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 3895 if (CompressedOops::base() != NULL) { 3896 add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes); 3897 } else { 3898 add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes); 3899 } 3900 } else { 3901 assert (CompressedOops::base() == NULL, "sanity"); 3902 if (dst != src) { 3903 mov(dst, src); 3904 } 3905 } 3906 } 3907 3908 MacroAssembler::KlassDecodeMode MacroAssembler::_klass_decode_mode(KlassDecodeNone); 3909 3910 MacroAssembler::KlassDecodeMode MacroAssembler::klass_decode_mode() { 3911 assert(UseCompressedClassPointers, "not using compressed class pointers"); 3912 assert(Metaspace::initialized(), "metaspace not initialized yet"); 3913 3914 if (_klass_decode_mode != KlassDecodeNone) { 3915 return _klass_decode_mode; 3916 } 3917 3918 assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift() 3919 || 0 == CompressedKlassPointers::shift(), "decode alg wrong"); 3920 3921 if (CompressedKlassPointers::base() == NULL) { 3922 return (_klass_decode_mode = KlassDecodeZero); 3923 } 3924 3925 if (operand_valid_for_logical_immediate( 3926 /*is32*/false, (uint64_t)CompressedKlassPointers::base())) { 3927 const uint64_t range_mask = 3928 (1UL << log2_intptr(CompressedKlassPointers::range())) - 1; 3929 if (((uint64_t)CompressedKlassPointers::base() & range_mask) == 0) { 3930 return (_klass_decode_mode = KlassDecodeXor); 3931 } 3932 } 3933 3934 const uint64_t shifted_base = 3935 (uint64_t)CompressedKlassPointers::base() >> CompressedKlassPointers::shift(); 3936 guarantee((shifted_base & 0xffff0000ffffffff) == 0, 3937 "compressed class base bad alignment"); 3938 3939 return (_klass_decode_mode = KlassDecodeMovk); 3940 } 3941 3942 void MacroAssembler::encode_klass_not_null(Register dst, Register src) { 3943 switch (klass_decode_mode()) { 3944 case KlassDecodeZero: 3945 if (CompressedKlassPointers::shift() != 0) { 3946 lsr(dst, src, LogKlassAlignmentInBytes); 3947 } else { 3948 if (dst != src) mov(dst, src); 3949 } 3950 break; 3951 3952 case KlassDecodeXor: 3953 if (CompressedKlassPointers::shift() != 0) { 3954 eor(dst, src, (uint64_t)CompressedKlassPointers::base()); 3955 lsr(dst, dst, LogKlassAlignmentInBytes); 3956 } else { 3957 eor(dst, src, (uint64_t)CompressedKlassPointers::base()); 3958 } 3959 break; 3960 3961 case KlassDecodeMovk: 3962 if (CompressedKlassPointers::shift() != 0) { 3963 ubfx(dst, src, LogKlassAlignmentInBytes, 32); 3964 } else { 3965 movw(dst, src); 3966 } 3967 break; 3968 3969 case KlassDecodeNone: 3970 ShouldNotReachHere(); 3971 break; 3972 } 3973 } 3974 3975 void MacroAssembler::encode_klass_not_null(Register r) { 3976 encode_klass_not_null(r, r); 3977 } 3978 3979 void MacroAssembler::decode_klass_not_null(Register dst, Register src) { 3980 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 3981 3982 switch (klass_decode_mode()) { 3983 case KlassDecodeZero: 3984 if (CompressedKlassPointers::shift() != 0) { 3985 lsl(dst, src, LogKlassAlignmentInBytes); 3986 } else { 3987 if (dst != src) mov(dst, src); 3988 } 3989 break; 3990 3991 case KlassDecodeXor: 3992 if (CompressedKlassPointers::shift() != 0) { 3993 lsl(dst, src, LogKlassAlignmentInBytes); 3994 eor(dst, dst, (uint64_t)CompressedKlassPointers::base()); 3995 } else { 3996 eor(dst, src, (uint64_t)CompressedKlassPointers::base()); 3997 } 3998 break; 3999 4000 case KlassDecodeMovk: { 4001 const uint64_t shifted_base = 4002 
(uint64_t)CompressedKlassPointers::base() >> CompressedKlassPointers::shift(); 4003 4004 if (dst != src) movw(dst, src); 4005 movk(dst, shifted_base >> 32, 32); 4006 4007 if (CompressedKlassPointers::shift() != 0) { 4008 lsl(dst, dst, LogKlassAlignmentInBytes); 4009 } 4010 4011 break; 4012 } 4013 4014 case KlassDecodeNone: 4015 ShouldNotReachHere(); 4016 break; 4017 } 4018 } 4019 4020 void MacroAssembler::decode_klass_not_null(Register r) { 4021 decode_klass_not_null(r, r); 4022 } 4023 4024 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) { 4025 #ifdef ASSERT 4026 { 4027 ThreadInVMfromUnknown tiv; 4028 assert (UseCompressedOops, "should only be used for compressed oops"); 4029 assert (Universe::heap() != NULL, "java heap should be initialized"); 4030 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 4031 assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop"); 4032 } 4033 #endif 4034 int oop_index = oop_recorder()->find_index(obj); 4035 InstructionMark im(this); 4036 RelocationHolder rspec = oop_Relocation::spec(oop_index); 4037 code_section()->relocate(inst_mark(), rspec); 4038 movz(dst, 0xDEAD, 16); 4039 movk(dst, 0xBEEF); 4040 } 4041 4042 void MacroAssembler::set_narrow_klass(Register dst, Klass* k) { 4043 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 4044 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 4045 int index = oop_recorder()->find_index(k); 4046 assert(! Universe::heap()->is_in(k), "should not be an oop"); 4047 4048 InstructionMark im(this); 4049 RelocationHolder rspec = metadata_Relocation::spec(index); 4050 code_section()->relocate(inst_mark(), rspec); 4051 narrowKlass nk = CompressedKlassPointers::encode(k); 4052 movz(dst, (nk >> 16), 16); 4053 movk(dst, nk & 0xffff); 4054 } 4055 4056 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, 4057 Register dst, Address src, 4058 Register tmp1, Register thread_tmp) { 4059 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 4060 decorators = AccessInternal::decorator_fixup(decorators); 4061 bool as_raw = (decorators & AS_RAW) != 0; 4062 if (as_raw) { 4063 bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp); 4064 } else { 4065 bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp); 4066 } 4067 } 4068 4069 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, 4070 Address dst, Register src, 4071 Register tmp1, Register thread_tmp) { 4072 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 4073 decorators = AccessInternal::decorator_fixup(decorators); 4074 bool as_raw = (decorators & AS_RAW) != 0; 4075 if (as_raw) { 4076 bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp); 4077 } else { 4078 bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp); 4079 } 4080 } 4081 4082 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) { 4083 // Use stronger ACCESS_WRITE|ACCESS_READ by default. 
4084   if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
4085     decorators |= ACCESS_READ | ACCESS_WRITE;
4086   }
4087   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4088   return bs->resolve(this, decorators, obj);
4089 }
4090
4091 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4092                                    Register thread_tmp, DecoratorSet decorators) {
4093   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4094 }
4095
4096 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4097                                             Register thread_tmp, DecoratorSet decorators) {
4098   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
4099 }
4100
4101 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
4102                                     Register thread_tmp, DecoratorSet decorators) {
4103   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4104 }
4105
4106 // Used for storing NULLs.
4107 void MacroAssembler::store_heap_oop_null(Address dst) {
4108   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
4109 }
4110
4111 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
4112   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
4113   int index = oop_recorder()->allocate_metadata_index(obj);
4114   RelocationHolder rspec = metadata_Relocation::spec(index);
4115   return Address((address)obj, rspec);
4116 }
4117
4118 // Move an oop into a register. 'immediate' is true if we want
4119 // immediate instructions, i.e. we are not going to patch this
4120 // instruction while the code is being executed by another thread. In
4121 // that case we can use move immediates rather than the constant pool.
4122 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
4123   int oop_index;
4124   if (obj == NULL) {
4125     oop_index = oop_recorder()->allocate_oop_index(obj);
4126   } else {
4127 #ifdef ASSERT
4128     {
4129       ThreadInVMfromUnknown tiv;
4130       assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
4131     }
4132 #endif
4133     oop_index = oop_recorder()->find_index(obj);
4134   }
4135   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4136   if (! immediate) {
4137     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
4138     ldr_constant(dst, Address(dummy, rspec));
4139   } else
4140     mov(dst, Address((address)obj, rspec));
4141 }
4142
4143 // Move a metadata address into a register.
4144 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
4145   int oop_index;
4146   if (obj == NULL) {
4147     oop_index = oop_recorder()->allocate_metadata_index(obj);
4148   } else {
4149     oop_index = oop_recorder()->find_index(obj);
4150   }
4151   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
4152   mov(dst, Address((address)obj, rspec));
4153 }
4154
4155 Address MacroAssembler::constant_oop_address(jobject obj) {
4156 #ifdef ASSERT
4157   {
4158     ThreadInVMfromUnknown tiv;
4159     assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
4160     assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "not an oop");
4161   }
4162 #endif
4163   int oop_index = oop_recorder()->find_index(obj);
4164   return Address((address)obj, oop_Relocation::spec(oop_index));
4165 }
4166
4167 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
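// The fast path delegated to below is conceptually the classic TLAB
// bump-pointer allocation (a sketch only; the real code lives in the
// active BarrierSetAssembler):
//   HeapWord* obj = thread->tlab().top();
//   HeapWord* end = obj + size_in_words;
//   if (end > thread->tlab().end()) goto slow_case;
//   thread->tlab().set_top(end);   // obj now points at the new object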
4168 void MacroAssembler::tlab_allocate(Register obj, 4169 Register var_size_in_bytes, 4170 int con_size_in_bytes, 4171 Register t1, 4172 Register t2, 4173 Label& slow_case) { 4174 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 4175 bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case); 4176 } 4177 4178 // Defines obj, preserves var_size_in_bytes 4179 void MacroAssembler::eden_allocate(Register obj, 4180 Register var_size_in_bytes, 4181 int con_size_in_bytes, 4182 Register t1, 4183 Label& slow_case) { 4184 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 4185 bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case); 4186 } 4187 4188 // Zero words; len is in bytes 4189 // Destroys all registers except addr 4190 // len must be a nonzero multiple of wordSize 4191 void MacroAssembler::zero_memory(Register addr, Register len, Register t1) { 4192 assert_different_registers(addr, len, t1, rscratch1, rscratch2); 4193 4194 #ifdef ASSERT 4195 { Label L; 4196 tst(len, BytesPerWord - 1); 4197 br(Assembler::EQ, L); 4198 stop("len is not a multiple of BytesPerWord"); 4199 bind(L); 4200 } 4201 #endif 4202 4203 #ifndef PRODUCT 4204 block_comment("zero memory"); 4205 #endif 4206 4207 Label loop; 4208 Label entry; 4209 4210 // Algorithm: 4211 // 4212 // scratch1 = cnt & 7; 4213 // cnt -= scratch1; 4214 // p += scratch1; 4215 // switch (scratch1) { 4216 // do { 4217 // cnt -= 8; 4218 // p[-8] = 0; 4219 // case 7: 4220 // p[-7] = 0; 4221 // case 6: 4222 // p[-6] = 0; 4223 // // ... 4224 // case 1: 4225 // p[-1] = 0; 4226 // case 0: 4227 // p += 8; 4228 // } while (cnt); 4229 // } 4230 4231 const int unroll = 8; // Number of str(zr) instructions we'll unroll 4232 4233 lsr(len, len, LogBytesPerWord); 4234 andr(rscratch1, len, unroll - 1); // tmp1 = cnt % unroll 4235 sub(len, len, rscratch1); // cnt -= unroll 4236 // t1 always points to the end of the region we're about to zero 4237 add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord); 4238 adr(rscratch2, entry); 4239 sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2); 4240 br(rscratch2); 4241 bind(loop); 4242 sub(len, len, unroll); 4243 for (int i = -unroll; i < 0; i++) 4244 Assembler::str(zr, Address(t1, i * wordSize)); 4245 bind(entry); 4246 add(t1, t1, unroll * wordSize); 4247 cbnz(len, loop); 4248 } 4249 4250 void MacroAssembler::verify_tlab() { 4251 #ifdef ASSERT 4252 if (UseTLAB && VerifyOops) { 4253 Label next, ok; 4254 4255 stp(rscratch2, rscratch1, Address(pre(sp, -16))); 4256 4257 ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset()))); 4258 ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset()))); 4259 cmp(rscratch2, rscratch1); 4260 br(Assembler::HS, next); 4261 STOP("assert(top >= start)"); 4262 should_not_reach_here(); 4263 4264 bind(next); 4265 ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset()))); 4266 ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset()))); 4267 cmp(rscratch2, rscratch1); 4268 br(Assembler::HS, ok); 4269 STOP("assert(top <= end)"); 4270 should_not_reach_here(); 4271 4272 bind(ok); 4273 ldp(rscratch2, rscratch1, Address(post(sp, 16))); 4274 } 4275 #endif 4276 } 4277 4278 // Writes to stack successive pages until offset reached to check for 4279 // stack overflow + shadow pages. This clobbers tmp. 
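// Roughly equivalent C for the loop below (a sketch; one word is stored
// per page, and the value stored is just a breadcrumb):
//   char* p = (char*)sp;
//   long  n = size;
//   do {
//     p -= page_size;
//     n -= page_size;
//     *(long*)p = n;
//   } while (n > 0);
//   // ...then one more store per shadow page below that.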
4280 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4281   assert_different_registers(tmp, size, rscratch1);
4282   mov(tmp, sp);
4283   // Bang stack for total size given plus shadow page size.
4284   // Bang one page at a time because a large size can bang beyond the yellow and
4285   // red zones.
4286   Label loop;
4287   mov(rscratch1, os::vm_page_size());
4288   bind(loop);
4289   lea(tmp, Address(tmp, -os::vm_page_size()));
4290   subsw(size, size, rscratch1);
4291   str(size, Address(tmp));
4292   br(Assembler::GT, loop);
4293
4294   // Bang down shadow pages too.
4295   // At this point, (tmp-0) is the last address touched, so don't
4296   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
4297   // was post-decremented.)  Skip this address by starting at i=1, and
4298   // touch a few more pages below.  N.B.  It is important to touch all
4299   // the way down to and including i=StackShadowPages.
4300   for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
4301     // This could be any sized move, but since it can serve as a debugging
4302     // crumb, the bigger the better.
4303     lea(tmp, Address(tmp, -os::vm_page_size()));
4304     str(size, Address(tmp));
4305   }
4306 }
4307
4308
4309 // Move the address of the polling page into dest.
4310 void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
4311   if (SafepointMechanism::uses_thread_local_poll()) {
4312     ldr(dest, Address(rthread, Thread::polling_page_offset()));
4313   } else {
4314     unsigned long off;
4315     adrp(dest, Address(page, rtype), off);
4316     assert(off == 0, "polling page must be page aligned");
4317   }
4318 }
4319
4320 // Move the address of the polling page into r, then read the polling
4321 // page.
4322 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
4323   get_polling_page(r, page, rtype);
4324   return read_polling_page(r, rtype);
4325 }
4326
4327 // Read the polling page.  The address of the polling page must
4328 // already be in r.
4329 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) { 4330 InstructionMark im(this); 4331 code_section()->relocate(inst_mark(), rtype); 4332 ldrw(zr, Address(r, 0)); 4333 return inst_mark(); 4334 } 4335 4336 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) { 4337 relocInfo::relocType rtype = dest.rspec().reloc()->type(); 4338 unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12; 4339 unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12; 4340 unsigned long dest_page = (unsigned long)dest.target() >> 12; 4341 long offset_low = dest_page - low_page; 4342 long offset_high = dest_page - high_page; 4343 4344 assert(is_valid_AArch64_address(dest.target()), "bad address"); 4345 assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address"); 4346 4347 InstructionMark im(this); 4348 code_section()->relocate(inst_mark(), dest.rspec()); 4349 // 8143067: Ensure that the adrp can reach the dest from anywhere within 4350 // the code cache so that if it is relocated we know it will still reach 4351 if (offset_high >= -(1<<20) && offset_low < (1<<20)) { 4352 _adrp(reg1, dest.target()); 4353 } else { 4354 unsigned long target = (unsigned long)dest.target(); 4355 unsigned long adrp_target 4356 = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL); 4357 4358 _adrp(reg1, (address)adrp_target); 4359 movk(reg1, target >> 32, 32); 4360 } 4361 byte_offset = (unsigned long)dest.target() & 0xfff; 4362 } 4363 4364 void MacroAssembler::load_byte_map_base(Register reg) { 4365 CardTable::CardValue* byte_map_base = 4366 ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base(); 4367 4368 if (is_valid_AArch64_address((address)byte_map_base)) { 4369 // Strictly speaking the byte_map_base isn't an address at all, 4370 // and it might even be negative. 4371 unsigned long offset; 4372 adrp(reg, ExternalAddress((address)byte_map_base), offset); 4373 // We expect offset to be zero with most collectors. 
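    // (byte_map_base is chosen so that, for a heap address a, the matching
    // card byte lives at byte_map_base + (a >> CardTable::card_shift); the
    // base itself can therefore lie outside the byte map.)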
4374     if (offset != 0) {
4375       add(reg, reg, offset);
4376     }
4377   } else {
4378     mov(reg, (uint64_t)byte_map_base);
4379   }
4380 }
4381
4382 void MacroAssembler::build_frame(int framesize) {
4383   assert(framesize > 0, "framesize must be > 0");
4384   if (framesize < ((1 << 9) + 2 * wordSize)) {
4385     sub(sp, sp, framesize);
4386     stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4387     if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
4388   } else {
4389     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4390     if (PreserveFramePointer) mov(rfp, sp);
4391     if (framesize < ((1 << 12) + 2 * wordSize))
4392       sub(sp, sp, framesize - 2 * wordSize);
4393     else {
4394       mov(rscratch1, framesize - 2 * wordSize);
4395       sub(sp, sp, rscratch1);
4396     }
4397   }
4398 }
4399
4400 void MacroAssembler::remove_frame(int framesize) {
4401   assert(framesize > 0, "framesize must be > 0");
4402   if (framesize < ((1 << 9) + 2 * wordSize)) {
4403     ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4404     add(sp, sp, framesize);
4405   } else {
4406     if (framesize < ((1 << 12) + 2 * wordSize))
4407       add(sp, sp, framesize - 2 * wordSize);
4408     else {
4409       mov(rscratch1, framesize - 2 * wordSize);
4410       add(sp, sp, rscratch1);
4411     }
4412     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4413   }
4414 }
4415
4416 #ifdef COMPILER2
4417 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4418
4419 // Search for str1 in str2 and return the index or -1
4420 void MacroAssembler::string_indexof(Register str2, Register str1,
4421                                     Register cnt2, Register cnt1,
4422                                     Register tmp1, Register tmp2,
4423                                     Register tmp3, Register tmp4,
4424                                     Register tmp5, Register tmp6,
4425                                     int icnt1, Register result, int ae) {
4426   // NOTE: tmp5, tmp6 can be zr depending on the specific method version
4427   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
4428
4429   Register ch1 = rscratch1;
4430   Register ch2 = rscratch2;
4431   Register cnt1tmp = tmp1;
4432   Register cnt2tmp = tmp2;
4433   Register cnt1_neg = cnt1;
4434   Register cnt2_neg = cnt2;
4435   Register result_tmp = tmp4;
4436
4437   bool isL = ae == StrIntrinsicNode::LL;
4438
4439   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
4440   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
4441   int str1_chr_shift = str1_isL ? 0:1;
4442   int str2_chr_shift = str2_isL ? 0:1;
4443   int str1_chr_size = str1_isL ? 1:2;
4444   int str2_chr_size = str2_isL ? 1:2;
4445   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4446                                        (chr_insn)&MacroAssembler::ldrh;
4447   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4448                                        (chr_insn)&MacroAssembler::ldrh;
4449   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
4450   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
4451
4452   // Note, inline_string_indexOf() generates checks:
4453   // if (substr.count > string.count) return -1;
4454   // if (substr.count == 0) return 0;
4455
4456   // We have two strings, a source string in str2, cnt2 and a pattern string
4457   // in str1, cnt1. Find the 1st occurrence of the pattern in the source or return -1.
4458
4459   // For a larger pattern and source we use a simplified Boyer-Moore algorithm.
4460   // With a small pattern and source we use a linear scan.
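  // As a concrete instance of the 'Bad Character' table built in the pseudo
  // code below: for the pattern "ABCAB" (m == 5) the preprocessing loop
  // leaves bc['A'] == 1, bc['B'] == 3, bc['C'] == 2, and bc[c] == 5 for every
  // other byte; each entry is the distance from the last occurrence of the
  // character among x[0..m-2] to the end of the pattern.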
4461
4462   if (icnt1 == -1) {
4463     sub(result_tmp, cnt2, cnt1);
4464     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
4465     br(LT, LINEARSEARCH);
4466     dup(v0, T16B, cnt1);          // done in separate FPU pipeline. Almost no penalty
4467     subs(zr, cnt1, 256);
4468     lsr(tmp1, cnt2, 2);
4469     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
4470     br(GE, LINEARSTUB);
4471   }
4472
4473 // The Boyer-Moore algorithm is based on the description here:
4474 //
4475 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
4476 //
4477 // This describes an algorithm with two shift rules: the 'Bad Character'
4478 // rule and the 'Good Suffix' rule.
4479 //
4480 // These rules are essentially heuristics for how far we can shift the
4481 // pattern along the search string.
4482 //
4483 // The implementation here uses the 'Bad Character' rule only because of the
4484 // complexity of initialisation for the 'Good Suffix' rule.
4485 //
4486 // This is also known as the Boyer-Moore-Horspool algorithm:
4487 //
4488 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
4489 //
4490 // This particular implementation has a few Java-specific optimizations.
4491 //
4492 // #define ASIZE 256
4493 //
4494 // int bm(unsigned char *x, int m, unsigned char *y, int n) {
4495 //   int i, j;
4496 //   unsigned c;
4497 //   unsigned char bc[ASIZE];
4498 //
4499 //   /* Preprocessing */
4500 //   for (i = 0; i < ASIZE; ++i)
4501 //     bc[i] = m;
4502 //   for (i = 0; i < m - 1; ) {
4503 //     c = x[i];
4504 //     ++i;
4505 //     // c < 256 for Latin1 string, so, no need for branch
4506 //     #ifdef PATTERN_STRING_IS_LATIN1
4507 //     bc[c] = m - i;
4508 //     #else
4509 //     if (c < ASIZE) bc[c] = m - i;
4510 //     #endif
4511 //   }
4512 //
4513 //   /* Searching */
4514 //   j = 0;
4515 //   while (j <= n - m) {
4516 //     c = y[j+m-1];
4517 //     if (x[m-1] == c)
4518 //       for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
4519 //     if (i < 0) return j;
4520 //     // c < 256 for Latin1 string, so, no need for branch
4521 //     #ifdef SOURCE_STRING_IS_LATIN1
4522 //     // LL case: (c < 256) always true. Remove branch
4523 //     j += bc[y[j+m-1]];
4524 //     #endif
4525 //     #ifndef PATTERN_STRING_IS_UTF
4526 //     // UU case: need if (c<ASIZE) check. Skip 1 character if not.
4527 //     if (c < ASIZE)
4528 //       j += bc[y[j+m-1]];
4529 //     else
4530 //       j += 1;
4531 //     #endif
4532 //     #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
4533 //     // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
4534 //     if (c < ASIZE)
4535 //       j += bc[y[j+m-1]];
4536 //     else
4537 //       j += m;
4538 //     #endif
4539 //   }
4540 // }
4541
4542   if (icnt1 == -1) {
4543     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
4544         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
4545     Register cnt1end = tmp2;
4546     Register str2end = cnt2;
4547     Register skipch = tmp2;
4548
4549     // str1 length is >= 8, so we can read at least 1 register for cases when
4550     // UTF->Latin1 conversion is not needed (8 LL or 4 UU) and half a register
4551     // for the UL case. We'll re-read the last character in the inner pre-loop
4552     // code to have a single outer pre-loop load.
4553     const int firstStep = isL ?
7 : 3; 4554 4555 const int ASIZE = 256; 4556 const int STORED_BYTES = 32; // amount of bytes stored per instruction 4557 sub(sp, sp, ASIZE); 4558 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations 4559 mov(ch1, sp); 4560 BIND(BM_INIT_LOOP); 4561 stpq(v0, v0, Address(post(ch1, STORED_BYTES))); 4562 subs(tmp5, tmp5, 1); 4563 br(GT, BM_INIT_LOOP); 4564 4565 sub(cnt1tmp, cnt1, 1); 4566 mov(tmp5, str2); 4567 add(str2end, str2, result_tmp, LSL, str2_chr_shift); 4568 sub(ch2, cnt1, 1); 4569 mov(tmp3, str1); 4570 BIND(BCLOOP); 4571 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size))); 4572 if (!str1_isL) { 4573 subs(zr, ch1, ASIZE); 4574 br(HS, BCSKIP); 4575 } 4576 strb(ch2, Address(sp, ch1)); 4577 BIND(BCSKIP); 4578 subs(ch2, ch2, 1); 4579 br(GT, BCLOOP); 4580 4581 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1 4582 if (str1_isL == str2_isL) { 4583 // load last 8 bytes (8LL/4UU symbols) 4584 ldr(tmp6, Address(tmp6, -wordSize)); 4585 } else { 4586 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols) 4587 // convert Latin1 to UTF. We'll have to wait until load completed, but 4588 // it's still faster than per-character loads+checks 4589 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1] 4590 ubfx(ch1, tmp6, 8, 8); // str1[N-2] 4591 ubfx(ch2, tmp6, 16, 8); // str1[N-3] 4592 andr(tmp6, tmp6, 0xFF); // str1[N-4] 4593 orr(ch2, ch1, ch2, LSL, 16); 4594 orr(tmp6, tmp6, tmp3, LSL, 48); 4595 orr(tmp6, tmp6, ch2, LSL, 16); 4596 } 4597 BIND(BMLOOPSTR2); 4598 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 4599 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8 4600 if (str1_isL == str2_isL) { 4601 // re-init tmp3. It's for free because it's executed in parallel with 4602 // load above. Alternative is to initialize it before loop, but it'll 4603 // affect performance on in-order systems with 2 or more ld/st pipelines 4604 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size)); 4605 } 4606 if (!isL) { // UU/UL case 4607 lsl(ch2, cnt1tmp, 1); // offset in bytes 4608 } 4609 cmp(tmp3, skipch); 4610 br(NE, BMSKIP); 4611 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2)); 4612 mov(ch1, tmp6); 4613 if (isL) { 4614 b(BMLOOPSTR1_AFTER_LOAD); 4615 } else { 4616 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. 
cnt1 >= 8 4617 b(BMLOOPSTR1_CMP); 4618 } 4619 BIND(BMLOOPSTR1); 4620 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 4621 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 4622 BIND(BMLOOPSTR1_AFTER_LOAD); 4623 subs(cnt1tmp, cnt1tmp, 1); 4624 br(LT, BMLOOPSTR1_LASTCMP); 4625 BIND(BMLOOPSTR1_CMP); 4626 cmp(ch1, ch2); 4627 br(EQ, BMLOOPSTR1); 4628 BIND(BMSKIP); 4629 if (!isL) { 4630 // if we've met UTF symbol while searching Latin1 pattern, then we can 4631 // skip cnt1 symbols 4632 if (str1_isL != str2_isL) { 4633 mov(result_tmp, cnt1); 4634 } else { 4635 mov(result_tmp, 1); 4636 } 4637 subs(zr, skipch, ASIZE); 4638 br(HS, BMADV); 4639 } 4640 ldrb(result_tmp, Address(sp, skipch)); // load skip distance 4641 BIND(BMADV); 4642 sub(cnt1tmp, cnt1, 1); 4643 add(str2, str2, result_tmp, LSL, str2_chr_shift); 4644 cmp(str2, str2end); 4645 br(LE, BMLOOPSTR2); 4646 add(sp, sp, ASIZE); 4647 b(NOMATCH); 4648 BIND(BMLOOPSTR1_LASTCMP); 4649 cmp(ch1, ch2); 4650 br(NE, BMSKIP); 4651 BIND(BMMATCH); 4652 sub(result, str2, tmp5); 4653 if (!str2_isL) lsr(result, result, 1); 4654 add(sp, sp, ASIZE); 4655 b(DONE); 4656 4657 BIND(LINEARSTUB); 4658 cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm 4659 br(LT, LINEAR_MEDIUM); 4660 mov(result, zr); 4661 RuntimeAddress stub = NULL; 4662 if (isL) { 4663 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll()); 4664 assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated"); 4665 } else if (str1_isL) { 4666 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul()); 4667 assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated"); 4668 } else { 4669 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu()); 4670 assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated"); 4671 } 4672 trampoline_call(stub); 4673 b(DONE); 4674 } 4675 4676 BIND(LINEARSEARCH); 4677 { 4678 Label DO1, DO2, DO3; 4679 4680 Register str2tmp = tmp2; 4681 Register first = tmp3; 4682 4683 if (icnt1 == -1) 4684 { 4685 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT; 4686 4687 cmp(cnt1, u1(str1_isL == str2_isL ? 
4 : 2)); 4688 br(LT, DOSHORT); 4689 BIND(LINEAR_MEDIUM); 4690 (this->*str1_load_1chr)(first, Address(str1)); 4691 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift))); 4692 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift); 4693 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4694 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4695 4696 BIND(FIRST_LOOP); 4697 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 4698 cmp(first, ch2); 4699 br(EQ, STR1_LOOP); 4700 BIND(STR2_NEXT); 4701 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4702 br(LE, FIRST_LOOP); 4703 b(NOMATCH); 4704 4705 BIND(STR1_LOOP); 4706 adds(cnt1tmp, cnt1_neg, str1_chr_size); 4707 add(cnt2tmp, cnt2_neg, str2_chr_size); 4708 br(GE, MATCH); 4709 4710 BIND(STR1_NEXT); 4711 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp)); 4712 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 4713 cmp(ch1, ch2); 4714 br(NE, STR2_NEXT); 4715 adds(cnt1tmp, cnt1tmp, str1_chr_size); 4716 add(cnt2tmp, cnt2tmp, str2_chr_size); 4717 br(LT, STR1_NEXT); 4718 b(MATCH); 4719 4720 BIND(DOSHORT); 4721 if (str1_isL == str2_isL) { 4722 cmp(cnt1, (u1)2); 4723 br(LT, DO1); 4724 br(GT, DO3); 4725 } 4726 } 4727 4728 if (icnt1 == 4) { 4729 Label CH1_LOOP; 4730 4731 (this->*load_4chr)(ch1, str1); 4732 sub(result_tmp, cnt2, 4); 4733 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4734 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4735 4736 BIND(CH1_LOOP); 4737 (this->*load_4chr)(ch2, Address(str2, cnt2_neg)); 4738 cmp(ch1, ch2); 4739 br(EQ, MATCH); 4740 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4741 br(LE, CH1_LOOP); 4742 b(NOMATCH); 4743 } 4744 4745 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) { 4746 Label CH1_LOOP; 4747 4748 BIND(DO2); 4749 (this->*load_2chr)(ch1, str1); 4750 if (icnt1 == 2) { 4751 sub(result_tmp, cnt2, 2); 4752 } 4753 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4754 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4755 BIND(CH1_LOOP); 4756 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 4757 cmp(ch1, ch2); 4758 br(EQ, MATCH); 4759 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4760 br(LE, CH1_LOOP); 4761 b(NOMATCH); 4762 } 4763 4764 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) { 4765 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP; 4766 4767 BIND(DO3); 4768 (this->*load_2chr)(first, str1); 4769 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size)); 4770 if (icnt1 == 3) { 4771 sub(result_tmp, cnt2, 3); 4772 } 4773 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4774 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4775 BIND(FIRST_LOOP); 4776 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 4777 cmpw(first, ch2); 4778 br(EQ, STR1_LOOP); 4779 BIND(STR2_NEXT); 4780 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4781 br(LE, FIRST_LOOP); 4782 b(NOMATCH); 4783 4784 BIND(STR1_LOOP); 4785 add(cnt2tmp, cnt2_neg, 2*str2_chr_size); 4786 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 4787 cmp(ch1, ch2); 4788 br(NE, STR2_NEXT); 4789 b(MATCH); 4790 } 4791 4792 if (icnt1 == -1 || icnt1 == 1) { 4793 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP; 4794 4795 BIND(DO1); 4796 (this->*str1_load_1chr)(ch1, str1); 4797 cmp(cnt2, (u1)8); 4798 br(LT, DO1_SHORT); 4799 4800 sub(result_tmp, cnt2, 8/str2_chr_size); 4801 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4802 mov(tmp3, str2_isL ? 
0x0101010101010101 : 0x0001000100010001); 4803 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4804 4805 if (str2_isL) { 4806 orr(ch1, ch1, ch1, LSL, 8); 4807 } 4808 orr(ch1, ch1, ch1, LSL, 16); 4809 orr(ch1, ch1, ch1, LSL, 32); 4810 BIND(CH1_LOOP); 4811 ldr(ch2, Address(str2, cnt2_neg)); 4812 eor(ch2, ch1, ch2); 4813 sub(tmp1, ch2, tmp3); 4814 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4815 bics(tmp1, tmp1, tmp2); 4816 br(NE, HAS_ZERO); 4817 adds(cnt2_neg, cnt2_neg, 8); 4818 br(LT, CH1_LOOP); 4819 4820 cmp(cnt2_neg, (u1)8); 4821 mov(cnt2_neg, 0); 4822 br(LT, CH1_LOOP); 4823 b(NOMATCH); 4824 4825 BIND(HAS_ZERO); 4826 rev(tmp1, tmp1); 4827 clz(tmp1, tmp1); 4828 add(cnt2_neg, cnt2_neg, tmp1, LSR, 3); 4829 b(MATCH); 4830 4831 BIND(DO1_SHORT); 4832 mov(result_tmp, cnt2); 4833 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 4834 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 4835 BIND(DO1_LOOP); 4836 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 4837 cmpw(ch1, ch2); 4838 br(EQ, MATCH); 4839 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4840 br(LT, DO1_LOOP); 4841 } 4842 } 4843 BIND(NOMATCH); 4844 mov(result, -1); 4845 b(DONE); 4846 BIND(MATCH); 4847 add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift); 4848 BIND(DONE); 4849 } 4850 4851 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 4852 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn); 4853 4854 void MacroAssembler::string_indexof_char(Register str1, Register cnt1, 4855 Register ch, Register result, 4856 Register tmp1, Register tmp2, Register tmp3) 4857 { 4858 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 4859 Register cnt1_neg = cnt1; 4860 Register ch1 = rscratch1; 4861 Register result_tmp = rscratch2; 4862 4863 cbz(cnt1, NOMATCH); 4864 4865 cmp(cnt1, (u1)4); 4866 br(LT, DO1_SHORT); 4867 4868 orr(ch, ch, ch, LSL, 16); 4869 orr(ch, ch, ch, LSL, 32); 4870 4871 sub(cnt1, cnt1, 4); 4872 mov(result_tmp, cnt1); 4873 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 4874 sub(cnt1_neg, zr, cnt1, LSL, 1); 4875 4876 mov(tmp3, 0x0001000100010001); 4877 4878 BIND(CH1_LOOP); 4879 ldr(ch1, Address(str1, cnt1_neg)); 4880 eor(ch1, ch, ch1); 4881 sub(tmp1, ch1, tmp3); 4882 orr(tmp2, ch1, 0x7fff7fff7fff7fff); 4883 bics(tmp1, tmp1, tmp2); 4884 br(NE, HAS_ZERO); 4885 adds(cnt1_neg, cnt1_neg, 8); 4886 br(LT, CH1_LOOP); 4887 4888 cmp(cnt1_neg, (u1)8); 4889 mov(cnt1_neg, 0); 4890 br(LT, CH1_LOOP); 4891 b(NOMATCH); 4892 4893 BIND(HAS_ZERO); 4894 rev(tmp1, tmp1); 4895 clz(tmp1, tmp1); 4896 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 4897 b(MATCH); 4898 4899 BIND(DO1_SHORT); 4900 mov(result_tmp, cnt1); 4901 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 4902 sub(cnt1_neg, zr, cnt1, LSL, 1); 4903 BIND(DO1_LOOP); 4904 ldrh(ch1, Address(str1, cnt1_neg)); 4905 cmpw(ch, ch1); 4906 br(EQ, MATCH); 4907 adds(cnt1_neg, cnt1_neg, 2); 4908 br(LT, DO1_LOOP); 4909 BIND(NOMATCH); 4910 mov(result, -1); 4911 b(DONE); 4912 BIND(MATCH); 4913 add(result, result_tmp, cnt1_neg, ASR, 1); 4914 BIND(DONE); 4915 } 4916 4917 // Compare strings. 
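// 'ae' encodes the argument encodings (StrIntrinsicNode::LL/LU/UL/UU, where
// L is Latin1 and U is UTF-16). The result follows String.compareTo
// conventions; as a sketch of the contract (not the emitted code):
//   int cmp = first non-zero (str1[i] - str2[i]) over min(cnt1, cnt2) chars;
//   return cmp != 0 ? cmp : cnt1 - cnt2;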
4918 void MacroAssembler::string_compare(Register str1, Register str2,
4919     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
4920     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
4921   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
4922       DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
4923       SHORT_LOOP_START, TAIL_CHECK;
4924
4925   bool isLL = ae == StrIntrinsicNode::LL;
4926   bool isLU = ae == StrIntrinsicNode::LU;
4927   bool isUL = ae == StrIntrinsicNode::UL;
4928
4929   // The stub threshold for LL strings is: 72 (64 + 8) chars
4930   // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
4931   // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
4932   const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);
4933
4934   bool str1_isL = isLL || isLU;
4935   bool str2_isL = isLL || isUL;
4936
4937   int str1_chr_shift = str1_isL ? 0 : 1;
4938   int str2_chr_shift = str2_isL ? 0 : 1;
4939   int str1_chr_size = str1_isL ? 1 : 2;
4940   int str2_chr_size = str2_isL ? 1 : 2;
4941   int minCharsInWord = isLL ? wordSize : wordSize/2;
4942
4943   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
4944   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4945                                       (chr_insn)&MacroAssembler::ldrh;
4946   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4947                                       (chr_insn)&MacroAssembler::ldrh;
4948   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
4949                             (uxt_insn)&MacroAssembler::uxthw;
4950
4951   BLOCK_COMMENT("string_compare {");
4952
4953   // Bizarrely, the counts are passed in bytes, regardless of whether they
4954   // are L or U strings; however, the result is always in characters.
4955   if (!str1_isL) asrw(cnt1, cnt1, 1);
4956   if (!str2_isL) asrw(cnt2, cnt2, 1);
4957
4958   // Compute the minimum of the string lengths and save the difference.
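  // In C terms: result = cnt1 - cnt2; cnt2 = MIN2(cnt1, cnt2);
  // If all of the first min(cnt1, cnt2) characters match, 'result' already
  // holds the answer.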
4959 subsw(result, cnt1, cnt2); 4960 cselw(cnt2, cnt1, cnt2, Assembler::LE); // min 4961 4962 // A very short string 4963 cmpw(cnt2, minCharsInWord); 4964 br(Assembler::LE, SHORT_STRING); 4965 4966 // Compare longwords 4967 // load first parts of strings and finish initialization while loading 4968 { 4969 if (str1_isL == str2_isL) { // LL or UU 4970 ldr(tmp1, Address(str1)); 4971 cmp(str1, str2); 4972 br(Assembler::EQ, DONE); 4973 ldr(tmp2, Address(str2)); 4974 cmp(cnt2, stub_threshold); 4975 br(GE, STUB); 4976 subsw(cnt2, cnt2, minCharsInWord); 4977 br(EQ, TAIL_CHECK); 4978 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 4979 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 4980 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 4981 } else if (isLU) { 4982 ldrs(vtmp, Address(str1)); 4983 ldr(tmp2, Address(str2)); 4984 cmp(cnt2, stub_threshold); 4985 br(GE, STUB); 4986 subw(cnt2, cnt2, 4); 4987 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 4988 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 4989 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 4990 zip1(vtmp, T8B, vtmp, vtmpZ); 4991 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 4992 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 4993 add(cnt1, cnt1, 4); 4994 fmovd(tmp1, vtmp); 4995 } else { // UL case 4996 ldr(tmp1, Address(str1)); 4997 ldrs(vtmp, Address(str2)); 4998 cmp(cnt2, stub_threshold); 4999 br(GE, STUB); 5000 subw(cnt2, cnt2, 4); 5001 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 5002 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 5003 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 5004 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 5005 zip1(vtmp, T8B, vtmp, vtmpZ); 5006 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 5007 add(cnt1, cnt1, 8); 5008 fmovd(tmp2, vtmp); 5009 } 5010 adds(cnt2, cnt2, isUL ? 4 : 8); 5011 br(GE, TAIL); 5012 eor(rscratch2, tmp1, tmp2); 5013 cbnz(rscratch2, DIFFERENCE); 5014 // main loop 5015 bind(NEXT_WORD); 5016 if (str1_isL == str2_isL) { 5017 ldr(tmp1, Address(str1, cnt2)); 5018 ldr(tmp2, Address(str2, cnt2)); 5019 adds(cnt2, cnt2, 8); 5020 } else if (isLU) { 5021 ldrs(vtmp, Address(str1, cnt1)); 5022 ldr(tmp2, Address(str2, cnt2)); 5023 add(cnt1, cnt1, 4); 5024 zip1(vtmp, T8B, vtmp, vtmpZ); 5025 fmovd(tmp1, vtmp); 5026 adds(cnt2, cnt2, 8); 5027 } else { // UL 5028 ldrs(vtmp, Address(str2, cnt2)); 5029 ldr(tmp1, Address(str1, cnt1)); 5030 zip1(vtmp, T8B, vtmp, vtmpZ); 5031 add(cnt1, cnt1, 8); 5032 fmovd(tmp2, vtmp); 5033 adds(cnt2, cnt2, 4); 5034 } 5035 br(GE, TAIL); 5036 5037 eor(rscratch2, tmp1, tmp2); 5038 cbz(rscratch2, NEXT_WORD); 5039 b(DIFFERENCE); 5040 bind(TAIL); 5041 eor(rscratch2, tmp1, tmp2); 5042 cbnz(rscratch2, DIFFERENCE); 5043 // Last longword. In the case where length == 4 we compare the 5044 // same longword twice, but that's still faster than another 5045 // conditional branch. 5046 if (str1_isL == str2_isL) { 5047 ldr(tmp1, Address(str1)); 5048 ldr(tmp2, Address(str2)); 5049 } else if (isLU) { 5050 ldrs(vtmp, Address(str1)); 5051 ldr(tmp2, Address(str2)); 5052 zip1(vtmp, T8B, vtmp, vtmpZ); 5053 fmovd(tmp1, vtmp); 5054 } else { // UL 5055 ldrs(vtmp, Address(str2)); 5056 ldr(tmp1, Address(str1)); 5057 zip1(vtmp, T8B, vtmp, vtmpZ); 5058 fmovd(tmp2, vtmp); 5059 } 5060 bind(TAIL_CHECK); 5061 eor(rscratch2, tmp1, tmp2); 5062 cbz(rscratch2, DONE); 5063 5064 // Find the first different characters in the longwords and 5065 // compute their difference. 
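    // The rev/clz trick below, as C (little-endian; a sketch using the same
    // masks as the emitted code):
    //   uint64_t diff = tmp1 ^ tmp2;            // already in rscratch2
    //   int bit = clz(byte_reverse(diff));      // first differing byte, as a bit index
    //   bit &= isLL ? ~7 : ~15;                 // round down to a character boundary
    //   return (int)chr(tmp1 >> bit) - (int)chr(tmp2 >> bit);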
    bind(DIFFERENCE);
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
  RuntimeAddress stub = NULL;
  switch (ae) {
    case StrIntrinsicNode::LL:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
      break;
    case StrIntrinsicNode::UU:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
      break;
    case StrIntrinsicNode::LU:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
      break;
    case StrIntrinsicNode::UL:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
      break;
    default:
      ShouldNotReachHere();
  }
  assert(stub.target() != NULL, "compare_long_string stub has not been generated");
  trampoline_call(stub);
  b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // Arrange the code so that most branches are taken while loading, and the
  // next characters are loaded while the previous ones are being compared.
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}
#endif // COMPILER2

// This method checks whether the provided byte array contains a byte with
// the highest bit set.
void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
  // The simple and most common case, a small aligned array that is not at
  // the end of a memory page, is handled here. All other cases go to a stub.
  Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
  const uint64_t UPPER_BIT_MASK = 0x8080808080808080;
  assert_different_registers(ary1, len, result);

  cmpw(len, 0);
  br(LE, SET_RESULT);
  cmpw(len, 4 * wordSize);
  br(GE, STUB_LONG); // if size >= 32, go to the long stub

  int shift = 64 - exact_log2(os::vm_page_size());
  lsl(rscratch1, ary1, shift);
  mov(rscratch2, (size_t)(4 * wordSize) << shift);
  adds(rscratch2, rscratch1, rscratch2);  // At end of page?
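  // What the three instructions above compute, as a C sketch (illustrative;
  // shifting both operands left by (64 - log2(page_size)) lets the carry
  // flag answer the comparison directly):
  //
  //   uint64_t in_page = (uint64_t)ary1 & (os::vm_page_size() - 1);
  //   if (in_page + 4 * wordSize >= os::vm_page_size())
  //     goto STUB;   // a 32-byte read might reach into the next page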
  br(CS, STUB); // near the end of the page, go to the stub
  subs(len, len, wordSize);
  br(LT, END);

  BIND(LOOP);
  ldr(rscratch1, Address(post(ary1, wordSize)));
  tst(rscratch1, UPPER_BIT_MASK);
  br(NE, SET_RESULT);
  subs(len, len, wordSize);
  br(GE, LOOP);
  cmpw(len, -wordSize);
  br(EQ, SET_RESULT);

  BIND(END);
  ldr(result, Address(ary1));
  sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
  lslv(result, result, len);
  tst(result, UPPER_BIT_MASK);
  b(SET_RESULT);

  BIND(STUB);
  RuntimeAddress has_neg = RuntimeAddress(StubRoutines::aarch64::has_negatives());
  assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
  trampoline_call(has_neg);
  b(DONE);

  BIND(STUB_LONG);
  RuntimeAddress has_neg_long = RuntimeAddress(
      StubRoutines::aarch64::has_negatives_long());
  assert(has_neg_long.target() != NULL, "has_negatives_long stub has not been generated");
  trampoline_call(has_neg_long);
  b(DONE);

  BIND(SET_RESULT);
  cset(result, NE); // set true or false

  BIND(DONE);
}

void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
                                   Register tmp4, Register tmp5, Register result,
                                   Register cnt1, int elem_size) {
  Label DONE, SAME;
  Register tmp1 = rscratch1;
  Register tmp2 = rscratch2;
  Register cnt2 = tmp2;  // cnt2 only used in array length compare
  int elem_per_word = wordSize/elem_size;
  int log_elem_size = exact_log2(elem_size);
  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset
    = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
  int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);

  assert(elem_size == 1 || elem_size == 2, "must be char or byte");
  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);

#ifndef PRODUCT
  {
    const char kind = (elem_size == 2) ? 'U' : 'L';
    char comment[64];
    snprintf(comment, sizeof comment, "array_equals%c{", kind);
    BLOCK_COMMENT(comment);
  }
#endif

  // if (a1 == a2)
  //     return true;
  cmpoop(a1, a2); // May have read barriers for a1 and a2.
  br(EQ, SAME);

  if (UseSimpleArrayEquals) {
    Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
    // if (a1 == null || a2 == null)
    //     return false;
    // a1 & a2 == 0 means either some pointer is null, or (very rarely,
    // perhaps never in practice) two pointer values happen to AND to zero,
    // so we can save one branch in most cases.
    tst(a1, a2);
    mov(result, false);
    br(EQ, A_MIGHT_BE_NULL);
    // if (a1.length != a2.length)
    //     return false;
    bind(A_IS_NOT_NULL);
    ldrw(cnt1, Address(a1, length_offset));
    ldrw(cnt2, Address(a2, length_offset));
    eorw(tmp5, cnt1, cnt2);
    cbnzw(tmp5, DONE);
    lea(a1, Address(a1, base_offset));
    lea(a2, Address(a2, base_offset));
    // Check for short strings, i.e. smaller than wordSize.
    subs(cnt1, cnt1, elem_per_word);
    br(Assembler::LT, SHORT);
    // Main 8 byte comparison loop.
    bind(NEXT_WORD); {
      ldr(tmp1, Address(post(a1, wordSize)));
      ldr(tmp2, Address(post(a2, wordSize)));
      subs(cnt1, cnt1, elem_per_word);
      eor(tmp5, tmp1, tmp2);
      cbnz(tmp5, DONE);
    } br(GT, NEXT_WORD);
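    // The loop just closed is the usual word-at-a-time equality check. As a
    // C sketch (illustrative; byte-array case, where elem_per_word == 8):
    //
    //   do {
    //     uint64_t w1 = *(const uint64_t*)a1;  a1 += 8;
    //     uint64_t w2 = *(const uint64_t*)a2;  a2 += 8;
    //     cnt1 -= 8;                // elements remaining after this word
    //     if (w1 != w2) return false;
    //   } while (cnt1 > 0);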
    // Last longword. In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
    // length == 4.
    if (log_elem_size > 0)
      lsl(cnt1, cnt1, log_elem_size);
    ldr(tmp3, Address(a1, cnt1));
    ldr(tmp4, Address(a2, cnt1));
    eor(tmp5, tmp3, tmp4);
    cbnz(tmp5, DONE);
    b(SAME);
    bind(A_MIGHT_BE_NULL);
    // in case both a1 and a2 are not null, proceed with loads
    cbz(a1, DONE);
    cbz(a2, DONE);
    b(A_IS_NOT_NULL);
    bind(SHORT);

    tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
    {
      ldrw(tmp1, Address(post(a1, 4)));
      ldrw(tmp2, Address(post(a2, 4)));
      eorw(tmp5, tmp1, tmp2);
      cbnzw(tmp5, DONE);
    }
    bind(TAIL03);
    tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
    {
      ldrh(tmp3, Address(post(a1, 2)));
      ldrh(tmp4, Address(post(a2, 2)));
      eorw(tmp5, tmp3, tmp4);
      cbnzw(tmp5, DONE);
    }
    bind(TAIL01);
    if (elem_size == 1) { // Only needed when comparing byte arrays.
      tbz(cnt1, 0, SAME); // 0-1 bytes left.
      {
        ldrb(tmp1, a1);
        ldrb(tmp2, a2);
        eorw(tmp5, tmp1, tmp2);
        cbnzw(tmp5, DONE);
      }
    }
  } else {
    Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
        CSET_EQ, LAST_CHECK;
    mov(result, false);
    cbz(a1, DONE);
    ldrw(cnt1, Address(a1, length_offset));
    cbz(a2, DONE);
    ldrw(cnt2, Address(a2, length_offset));
    // On most CPUs a2 is (surprisingly) still "locked" by the ldrw above, so
    // it's faster to perform another branch here before comparing a1 and a2.
    cmp(cnt1, (u1)elem_per_word);
    br(LE, SHORT); // short or same
    ldr(tmp3, Address(pre(a1, base_offset)));
    subs(zr, cnt1, stubBytesThreshold);
    br(GE, STUB);
    ldr(tmp4, Address(pre(a2, base_offset)));
    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
    cmp(cnt2, cnt1);
    br(NE, DONE);

    // Main 16 byte comparison loop with 2 exits
    bind(NEXT_DWORD); {
      ldr(tmp1, Address(pre(a1, wordSize)));
      ldr(tmp2, Address(pre(a2, wordSize)));
      subs(cnt1, cnt1, 2 * elem_per_word);
      br(LE, TAIL);
      eor(tmp4, tmp3, tmp4);
      cbnz(tmp4, DONE);
      ldr(tmp3, Address(pre(a1, wordSize)));
      ldr(tmp4, Address(pre(a2, wordSize)));
      cmp(cnt1, (u1)elem_per_word);
      br(LE, TAIL2);
      cmp(tmp1, tmp2);
    } br(EQ, NEXT_DWORD);
    b(DONE);

    bind(TAIL);
    eor(tmp4, tmp3, tmp4);
    eor(tmp2, tmp1, tmp2);
    lslv(tmp2, tmp2, tmp5);
    orr(tmp5, tmp4, tmp2);
    cmp(tmp5, zr);
    b(CSET_EQ);

    bind(TAIL2);
    eor(tmp2, tmp1, tmp2);
    cbnz(tmp2, DONE);
    b(LAST_CHECK);

    bind(STUB);
    ldr(tmp4, Address(pre(a2, base_offset)));
    cmp(cnt2, cnt1);
    br(NE, DONE);
    if (elem_size == 2) { // convert to byte counter
      lsl(cnt1, cnt1, 1);
    }
    eor(tmp5, tmp3, tmp4);
    cbnz(tmp5, DONE);
    RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
    assert(stub.target() != NULL, "array_equals_long stub has not been generated");
    trampoline_call(stub);
    b(DONE);

    bind(EARLY_OUT);
    // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
    // So, if a2 == null we return false (0); otherwise we return true.
    // Hence we can simply return a2.
    mov(result, a2);
    b(DONE);
    bind(SHORT);
    cmp(cnt2, cnt1);
    br(NE, DONE);
    cbz(cnt1, SAME);
    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
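    // tmp5 is now -(cnt1 << (3 + log_elem_size)), i.e. minus the number of
    // payload bits. lslv shifts modulo 64, so in LAST_CHECK below this acts
    // as a left shift by (64 - payload_bits), discarding the garbage bytes
    // loaded beyond the array length. As a C sketch (illustrative only):
    //
    //   uint64_t diff = (w1 ^ w2) << (64 - cnt1 * 8 * elem_size);
    //   return diff == 0;   // materialized via cmp + cset(EQ)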
    ldr(tmp3, Address(a1, base_offset));
    ldr(tmp4, Address(a2, base_offset));
    bind(LAST_CHECK);
    eor(tmp4, tmp3, tmp4);
    lslv(tmp5, tmp4, tmp5);
    cmp(tmp5, zr);
    bind(CSET_EQ);
    cset(result, EQ);
    b(DONE);
  }

  bind(SAME);
  mov(result, true);
  // That's it.
  bind(DONE);

  BLOCK_COMMENT("} array_equals");
}

// Compare Strings

// For Strings we're passed the address of the first characters in a1
// and a2 and the length in cnt1.
// elem_size is the element size in bytes: either 1 or 2.
// There are two implementations. For arrays >= 8 bytes, all
// comparisons (including the final one, which may overlap) are
// performed 8 bytes at a time. For strings < 8 bytes, we compare a
// word, then a halfword, and then a byte.

void MacroAssembler::string_equals(Register a1, Register a2,
                                   Register result, Register cnt1, int elem_size)
{
  Label SAME, DONE, SHORT, NEXT_WORD;
  Register tmp1 = rscratch1;
  Register tmp2 = rscratch2;
  Register cnt2 = tmp2;  // cnt2 only used in array length compare

  assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);

#ifndef PRODUCT
  {
    const char kind = (elem_size == 2) ? 'U' : 'L';
    char comment[64];
    snprintf(comment, sizeof comment, "{string_equals%c", kind);
    BLOCK_COMMENT(comment);
  }
#endif

  mov(result, false);

  // Check for short strings, i.e. smaller than wordSize.
  subs(cnt1, cnt1, wordSize);
  br(Assembler::LT, SHORT);
  // Main 8 byte comparison loop.
  bind(NEXT_WORD); {
    ldr(tmp1, Address(post(a1, wordSize)));
    ldr(tmp2, Address(post(a2, wordSize)));
    subs(cnt1, cnt1, wordSize);
    eor(tmp1, tmp1, tmp2);
    cbnz(tmp1, DONE);
  } br(GT, NEXT_WORD);
  // Last longword. In the case where length == 4 we compare the
  // same longword twice, but that's still faster than another
  // conditional branch.
  // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
  // length == 4.
  ldr(tmp1, Address(a1, cnt1));
  ldr(tmp2, Address(a2, cnt1));
  eor(tmp2, tmp1, tmp2);
  cbnz(tmp2, DONE);
  b(SAME);

  bind(SHORT);
  Label TAIL03, TAIL01;

  tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
  {
    ldrw(tmp1, Address(post(a1, 4)));
    ldrw(tmp2, Address(post(a2, 4)));
    eorw(tmp1, tmp1, tmp2);
    cbnzw(tmp1, DONE);
  }
  bind(TAIL03);
  tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
  {
    ldrh(tmp1, Address(post(a1, 2)));
    ldrh(tmp2, Address(post(a2, 2)));
    eorw(tmp1, tmp1, tmp2);
    cbnzw(tmp1, DONE);
  }
  bind(TAIL01);
  if (elem_size == 1) { // Only needed when comparing 1-byte elements
    tbz(cnt1, 0, SAME); // 0-1 bytes left.
    {
      ldrb(tmp1, a1);
      ldrb(tmp2, a2);
      eorw(tmp1, tmp1, tmp2);
      cbnzw(tmp1, DONE);
    }
  }
  // Arrays are equal.
  bind(SAME);
  mov(result, true);

  // That's it.
  bind(DONE);
  BLOCK_COMMENT("} string_equals");
}


// The size of the blocks zeroed by the zero_blocks stub. We must
// handle anything smaller than this ourselves in zero_words().
const int MacroAssembler::zero_words_block_size = 8;

// zero_words() is used by C2 ClearArray patterns. It is as small as
// possible, handling small word counts locally and delegating
// anything larger to the zero_blocks stub. It is expanded many times
// in compiled code, so it is important to keep it short.

// ptr: Address of a buffer to be zeroed.
// cnt: Count in HeapWords.
//
// ptr, cnt, rscratch1, and rscratch2 are clobbered.
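// A rough C-level sketch of the expansion below (illustrative only; the
// zero_blocks stub is assumed, per its contract, to leave fewer than
// zero_words_block_size words for the local tail):
//
//   if (cnt >= zero_words_block_size) zero_blocks(ptr, cnt);  // bulk part
//   if (cnt & 4) { ptr[0] = ptr[1] = ptr[2] = ptr[3] = 0; ptr += 4; }
//   if (cnt & 2) { ptr[0] = ptr[1] = 0; ptr += 2; }
//   if (cnt & 1) { ptr[0] = 0; }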
void MacroAssembler::zero_words(Register ptr, Register cnt)
{
  assert(is_power_of_2(zero_words_block_size), "adjust this");
  assert(ptr == r10 && cnt == r11, "mismatch in register usage");

  BLOCK_COMMENT("zero_words {");
  cmp(cnt, (u1)zero_words_block_size);
  Label around;
  br(LO, around);
  {
    RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks());
    assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
    if (StubRoutines::aarch64::complete()) {
      trampoline_call(zero_blocks);
    } else {
      bl(zero_blocks);
    }
  }
  bind(around);
  for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
    Label l;
    tbz(cnt, exact_log2(i), l);
    for (int j = 0; j < i; j += 2) {
      stp(zr, zr, post(ptr, 16));
    }
    bind(l);
  }
  {
    Label l;
    tbz(cnt, 0, l);
    str(zr, Address(ptr));
    bind(l);
  }
  BLOCK_COMMENT("} zero_words");
}

// base: Address of a buffer to be zeroed, 8 bytes aligned.
// cnt:  Immediate count in HeapWords.
#define SmallArraySize (18 * BytesPerLong)
void MacroAssembler::zero_words(Register base, u_int64_t cnt)
{
  BLOCK_COMMENT("zero_words {");
  int i = cnt & 1;  // store any odd word to start
  if (i) str(zr, Address(base));

  if (cnt <= SmallArraySize / BytesPerLong) {
    for (; i < (int)cnt; i += 2)
      stp(zr, zr, Address(base, i * wordSize));
  } else {
    const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
    int remainder = cnt % (2 * unroll);
    for (; i < remainder; i += 2)
      stp(zr, zr, Address(base, i * wordSize));

    Label loop;
    Register cnt_reg = rscratch1;
    Register loop_base = rscratch2;
    cnt = cnt - remainder;
    mov(cnt_reg, cnt);
    // adjust base and prebias by -2 * wordSize so we can pre-increment
    add(loop_base, base, (remainder - 2) * wordSize);
    bind(loop);
    sub(cnt_reg, cnt_reg, 2 * unroll);
    for (i = 1; i < unroll; i++)
      stp(zr, zr, Address(loop_base, 2 * i * wordSize));
    stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
    cbnz(cnt_reg, loop);
  }
  BLOCK_COMMENT("} zero_words");
}

// Zero blocks of memory by using DC ZVA.
//
// Aligns the base address first sufficiently for DC ZVA, then uses
// DC ZVA repeatedly for every full block. cnt is the size to be
// zeroed in HeapWords. Returns the count of words left to be zeroed
// in cnt.
//
// NOTE: This is intended to be used in the zero_blocks() stub. If
// you want to use it elsewhere, note that cnt must be >= 2*zva_length.
void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
  Register tmp = rscratch1;
  Register tmp2 = rscratch2;
  int zva_length = VM_Version::zva_length();
  Label initial_table_end, loop_zva;
  Label fini;

  // Base must be 16 byte aligned. If not, just return and let the caller
  // handle it.
  tst(base, 0x0f);
  br(Assembler::NE, fini);
  // Align base with ZVA length.
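  // The next two instructions compute, in effect (a C sketch, illustrative;
  // zva_length is a power of two):
  //
  //   tmp = (-base) & (zva_length - 1);  // bytes from base up to the next
  //                                      // ZVA-aligned address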
  neg(tmp, base);
  andr(tmp, tmp, zva_length - 1);

  // tmp: the number of bytes to be filled to align the base with ZVA length.
  add(base, base, tmp);
  sub(cnt, cnt, tmp, Assembler::ASR, 3);
  adr(tmp2, initial_table_end);
  sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
  br(tmp2);

  for (int i = -zva_length + 16; i < 0; i += 16)
    stp(zr, zr, Address(base, i));
  bind(initial_table_end);

  sub(cnt, cnt, zva_length >> 3);
  bind(loop_zva);
  dc(Assembler::ZVA, base);
  subs(cnt, cnt, zva_length >> 3);
  add(base, base, zva_length);
  br(Assembler::GE, loop_zva);
  add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
  bind(fini);
}

// base:  Address of a buffer to be filled, 8 bytes aligned.
// cnt:   Count in 8-byte units.
// value: Value to fill the buffer with.
// base will point to the end of the buffer after filling.
void MacroAssembler::fill_words(Register base, Register cnt, Register value)
{
//  Algorithm:
//
//    scratch1 = cnt & 7;
//    cnt -= scratch1;
//    p += scratch1;
//    switch (scratch1) {
//      do {
//        cnt -= 8;
//          p[-8] = v;
//        case 7:
//          p[-7] = v;
//        case 6:
//          p[-6] = v;
//          // ...
//        case 1:
//          p[-1] = v;
//        case 0:
//          p += 8;
//      } while (cnt);
//    }

  assert_different_registers(base, cnt, value, rscratch1, rscratch2);

  Label fini, skip, entry, loop;
  const int unroll = 8; // Number of stp instructions we'll unroll

  cbz(cnt, fini);
  tbz(base, 3, skip);
  str(value, Address(post(base, 8)));
  sub(cnt, cnt, 1);
  bind(skip);

  andr(rscratch1, cnt, (unroll-1) * 2);
  sub(cnt, cnt, rscratch1);
  add(base, base, rscratch1, Assembler::LSL, 3);
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
  br(rscratch2);

  bind(loop);
  add(base, base, unroll * 16);
  for (int i = -unroll; i < 0; i++)
    stp(value, value, Address(base, i * 16));
  bind(entry);
  subs(cnt, cnt, unroll * 2);
  br(Assembler::GE, loop);

  tbz(cnt, 0, fini);
  str(value, Address(post(base, 8)));
  bind(fini);
}

// Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
// java/lang/StringUTF16.compress.
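// A rough C-level sketch of the contract (illustrative only): copy characters
// from src to dst as bytes until either len characters are done or a
// character does not fit in ISO-8859-1; result is the number of characters
// actually encoded.
//
//   int i = 0;
//   for (; i < len; i++) {
//     jchar c = src[i];
//     if (c > 0xFF) break;      // not ISO-8859-1 encodable; stop here
//     dst[i] = (jbyte)c;
//   }
//   result = i;                 // == len iff everything was encoded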
void MacroAssembler::encode_iso_array(Register src, Register dst,
                                      Register len, Register result,
                                      FloatRegister Vtmp1, FloatRegister Vtmp2,
                                      FloatRegister Vtmp3, FloatRegister Vtmp4)
{
  Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
      NEXT_32_START, NEXT_32_PRFM_START;
  Register tmp1 = rscratch1, tmp2 = rscratch2;

  mov(result, len); // Save initial len

  cmp(len, (u1)8); // handle shortest strings first
  br(LT, LOOP_1);
  cmp(len, (u1)32);
  br(LT, NEXT_8);
  // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
  // to convert chars to bytes
  if (SoftwarePrefetchHintDistance >= 0) {
    ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
    subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
    br(LE, NEXT_32_START);
    b(NEXT_32_PRFM_START);
    BIND(NEXT_32_PRFM);
    ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
    BIND(NEXT_32_PRFM_START);
    prfm(Address(src, SoftwarePrefetchHintDistance));
    orr(v4, T16B, Vtmp1, Vtmp2);
    orr(v5, T16B, Vtmp3, Vtmp4);
    uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
    uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
    uzp2(v5, T16B, v4, v5); // high bytes
    umov(tmp2, v5, D, 1);
    fmovd(tmp1, v5);
    orr(tmp1, tmp1, tmp2);
    cbnz(tmp1, LOOP_8);
    stpq(Vtmp1, Vtmp3, dst);
    sub(len, len, 32);
    add(dst, dst, 32);
    add(src, src, 64);
    subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
    br(GE, NEXT_32_PRFM);
    cmp(len, (u1)32);
    br(LT, LOOP_8);
    BIND(NEXT_32);
    ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
    BIND(NEXT_32_START);
  } else {
    BIND(NEXT_32);
    ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
  }
  prfm(Address(src, SoftwarePrefetchHintDistance));
  uzp1(v4, T16B, Vtmp1, Vtmp2);
  uzp1(v5, T16B, Vtmp3, Vtmp4);
  orr(Vtmp1, T16B, Vtmp1, Vtmp2);
  orr(Vtmp3, T16B, Vtmp3, Vtmp4);
  uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
  umov(tmp2, Vtmp1, D, 1);
  fmovd(tmp1, Vtmp1);
  orr(tmp1, tmp1, tmp2);
  cbnz(tmp1, LOOP_8);
  stpq(v4, v5, dst);
  sub(len, len, 32);
  add(dst, dst, 32);
  add(src, src, 64);
  cmp(len, (u1)32);
  br(GE, NEXT_32);
  cbz(len, DONE);

  BIND(LOOP_8);
  cmp(len, (u1)8);
  br(LT, LOOP_1);
  BIND(NEXT_8);
  ld1(Vtmp1, T8H, src);
  uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
  uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
  fmovd(tmp1, Vtmp3);
  cbnz(tmp1, NEXT_1);
  strd(Vtmp2, dst);

  sub(len, len, 8);
  add(dst, dst, 8);
  add(src, src, 16);
  cmp(len, (u1)8);
  br(GE, NEXT_8);

  BIND(LOOP_1);

  cbz(len, DONE);
  BIND(NEXT_1);
  ldrh(tmp1, Address(post(src, 2)));
  tst(tmp1, 0xff00);
  br(NE, SET_RESULT);
  strb(tmp1, Address(post(dst, 1)));
  subs(len, len, 1);
  br(GT, NEXT_1);

  BIND(SET_RESULT);
  sub(result, result, len); // Return index where we stopped
                            // Return len == 0 if we processed all
                            // characters
  BIND(DONE);
}


// Inflate byte[] array to char[].
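// In effect (a C sketch, illustrative only): each byte is zero-extended to a
// 16-bit char. The SIMD path below does this 8 or 16 bytes at a time by
// interleaving with a zero vector (zip1 with vtmp1 == 0).
//
//   for (int i = 0; i < len; i++)
//     dst[i] = (jchar)(src[i] & 0xff);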
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
                                        FloatRegister vtmp1, FloatRegister vtmp2,
                                        FloatRegister vtmp3, Register tmp4) {
  Label big, done, after_init, to_stub;

  assert_different_registers(src, dst, len, tmp4, rscratch1);

  fmovd(vtmp1, zr);
  lsrw(tmp4, len, 3);
  bind(after_init);
  cbnzw(tmp4, big);
  // Short string: less than 8 bytes.
  {
    Label loop, tiny;

    cmpw(len, 4);
    br(LT, tiny);
    // Use SIMD to do 4 bytes.
    ldrs(vtmp2, post(src, 4));
    zip1(vtmp3, T8B, vtmp2, vtmp1);
    subw(len, len, 4);
    strd(vtmp3, post(dst, 8));

    cbzw(len, done);

    // Do the remaining bytes by steam.
    bind(loop);
    ldrb(tmp4, post(src, 1));
    strh(tmp4, post(dst, 2));
    subw(len, len, 1);

    bind(tiny);
    cbnz(len, loop);

    b(done);
  }

  if (SoftwarePrefetchHintDistance >= 0) {
    bind(to_stub);
    RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
    assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
    trampoline_call(stub);
    b(after_init);
  }

  // Unpack the bytes 8 at a time.
  bind(big);
  {
    Label loop, around, loop_last, loop_start;

    if (SoftwarePrefetchHintDistance >= 0) {
      const int large_loop_threshold = (64 + 16)/8;
      ldrd(vtmp2, post(src, 8));
      andw(len, len, 7);
      cmp(tmp4, (u1)large_loop_threshold);
      br(GE, to_stub);
      b(loop_start);

      bind(loop);
      ldrd(vtmp2, post(src, 8));
      bind(loop_start);
      subs(tmp4, tmp4, 1);
      br(EQ, loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      ldrd(vtmp3, post(src, 8));
      st1(vtmp2, T8H, post(dst, 16));
      subs(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp3, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      br(NE, loop);
      b(around);
      bind(loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      st1(vtmp2, T8H, post(dst, 16));
      bind(around);
      cbz(len, done);
    } else {
      andw(len, len, 7);
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      sub(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp2, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      cbnz(tmp4, loop);
    }
  }

  // Do the tail of up to 8 bytes.
  add(src, src, len);
  ldrd(vtmp3, Address(src, -8));
  add(dst, dst, len, ext::uxtw, 1);
  zip1(vtmp3, T16B, vtmp3, vtmp1);
  strq(vtmp3, Address(dst, -16));

  bind(done);
}

// Compress char[] array to byte[].
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
                                         FloatRegister tmp1Reg, FloatRegister tmp2Reg,
                                         FloatRegister tmp3Reg, FloatRegister tmp4Reg,
                                         Register result) {
  encode_iso_array(src, dst, len, result,
                   tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
  // encode_iso_array leaves the number of unprocessed characters in len;
  // return the original length if everything compressed, and 0 otherwise.
  cmp(len, zr);
  csel(result, result, zr, EQ);
}

// get_thread() can be called anywhere inside generated code, so we
// need to save whatever non-callee-saved context might get clobbered
// by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
// by the call setup code.
//
// aarch64_get_thread_helper() clobbers only r0, r1, and flags.
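// In effect (an illustrative sketch of the save/restore, not the emitted
// code):
//
//   RegSet saved = {r0, r1, lr} - dst;   // only what the helper may clobber
//   push(saved);
//   dst = JavaThread::aarch64_get_thread_helper();
//   pop(saved);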
void MacroAssembler::get_thread(Register dst) {
  RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
  push(saved_regs, sp);

  mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
  blr(lr);
  if (dst != c_rarg0) {
    mov(dst, c_rarg0);
  }

  pop(saved_regs, sp);
}

void MacroAssembler::cache_wb(Address line) {
  assert(line.getMode() == Address::base_plus_offset, "mode should be base_plus_offset");
  assert(line.index() == noreg, "index should be noreg");
  assert(line.offset() == 0, "offset should be 0");
  // We would like to assert this:
  //   assert(line._ext.shift == 0, "shift should be zero");
  if (VM_Version::supports_dcpop()) {
    // writeback using clear virtual address to point of persistence
    dc(Assembler::CVAP, line.base());
  } else {
    // no need to generate anything as Unsafe.writebackMemory should
    // never invoke this stub
  }
}

void MacroAssembler::cache_wbsync(bool is_pre) {
  // We only need a barrier post-sync.
  if (!is_pre) {
    membar(Assembler::AnyAny);
  }
}