1 /* 2 * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "jvm.h"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/cardTableBarrierSet.hpp"
#include "interpreter/interpreter.hpp"
#include "compiler/disassembler.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/accessDecorators.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/jniHandles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/thread.hpp"
#ifdef COMPILER1
#include "c1/c1_LIRAssembler.hpp"
#endif
#ifdef COMPILER2
#include "oops/oop.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/node.hpp"
#endif

// Helper macros used by the code generators in this file.
//
//   BLOCK_COMMENT(str) - forwards to block_comment(str); expands to nothing
//                        in PRODUCT builds.
//   STOP(error)        - forwards to stop(error); in non-PRODUCT builds it
//                        first passes the same text to block_comment().
//   BIND(label)        - binds the label and then applies BLOCK_COMMENT to
//                        the label's name followed by ':'.
//
// Note: the non-PRODUCT STOP and BIND each expand to two statements that are
// not wrapped in a block, so they must not be used as the sole body of an
// unbraced if/else/loop.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  // Number of 4-byte instructions rewritten; multiplied by the instruction
  // size to form the return value.
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  // Displacement from the patched instruction to the target, in 4-byte words.
  // This is the form used by the branch and load-literal cases below.
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  // Dispatch on the encoding of the instruction found at 'branch'.
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    // Here the displacement is in bytes (or, when bit 31 is set, in 4K pages
    // as recomputed below) rather than in words.
    offset = target-branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      u_int64_t dest = (u_int64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      // Low 12 bits of the target: the offset within its 4K page.
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
      //
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        // The immediate field holds the in-page offset divided by the access
        // size (bits 31:30 of the load/store give log2 of that size).
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                    21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                   Instruction_aarch64::extract(insn, 4, 0) ==
                     Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        // The movk supplies bits 47:32 of the target, so the adrp only has to
        // reach a substitute destination made of the target's low 32 bits and
        // bits 47:32 of the adrp's own address. Note that dest and pc_page
        // declared here shadow the outer variables of the same name.
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    // The displacement is split across two fields of the instruction:
    // its low two bits go to bits 30:29, the rest to bits 23:5.
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant
    // Three consecutive instructions each receive 16 bits of the target,
    // lowest bits first. dest is shifted in place between the patches.
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}

// Patch the constant-load sequence at insn_addr so that it materializes the
// oop 'o'. Two shapes are recognized, selected by bits 31:21 of the first
// instruction: a two-instruction narrow form, which is patched with the
// result of CompressedOops::encode(), and a three-instruction wide form,
// which is patched with 48 bits of the raw address.
// Returns the length in bytes of the patched sequence.
int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = CompressedOops::encode((oop)o);
    // First instruction takes the high half, second the low half.
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    // 16 bits per instruction, lowest first; dest is shifted in place.
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}

// Patch the two-instruction narrow constant load at insn_addr so that it
// materializes the narrow klass value n (high 16 bits in the first
// instruction, low 16 bits in the second).
// Returns the length in bytes of the patched sequence (two instructions).
int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

// Decode the address referred to by the instruction (or instruction
// sequence) at insn_addr, whose first instruction word is 'insn'. The
// encodings recognized mirror those handled by pd_patch_instruction_size().
// Returns 0 for the register-offset-free load that
// pd_patch_instruction_size() treats as the polling page load.
address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  // Word displacement for the branch forms; converted to bytes at the final
  // return statement.
  long offset = 0;
  // NOTE(review): this mask (0b011011) ignores one more bit than the
  // corresponding test in pd_patch_instruction_size() (0b111011) - confirm
  // whether the difference is intentional.
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    // Reassemble the displacement from its two fields: bits 30:29 hold the
    // low two bits, bits 23:5 the (signed) remainder.
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      // Drop the in-page bits contributed by insn_addr.
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases  we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        // The immediate is scaled by the access size (bits 31:30).
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110  &&
               Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
          // movk #imm16<<32: bits 47:32 of the result come from the movk
          // immediate, the low 32 bits from the adrp.
          target_page = (target_page & 0xffffffff) |
                         ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    u_int32_t *insns = (u_int32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    // Concatenate the three 16-bit immediates, lowest first.
    return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    return 0;
  } else {
    ShouldNotReachHere();
  }
  // Branch forms: convert the word displacement to a byte address.
  return address(((uint64_t)insn_addr + (offset << 2)));
}

// Emit a safepoint poll that branches to slow_path when a safepoint is
// pending. With thread-local polling the thread's polling word is loaded and
// its poll bit tested; otherwise the global SafepointSynchronize state word
// is loaded and compared against zero. Uses rscratch1.
void MacroAssembler::safepoint_poll(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    unsigned long offset;
    adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
    ldrw(rscratch1, Address(rscratch1, offset));
    // cbnz below relies on "not synchronized" being encoded as zero.
    assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
    cbnz(rscratch1, slow_path);
  }
}

// Just like safepoint_poll, but use an acquiring load for thread-
// local polling.
//
// We need an acquire here to ensure that any subsequent load of the
// global SafepointSynchronize::_state flag is ordered after this load
// of the local Thread::_polling page.  We don't want this poll to
// return false (i.e. not safepointing) and a later poll of the global
// SafepointSynchronize::_state spuriously to return true.
//
// This is to avoid a race when we're in a native->Java transition
// racing the code which wakes up from a safepoint.
315 // 316 void MacroAssembler::safepoint_poll_acquire(Label& slow_path) { 317 if (SafepointMechanism::uses_thread_local_poll()) { 318 lea(rscratch1, Address(rthread, Thread::polling_page_offset())); 319 ldar(rscratch1, rscratch1); 320 tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path); 321 } else { 322 safepoint_poll(slow_path); 323 } 324 } 325 326 void MacroAssembler::reset_last_Java_frame(bool clear_fp) { 327 // we must set sp to zero to clear frame 328 str(zr, Address(rthread, JavaThread::last_Java_sp_offset())); 329 330 // must clear fp, so that compiled frames are not confused; it is 331 // possible that we need it only for debugging 332 if (clear_fp) { 333 str(zr, Address(rthread, JavaThread::last_Java_fp_offset())); 334 } 335 336 // Always clear the pc because it could have been set by make_walkable() 337 str(zr, Address(rthread, JavaThread::last_Java_pc_offset())); 338 } 339 340 // Calls to C land 341 // 342 // When entering C land, the rfp, & resp of the last Java frame have to be recorded 343 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp 344 // has to be reset to 0. This is required to allow proper stack traversal. 
// Register-pc variant. Stores, in this order:
//   - last_java_pc into the thread's frame anchor, if that register is valid;
//   - the last Java sp: the machine sp is first copied into 'scratch', and
//     esp is used when no valid register was supplied;
//   - last_java_fp, if that register is valid.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
      str(last_java_pc, Address(rthread,
                                JavaThread::frame_anchor_offset()
                                + JavaFrameAnchor::last_Java_pc_offset()));
    }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

// Address-pc variant: materializes last_java_pc in 'scratch' with adr,
// stores it into the frame anchor, then delegates to the register variant
// with noreg as the pc. 'scratch' may be reused there to hold sp; that
// happens after the pc has already been stored.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  assert(last_java_pc != NULL, "must provide a valid PC");

  adr(scratch, last_java_pc);
  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

// Label-pc variant: a bound label is resolved to its address right away.
// For an unbound label a patch site is registered at the current location
// and the current pc() is passed as a placeholder.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, scratch);
  }
}

// Emit a call to 'entry', which must lie inside the code cache.
// When far_branches() is true the target is formed in 'tmp' with adrp + add
// and called with blr; otherwise a direct bl is emitted. If cbuf is
// non-null, its instruction mark is set immediately before the call
// instruction itself.
void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    blr(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}

// Emit a jump to 'entry', which must lie inside the code cache.
// Same structure as far_call(), but emits br / b, i.e. without linking.
// NOTE(review): the second assert message says "far call" although this is
// the jump variant.
void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    br(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}

// Emit a check of sp against the thread's reserved-stack activation value.
// If sp is below it (unsigned compare) nothing further happens. Otherwise
// SharedRuntime::enable_stack_reserved_zone is called with the thread as its
// argument, and control is then transferred to the
// throw_delayed_StackOverflowError stub. Uses rscratch1 and c_rarg0.
void MacroAssembler::reserved_stack_check() {
    // testing if reserved zone needs to be enabled
    Label no_reserved_zone_enabling;

    ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
    cmp(sp, rscratch1);
    br(Assembler::LO, no_reserved_zone_enabling);

    enter();   // LR and FP are live.
    lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
    mov(c_rarg0, rthread);
    blr(rscratch1);
    leave();

    // We have already removed our own frame.
    // throw_delayed_StackOverflowError will think that it's been
    // called by our caller.
    lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
    br(rscratch1);
    should_not_reach_here();

    bind(no_reserved_zone_enabling);
}

// Emit the biased-locking fast path for locking the object in obj_reg.
//
//   lock_reg, obj_reg, swap_reg, tmp_reg - must all be distinct from each
//       other and from rscratch1 / rscratch2, which are used as temporaries.
//   swap_reg_contains_mark - when false, the mark word is loaded from the
//       object into swap_reg here.
//   done      - branched to when the lock was obtained through biasing.
//   slow_case - passed to cmpxchg_obj_header() as the failure target for the
//       bias-acquire and rebias attempts.
//   counters  - optional statistics; defaults to BiasedLocking::counters()
//       when PrintBiasedLockingStatistics is set and NULL was passed.
//
// Control falls out of the emitted code at the end (cas_label) when the
// object is not biased, or after a revoke attempt; the caller is expected to
// continue with its CAS-based locking there.
//
// Returns the code offset of the mark-word load emitted here, or -1 if
// swap_reg_contains_mark was true and no load was emitted.
int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  // NOTE(review): klass_addr and saved_mark_addr are not referenced again
  // in this function.
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(tmp_reg, (u1)markOopDesc::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  // tmp_reg = (prototype header | current thread) ^ mark word, with the age
  // bits masked out; zero means "already biased to us in the current epoch".
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1, rscratch2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

// Emit the biased-locking part of an unlock: load the object's mark word
// into temp_reg and branch to 'done' if it still carries the biased-lock
// pattern. Otherwise control falls through to whatever the caller emits
// next.
void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(temp_reg, (u1)markOopDesc::biased_lock_pattern);
  br(Assembler::EQ, done);
}

// pass_arg0..pass_arg3: move 'arg' into the corresponding C argument
// register (c_rarg0..c_rarg3), emitting nothing if it is already there.
static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg ) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg ) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg ) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg ) {
    masm->mov(c_rarg3, arg);
  }
}

// Common implementation behind the call_VM overloads.
//
//   oop_result   - if valid, receives the VM result via get_vm_result().
//   java_thread  - thread register; defaults to rthread and is asserted to
//                  be rthread.
//   last_java_sp - sp to record for the last Java frame; defaults to esp and
//                  must not be rfp.
//   entry_point  - runtime entry to call.
//   number_of_arguments - passed through to call_VM_leaf_base(). The thread
//                  itself is moved into c_rarg0 here; callers load their own
//                  arguments (c_rarg1 and up) before calling.
//   check_exceptions - when true, a pending exception after the call is
//                  forwarded to StubRoutines::forward_exception_entry().
//
// Uses rscratch1.
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
   // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  // 'l' is not bound yet, so the label overload records a patch site for
  // the pc; the label is handed to call_VM_leaf_base() below.
  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

   // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

// Forward to call_VM_base() with the default thread and last-sp registers
// (noreg for both, which call_VM_base() replaces with rthread and esp).
void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline.  If the code cache is small
// trampolines won't be emitted.
//
// Returns NULL if a trampoline stub was needed but could not be emitted;
// otherwise returns the pc following the emitted bl (any non-NULL value
// signals success).

address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  // We need a trampoline if branches are far.
  if (far_branches()) {
    bool in_scratch_emit_size = false;
#ifdef COMPILER2
    // We don't want to emit a trampoline if C2 is generating dummy
    // code during its branch shortening phase.
    CompileTask* task = ciEnv::current()->task();
    in_scratch_emit_size =
      (task != NULL && is_c2_compile(task->comp_level()) &&
       Compile::current()->in_scratch_emit_size());
#endif
    if (!in_scratch_emit_size) {
      address stub = emit_trampoline_stub(offset(), entry.target());
      if (stub == NULL) {
        return NULL; // CodeCache is full
      }
    }
  }

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
  if (!far_branches()) {
    bl(entry.target());
  } else {
    // Branch-to-self placeholder.
    // NOTE(review): presumably retargeted later (to the destination or to
    // the trampoline stub) - confirm against the call patching code.
    bl(pc());
  }
  // just need to return a non-null address
  return pc();
}


// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)
//
// insts_call_instruction_offset is the offset of the call instruction in the
// instructions section; dest is the call target stored in the stub.
// Returns the start address of the stub, or NULL if no stub space could be
// obtained.

address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  // Max stub size: alignment nop, TrampolineStub.
  address stub = start_a_stub(NativeInstruction::instruction_size
                   + NativeCallTrampolineStub::instruction_size);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call
  // - call
  // The 64-bit destination is emitted as data directly after the two
  // instructions and is what the ldr (literal) reads.
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}

// Normalize a C-style boolean held in x to 0 or 1, in place.
void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
  tst(x, 0xff);
  cset(x, Assembler::NE);
}

// Emit an inline-cache call to 'entry': loads Universe::non_oop_word() into
// rscratch2 and then emits a trampoline_call() carrying a virtual-call
// relocation created for the current pc and method_index.
// Returns what trampoline_call() returns (NULL on failure).
address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions
//
// Each overload moves its register arguments into c_rarg1..c_rarg3 and then
// forwards to call_VM_helper() / call_VM_base(). Arguments are moved
// highest-numbered first; the asserts reject register assignments in which
// an earlier move would overwrite an argument that has not been moved yet.

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

// The overloads below additionally take an explicit last_java_sp and go to
// call_VM_base() with rthread as the thread register.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result,
last_java_sp, entry_point, 3, check_exceptions); 921 } 922 923 924 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) { 925 ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset())); 926 str(zr, Address(java_thread, JavaThread::vm_result_offset())); 927 verify_oop(oop_result, "broken oop in call_VM_base"); 928 } 929 930 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) { 931 ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset())); 932 str(zr, Address(java_thread, JavaThread::vm_result_2_offset())); 933 } 934 935 void MacroAssembler::align(int modulus) { 936 while (offset() % modulus != 0) nop(); 937 } 938 939 // these are no-ops overridden by InterpreterMacroAssembler 940 941 void MacroAssembler::check_and_handle_earlyret(Register java_thread) { } 942 943 void MacroAssembler::check_and_handle_popframe(Register java_thread) { } 944 945 946 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr, 947 Register tmp, 948 int offset) { 949 intptr_t value = *delayed_value_addr; 950 if (value != 0) 951 return RegisterOrConstant(value + offset); 952 953 // load indirectly to solve generation ordering problem 954 ldr(tmp, ExternalAddress((address) delayed_value_addr)); 955 956 if (offset != 0) 957 add(tmp, tmp, offset); 958 959 return RegisterOrConstant(tmp); 960 } 961 962 963 void MacroAssembler:: notify(int type) { 964 if (type == bytecode_start) { 965 // set_last_Java_frame(esp, rfp, (address)NULL); 966 Assembler:: notify(type); 967 // reset_last_Java_frame(true); 968 } 969 else 970 Assembler:: notify(type); 971 } 972 973 // Look up the method for a megamorphic invokeinterface call. 974 // The target method is determined by <intf_klass, itable_index>. 975 // The receiver klass is in recv_klass. 976 // On success, the result will be in method_result, and execution falls through. 977 // On failure, execution transfers to the given label. 
978 void MacroAssembler::lookup_interface_method(Register recv_klass, 979 Register intf_klass, 980 RegisterOrConstant itable_index, 981 Register method_result, 982 Register scan_temp, 983 Label& L_no_such_interface, 984 bool return_method) { 985 assert_different_registers(recv_klass, intf_klass, scan_temp); 986 assert_different_registers(method_result, intf_klass, scan_temp); 987 assert(recv_klass != method_result || !return_method, 988 "recv_klass can be destroyed when method isn't needed"); 989 assert(itable_index.is_constant() || itable_index.as_register() == method_result, 990 "caller must use same register for non-constant itable index as for method"); 991 992 // Compute start of first itableOffsetEntry (which is at the end of the vtable) 993 int vtable_base = in_bytes(Klass::vtable_start_offset()); 994 int itentry_off = itableMethodEntry::method_offset_in_bytes(); 995 int scan_step = itableOffsetEntry::size() * wordSize; 996 int vte_size = vtableEntry::size_in_bytes(); 997 assert(vte_size == wordSize, "else adjust times_vte_scale"); 998 999 ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset())); 1000 1001 // %%% Could store the aligned, prescaled offset in the klassoop. 1002 // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base)); 1003 lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3))); 1004 add(scan_temp, scan_temp, vtable_base); 1005 1006 if (return_method) { 1007 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 
1008 assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 1009 // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off)); 1010 lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3))); 1011 if (itentry_off) 1012 add(recv_klass, recv_klass, itentry_off); 1013 } 1014 1015 // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) { 1016 // if (scan->interface() == intf) { 1017 // result = (klass + scan->offset() + itable_index); 1018 // } 1019 // } 1020 Label search, found_method; 1021 1022 for (int peel = 1; peel >= 0; peel--) { 1023 ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes())); 1024 cmp(intf_klass, method_result); 1025 1026 if (peel) { 1027 br(Assembler::EQ, found_method); 1028 } else { 1029 br(Assembler::NE, search); 1030 // (invert the test to fall through to found_method...) 1031 } 1032 1033 if (!peel) break; 1034 1035 bind(search); 1036 1037 // Check that the previous entry is non-null. A null entry means that 1038 // the receiver class doesn't implement the interface, and wasn't the 1039 // same as when the caller was compiled. 1040 cbz(method_result, L_no_such_interface); 1041 add(scan_temp, scan_temp, scan_step); 1042 } 1043 1044 bind(found_method); 1045 1046 // Got a hit. 
1047 if (return_method) { 1048 ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes())); 1049 ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0))); 1050 } 1051 } 1052 1053 // virtual method calling 1054 void MacroAssembler::lookup_virtual_method(Register recv_klass, 1055 RegisterOrConstant vtable_index, 1056 Register method_result) { 1057 const int base = in_bytes(Klass::vtable_start_offset()); 1058 assert(vtableEntry::size() * wordSize == 8, 1059 "adjust the scaling in the code below"); 1060 int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes(); 1061 1062 if (vtable_index.is_register()) { 1063 lea(method_result, Address(recv_klass, 1064 vtable_index.as_register(), 1065 Address::lsl(LogBytesPerWord))); 1066 ldr(method_result, Address(method_result, vtable_offset_in_bytes)); 1067 } else { 1068 vtable_offset_in_bytes += vtable_index.as_constant() * wordSize; 1069 ldr(method_result, 1070 form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0)); 1071 } 1072 } 1073 1074 void MacroAssembler::check_klass_subtype(Register sub_klass, 1075 Register super_klass, 1076 Register temp_reg, 1077 Label& L_success) { 1078 Label L_failure; 1079 check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL); 1080 check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL); 1081 bind(L_failure); 1082 } 1083 1084 1085 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 1086 Register super_klass, 1087 Register temp_reg, 1088 Label* L_success, 1089 Label* L_failure, 1090 Label* L_slow_path, 1091 RegisterOrConstant super_check_offset) { 1092 assert_different_registers(sub_klass, super_klass, temp_reg); 1093 bool must_load_sco = (super_check_offset.constant_or_zero() == -1); 1094 if (super_check_offset.is_register()) { 1095 assert_different_registers(sub_klass, super_klass, 1096 super_check_offset.as_register()); 1097 } else if (must_load_sco) { 
1098 assert(temp_reg != noreg, "supply either a temp or a register offset"); 1099 } 1100 1101 Label L_fallthrough; 1102 int label_nulls = 0; 1103 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 1104 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 1105 if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; } 1106 assert(label_nulls <= 1, "at most one NULL in the batch"); 1107 1108 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1109 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1110 Address super_check_offset_addr(super_klass, sco_offset); 1111 1112 // Hacked jmp, which may only be used just before L_fallthrough. 1113 #define final_jmp(label) \ 1114 if (&(label) == &L_fallthrough) { /*do nothing*/ } \ 1115 else b(label) /*omit semi*/ 1116 1117 // If the pointers are equal, we are done (e.g., String[] elements). 1118 // This self-check enables sharing of secondary supertype arrays among 1119 // non-primary types such as array-of-interface. Otherwise, each such 1120 // type would need its own customized SSA. 1121 // We move this check to the front of the fast path because many 1122 // type checks are in fact trivially successful in this manner, 1123 // so we get a nicely predicted branch right at the start of the check. 1124 cmp(sub_klass, super_klass); 1125 br(Assembler::EQ, *L_success); 1126 1127 // Check the supertype display: 1128 if (must_load_sco) { 1129 ldrw(temp_reg, super_check_offset_addr); 1130 super_check_offset = RegisterOrConstant(temp_reg); 1131 } 1132 Address super_check_addr(sub_klass, super_check_offset); 1133 ldr(rscratch1, super_check_addr); 1134 cmp(super_klass, rscratch1); // load displayed supertype 1135 1136 // This check has worked decisively for primary supers. 1137 // Secondary supers are sought in the super_cache ('super_cache_addr'). 1138 // (Secondary supers are interfaces and very deeply nested subtypes.) 
1139 // This works in the same check above because of a tricky aliasing 1140 // between the super_cache and the primary super display elements. 1141 // (The 'super_check_addr' can address either, as the case requires.) 1142 // Note that the cache is updated below if it does not help us find 1143 // what we need immediately. 1144 // So if it was a primary super, we can just fail immediately. 1145 // Otherwise, it's the slow path for us (no success at this point). 1146 1147 if (super_check_offset.is_register()) { 1148 br(Assembler::EQ, *L_success); 1149 subs(zr, super_check_offset.as_register(), sc_offset); 1150 if (L_failure == &L_fallthrough) { 1151 br(Assembler::EQ, *L_slow_path); 1152 } else { 1153 br(Assembler::NE, *L_failure); 1154 final_jmp(*L_slow_path); 1155 } 1156 } else if (super_check_offset.as_constant() == sc_offset) { 1157 // Need a slow path; fast failure is impossible. 1158 if (L_slow_path == &L_fallthrough) { 1159 br(Assembler::EQ, *L_success); 1160 } else { 1161 br(Assembler::NE, *L_slow_path); 1162 final_jmp(*L_success); 1163 } 1164 } else { 1165 // No slow path; it's a fast decision. 
1166 if (L_failure == &L_fallthrough) { 1167 br(Assembler::EQ, *L_success); 1168 } else { 1169 br(Assembler::NE, *L_failure); 1170 final_jmp(*L_success); 1171 } 1172 } 1173 1174 bind(L_fallthrough); 1175 1176 #undef final_jmp 1177 } 1178 1179 // These two are taken from x86, but they look generally useful 1180 1181 // scans count pointer sized words at [addr] for occurence of value, 1182 // generic 1183 void MacroAssembler::repne_scan(Register addr, Register value, Register count, 1184 Register scratch) { 1185 Label Lloop, Lexit; 1186 cbz(count, Lexit); 1187 bind(Lloop); 1188 ldr(scratch, post(addr, wordSize)); 1189 cmp(value, scratch); 1190 br(EQ, Lexit); 1191 sub(count, count, 1); 1192 cbnz(count, Lloop); 1193 bind(Lexit); 1194 } 1195 1196 // scans count 4 byte words at [addr] for occurence of value, 1197 // generic 1198 void MacroAssembler::repne_scanw(Register addr, Register value, Register count, 1199 Register scratch) { 1200 Label Lloop, Lexit; 1201 cbz(count, Lexit); 1202 bind(Lloop); 1203 ldrw(scratch, post(addr, wordSize)); 1204 cmpw(value, scratch); 1205 br(EQ, Lexit); 1206 sub(count, count, 1); 1207 cbnz(count, Lloop); 1208 bind(Lexit); 1209 } 1210 1211 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 1212 Register super_klass, 1213 Register temp_reg, 1214 Register temp2_reg, 1215 Label* L_success, 1216 Label* L_failure, 1217 bool set_cond_codes) { 1218 assert_different_registers(sub_klass, super_klass, temp_reg); 1219 if (temp2_reg != noreg) 1220 assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1); 1221 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg) 1222 1223 Label L_fallthrough; 1224 int label_nulls = 0; 1225 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 1226 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 1227 assert(label_nulls <= 1, "at most one NULL in the batch"); 1228 1229 // a couple of useful fields in sub_klass: 1230 int 
ss_offset = in_bytes(Klass::secondary_supers_offset()); 1231 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1232 Address secondary_supers_addr(sub_klass, ss_offset); 1233 Address super_cache_addr( sub_klass, sc_offset); 1234 1235 BLOCK_COMMENT("check_klass_subtype_slow_path"); 1236 1237 // Do a linear scan of the secondary super-klass chain. 1238 // This code is rarely used, so simplicity is a virtue here. 1239 // The repne_scan instruction uses fixed registers, which we must spill. 1240 // Don't worry too much about pre-existing connections with the input regs. 1241 1242 assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super) 1243 assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter) 1244 1245 RegSet pushed_registers; 1246 if (!IS_A_TEMP(r2)) pushed_registers += r2; 1247 if (!IS_A_TEMP(r5)) pushed_registers += r5; 1248 1249 if (super_klass != r0 || UseCompressedOops) { 1250 if (!IS_A_TEMP(r0)) pushed_registers += r0; 1251 } 1252 1253 push(pushed_registers, sp); 1254 1255 // Get super_klass value into r0 (even if it was in r5 or r2). 1256 if (super_klass != r0) { 1257 mov(r0, super_klass); 1258 } 1259 1260 #ifndef PRODUCT 1261 mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr); 1262 Address pst_counter_addr(rscratch2); 1263 ldr(rscratch1, pst_counter_addr); 1264 add(rscratch1, rscratch1, 1); 1265 str(rscratch1, pst_counter_addr); 1266 #endif //PRODUCT 1267 1268 // We will consult the secondary-super array. 1269 ldr(r5, secondary_supers_addr); 1270 // Load the array length. 1271 ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes())); 1272 // Skip to start of data. 1273 add(r5, r5, Array<Klass*>::base_offset_in_bytes()); 1274 1275 cmp(sp, zr); // Clear Z flag; SP is never zero 1276 // Scan R2 words at [R5] for an occurrence of R0. 1277 // Set NZ/Z based on last compare. 1278 repne_scan(r5, r0, r2, rscratch1); 1279 1280 // Unspill the temp. 
registers: 1281 pop(pushed_registers, sp); 1282 1283 br(Assembler::NE, *L_failure); 1284 1285 // Success. Cache the super we found and proceed in triumph. 1286 str(super_klass, super_cache_addr); 1287 1288 if (L_success != &L_fallthrough) { 1289 b(*L_success); 1290 } 1291 1292 #undef IS_A_TEMP 1293 1294 bind(L_fallthrough); 1295 } 1296 1297 1298 void MacroAssembler::verify_oop(Register reg, const char* s) { 1299 if (!VerifyOops) return; 1300 1301 // Pass register number to verify_oop_subroutine 1302 const char* b = NULL; 1303 { 1304 ResourceMark rm; 1305 stringStream ss; 1306 ss.print("verify_oop: %s: %s", reg->name(), s); 1307 b = code_string(ss.as_string()); 1308 } 1309 BLOCK_COMMENT("verify_oop {"); 1310 1311 stp(r0, rscratch1, Address(pre(sp, -2 * wordSize))); 1312 stp(rscratch2, lr, Address(pre(sp, -2 * wordSize))); 1313 1314 mov(r0, reg); 1315 mov(rscratch1, (address)b); 1316 1317 // call indirectly to solve generation ordering problem 1318 lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address())); 1319 ldr(rscratch2, Address(rscratch2)); 1320 blr(rscratch2); 1321 1322 ldp(rscratch2, lr, Address(post(sp, 2 * wordSize))); 1323 ldp(r0, rscratch1, Address(post(sp, 2 * wordSize))); 1324 1325 BLOCK_COMMENT("} verify_oop"); 1326 } 1327 1328 void MacroAssembler::verify_oop_addr(Address addr, const char* s) { 1329 if (!VerifyOops) return; 1330 1331 const char* b = NULL; 1332 { 1333 ResourceMark rm; 1334 stringStream ss; 1335 ss.print("verify_oop_addr: %s", s); 1336 b = code_string(ss.as_string()); 1337 } 1338 BLOCK_COMMENT("verify_oop_addr {"); 1339 1340 stp(r0, rscratch1, Address(pre(sp, -2 * wordSize))); 1341 stp(rscratch2, lr, Address(pre(sp, -2 * wordSize))); 1342 1343 // addr may contain sp so we will have to adjust it based on the 1344 // pushes that we just did. 
1345 if (addr.uses(sp)) { 1346 lea(r0, addr); 1347 ldr(r0, Address(r0, 4 * wordSize)); 1348 } else { 1349 ldr(r0, addr); 1350 } 1351 mov(rscratch1, (address)b); 1352 1353 // call indirectly to solve generation ordering problem 1354 lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address())); 1355 ldr(rscratch2, Address(rscratch2)); 1356 blr(rscratch2); 1357 1358 ldp(rscratch2, lr, Address(post(sp, 2 * wordSize))); 1359 ldp(r0, rscratch1, Address(post(sp, 2 * wordSize))); 1360 1361 BLOCK_COMMENT("} verify_oop_addr"); 1362 } 1363 1364 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot, 1365 int extra_slot_offset) { 1366 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 1367 int stackElementSize = Interpreter::stackElementSize; 1368 int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0); 1369 #ifdef ASSERT 1370 int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1); 1371 assert(offset1 - offset == stackElementSize, "correct arithmetic"); 1372 #endif 1373 if (arg_slot.is_constant()) { 1374 return Address(esp, arg_slot.as_constant() * stackElementSize 1375 + offset); 1376 } else { 1377 add(rscratch1, esp, arg_slot.as_register(), 1378 ext::uxtx, exact_log2(stackElementSize)); 1379 return Address(rscratch1, offset); 1380 } 1381 } 1382 1383 void MacroAssembler::call_VM_leaf_base(address entry_point, 1384 int number_of_arguments, 1385 Label *retaddr) { 1386 call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr); 1387 } 1388 1389 void MacroAssembler::call_VM_leaf_base1(address entry_point, 1390 int number_of_gp_arguments, 1391 int number_of_fp_arguments, 1392 ret_type type, 1393 Label *retaddr) { 1394 Label E, L; 1395 1396 stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize))); 1397 1398 // We add 1 to number_of_arguments because the thread in arg0 is 1399 // not counted 1400 mov(rscratch1, entry_point); 1401 blrt(rscratch1, number_of_gp_arguments + 1, 
number_of_fp_arguments, type); 1402 if (retaddr) 1403 bind(*retaddr); 1404 1405 ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize))); 1406 maybe_isb(); 1407 } 1408 1409 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) { 1410 call_VM_leaf_base(entry_point, number_of_arguments); 1411 } 1412 1413 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) { 1414 pass_arg0(this, arg_0); 1415 call_VM_leaf_base(entry_point, 1); 1416 } 1417 1418 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) { 1419 pass_arg0(this, arg_0); 1420 pass_arg1(this, arg_1); 1421 call_VM_leaf_base(entry_point, 2); 1422 } 1423 1424 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, 1425 Register arg_1, Register arg_2) { 1426 pass_arg0(this, arg_0); 1427 pass_arg1(this, arg_1); 1428 pass_arg2(this, arg_2); 1429 call_VM_leaf_base(entry_point, 3); 1430 } 1431 1432 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) { 1433 pass_arg0(this, arg_0); 1434 MacroAssembler::call_VM_leaf_base(entry_point, 1); 1435 } 1436 1437 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) { 1438 1439 assert(arg_0 != c_rarg1, "smashed arg"); 1440 pass_arg1(this, arg_1); 1441 pass_arg0(this, arg_0); 1442 MacroAssembler::call_VM_leaf_base(entry_point, 2); 1443 } 1444 1445 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) { 1446 assert(arg_0 != c_rarg2, "smashed arg"); 1447 assert(arg_1 != c_rarg2, "smashed arg"); 1448 pass_arg2(this, arg_2); 1449 assert(arg_0 != c_rarg1, "smashed arg"); 1450 pass_arg1(this, arg_1); 1451 pass_arg0(this, arg_0); 1452 MacroAssembler::call_VM_leaf_base(entry_point, 3); 1453 } 1454 1455 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) { 1456 assert(arg_0 != c_rarg3, "smashed arg"); 1457 
assert(arg_1 != c_rarg3, "smashed arg"); 1458 assert(arg_2 != c_rarg3, "smashed arg"); 1459 pass_arg3(this, arg_3); 1460 assert(arg_0 != c_rarg2, "smashed arg"); 1461 assert(arg_1 != c_rarg2, "smashed arg"); 1462 pass_arg2(this, arg_2); 1463 assert(arg_0 != c_rarg1, "smashed arg"); 1464 pass_arg1(this, arg_1); 1465 pass_arg0(this, arg_0); 1466 MacroAssembler::call_VM_leaf_base(entry_point, 4); 1467 } 1468 1469 void MacroAssembler::null_check(Register reg, int offset) { 1470 if (needs_explicit_null_check(offset)) { 1471 // provoke OS NULL exception if reg = NULL by 1472 // accessing M[reg] w/o changing any registers 1473 // NOTE: this is plenty to provoke a segv 1474 ldr(zr, Address(reg)); 1475 } else { 1476 // nothing to do, (later) access of M[reg + offset] 1477 // will provoke OS NULL exception if reg = NULL 1478 } 1479 } 1480 1481 // MacroAssembler protected routines needed to implement 1482 // public methods 1483 1484 void MacroAssembler::mov(Register r, Address dest) { 1485 code_section()->relocate(pc(), dest.rspec()); 1486 u_int64_t imm64 = (u_int64_t)dest.target(); 1487 movptr(r, imm64); 1488 } 1489 1490 // Move a constant pointer into r. In AArch64 mode the virtual 1491 // address space is 48 bits in size, so we only need three 1492 // instructions to create a patchable instruction sequence that can 1493 // reach anywhere. 1494 void MacroAssembler::movptr(Register r, uintptr_t imm64) { 1495 #ifndef PRODUCT 1496 { 1497 char buffer[64]; 1498 snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64); 1499 block_comment(buffer); 1500 } 1501 #endif 1502 assert(imm64 < (1ul << 48), "48-bit overflow in address constant"); 1503 movz(r, imm64 & 0xffff); 1504 imm64 >>= 16; 1505 movk(r, imm64 & 0xffff, 16); 1506 imm64 >>= 16; 1507 movk(r, imm64 & 0xffff, 32); 1508 } 1509 1510 // Macro to mov replicated immediate to vector register. 
1511 // Vd will get the following values for different arrangements in T 1512 // imm32 == hex 000000gh T8B: Vd = ghghghghghghghgh 1513 // imm32 == hex 000000gh T16B: Vd = ghghghghghghghghghghghghghghghgh 1514 // imm32 == hex 0000efgh T4H: Vd = efghefghefghefgh 1515 // imm32 == hex 0000efgh T8H: Vd = efghefghefghefghefghefghefghefgh 1516 // imm32 == hex abcdefgh T2S: Vd = abcdefghabcdefgh 1517 // imm32 == hex abcdefgh T4S: Vd = abcdefghabcdefghabcdefghabcdefgh 1518 // T1D/T2D: invalid 1519 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) { 1520 assert(T != T1D && T != T2D, "invalid arrangement"); 1521 if (T == T8B || T == T16B) { 1522 assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)"); 1523 movi(Vd, T, imm32 & 0xff, 0); 1524 return; 1525 } 1526 u_int32_t nimm32 = ~imm32; 1527 if (T == T4H || T == T8H) { 1528 assert((imm32 & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)"); 1529 imm32 &= 0xffff; 1530 nimm32 &= 0xffff; 1531 } 1532 u_int32_t x = imm32; 1533 int movi_cnt = 0; 1534 int movn_cnt = 0; 1535 while (x) { if (x & 0xff) movi_cnt++; x >>= 8; } 1536 x = nimm32; 1537 while (x) { if (x & 0xff) movn_cnt++; x >>= 8; } 1538 if (movn_cnt < movi_cnt) imm32 = nimm32; 1539 unsigned lsl = 0; 1540 while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; } 1541 if (movn_cnt < movi_cnt) 1542 mvni(Vd, T, imm32 & 0xff, lsl); 1543 else 1544 movi(Vd, T, imm32 & 0xff, lsl); 1545 imm32 >>= 8; lsl += 8; 1546 while (imm32) { 1547 while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; } 1548 if (movn_cnt < movi_cnt) 1549 bici(Vd, T, imm32 & 0xff, lsl); 1550 else 1551 orri(Vd, T, imm32 & 0xff, lsl); 1552 lsl += 8; imm32 >>= 8; 1553 } 1554 } 1555 1556 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64) 1557 { 1558 #ifndef PRODUCT 1559 { 1560 char buffer[64]; 1561 snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64); 1562 block_comment(buffer); 1563 } 1564 #endif 1565 if 
(operand_valid_for_logical_immediate(false, imm64)) { 1566 orr(dst, zr, imm64); 1567 } else { 1568 // we can use a combination of MOVZ or MOVN with 1569 // MOVK to build up the constant 1570 u_int64_t imm_h[4]; 1571 int zero_count = 0; 1572 int neg_count = 0; 1573 int i; 1574 for (i = 0; i < 4; i++) { 1575 imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL); 1576 if (imm_h[i] == 0) { 1577 zero_count++; 1578 } else if (imm_h[i] == 0xffffL) { 1579 neg_count++; 1580 } 1581 } 1582 if (zero_count == 4) { 1583 // one MOVZ will do 1584 movz(dst, 0); 1585 } else if (neg_count == 4) { 1586 // one MOVN will do 1587 movn(dst, 0); 1588 } else if (zero_count == 3) { 1589 for (i = 0; i < 4; i++) { 1590 if (imm_h[i] != 0L) { 1591 movz(dst, (u_int32_t)imm_h[i], (i << 4)); 1592 break; 1593 } 1594 } 1595 } else if (neg_count == 3) { 1596 // one MOVN will do 1597 for (int i = 0; i < 4; i++) { 1598 if (imm_h[i] != 0xffffL) { 1599 movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4)); 1600 break; 1601 } 1602 } 1603 } else if (zero_count == 2) { 1604 // one MOVZ and one MOVK will do 1605 for (i = 0; i < 3; i++) { 1606 if (imm_h[i] != 0L) { 1607 movz(dst, (u_int32_t)imm_h[i], (i << 4)); 1608 i++; 1609 break; 1610 } 1611 } 1612 for (;i < 4; i++) { 1613 if (imm_h[i] != 0L) { 1614 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1615 } 1616 } 1617 } else if (neg_count == 2) { 1618 // one MOVN and one MOVK will do 1619 for (i = 0; i < 4; i++) { 1620 if (imm_h[i] != 0xffffL) { 1621 movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4)); 1622 i++; 1623 break; 1624 } 1625 } 1626 for (;i < 4; i++) { 1627 if (imm_h[i] != 0xffffL) { 1628 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1629 } 1630 } 1631 } else if (zero_count == 1) { 1632 // one MOVZ and two MOVKs will do 1633 for (i = 0; i < 4; i++) { 1634 if (imm_h[i] != 0L) { 1635 movz(dst, (u_int32_t)imm_h[i], (i << 4)); 1636 i++; 1637 break; 1638 } 1639 } 1640 for (;i < 4; i++) { 1641 if (imm_h[i] != 0x0L) { 1642 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1643 } 1644 } 
1645 } else if (neg_count == 1) { 1646 // one MOVN and two MOVKs will do 1647 for (i = 0; i < 4; i++) { 1648 if (imm_h[i] != 0xffffL) { 1649 movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4)); 1650 i++; 1651 break; 1652 } 1653 } 1654 for (;i < 4; i++) { 1655 if (imm_h[i] != 0xffffL) { 1656 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1657 } 1658 } 1659 } else { 1660 // use a MOVZ and 3 MOVKs (makes it easier to debug) 1661 movz(dst, (u_int32_t)imm_h[0], 0); 1662 for (i = 1; i < 4; i++) { 1663 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1664 } 1665 } 1666 } 1667 } 1668 1669 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32) 1670 { 1671 #ifndef PRODUCT 1672 { 1673 char buffer[64]; 1674 snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32); 1675 block_comment(buffer); 1676 } 1677 #endif 1678 if (operand_valid_for_logical_immediate(true, imm32)) { 1679 orrw(dst, zr, imm32); 1680 } else { 1681 // we can use MOVZ, MOVN or two calls to MOVK to build up the 1682 // constant 1683 u_int32_t imm_h[2]; 1684 imm_h[0] = imm32 & 0xffff; 1685 imm_h[1] = ((imm32 >> 16) & 0xffff); 1686 if (imm_h[0] == 0) { 1687 movzw(dst, imm_h[1], 16); 1688 } else if (imm_h[0] == 0xffff) { 1689 movnw(dst, imm_h[1] ^ 0xffff, 16); 1690 } else if (imm_h[1] == 0) { 1691 movzw(dst, imm_h[0], 0); 1692 } else if (imm_h[1] == 0xffff) { 1693 movnw(dst, imm_h[0] ^ 0xffff, 0); 1694 } else { 1695 // use a MOVZ and MOVK (makes it easier to debug) 1696 movzw(dst, imm_h[0], 0); 1697 movkw(dst, imm_h[1], 16); 1698 } 1699 } 1700 } 1701 1702 // Form an address from base + offset in Rd. Rd may or may 1703 // not actually be used: you must use the Address that is returned. 1704 // It is up to you to ensure that the shift provided matches the size 1705 // of your data. 
1706 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) { 1707 if (Address::offset_ok_for_immed(byte_offset, shift)) 1708 // It fits; no need for any heroics 1709 return Address(base, byte_offset); 1710 1711 // Don't do anything clever with negative or misaligned offsets 1712 unsigned mask = (1 << shift) - 1; 1713 if (byte_offset < 0 || byte_offset & mask) { 1714 mov(Rd, byte_offset); 1715 add(Rd, base, Rd); 1716 return Address(Rd); 1717 } 1718 1719 // See if we can do this with two 12-bit offsets 1720 { 1721 unsigned long word_offset = byte_offset >> shift; 1722 unsigned long masked_offset = word_offset & 0xfff000; 1723 if (Address::offset_ok_for_immed(word_offset - masked_offset) 1724 && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) { 1725 add(Rd, base, masked_offset << shift); 1726 word_offset -= masked_offset; 1727 return Address(Rd, word_offset << shift); 1728 } 1729 } 1730 1731 // Do it the hard way 1732 mov(Rd, byte_offset); 1733 add(Rd, base, Rd); 1734 return Address(Rd); 1735 } 1736 1737 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) { 1738 if (UseLSE) { 1739 mov(tmp, 1); 1740 ldadd(Assembler::word, tmp, zr, counter_addr); 1741 return; 1742 } 1743 Label retry_load; 1744 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) 1745 prfm(Address(counter_addr), PSTL1STRM); 1746 bind(retry_load); 1747 // flush and load exclusive from the memory location 1748 ldxrw(tmp, counter_addr); 1749 addw(tmp, tmp, 1); 1750 // if we store+flush with no intervening write tmp wil be zero 1751 stxrw(tmp2, tmp, counter_addr); 1752 cbnzw(tmp2, retry_load); 1753 } 1754 1755 1756 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb, 1757 bool want_remainder, Register scratch) 1758 { 1759 // Full implementation of Java idiv and irem. 
The function 1760 // returns the (pc) offset of the div instruction - may be needed 1761 // for implicit exceptions. 1762 // 1763 // constraint : ra/rb =/= scratch 1764 // normal case 1765 // 1766 // input : ra: dividend 1767 // rb: divisor 1768 // 1769 // result: either 1770 // quotient (= ra idiv rb) 1771 // remainder (= ra irem rb) 1772 1773 assert(ra != scratch && rb != scratch, "reg cannot be scratch"); 1774 1775 int idivl_offset = offset(); 1776 if (! want_remainder) { 1777 sdivw(result, ra, rb); 1778 } else { 1779 sdivw(scratch, ra, rb); 1780 Assembler::msubw(result, scratch, rb, ra); 1781 } 1782 1783 return idivl_offset; 1784 } 1785 1786 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb, 1787 bool want_remainder, Register scratch) 1788 { 1789 // Full implementation of Java ldiv and lrem. The function 1790 // returns the (pc) offset of the div instruction - may be needed 1791 // for implicit exceptions. 1792 // 1793 // constraint : ra/rb =/= scratch 1794 // normal case 1795 // 1796 // input : ra: dividend 1797 // rb: divisor 1798 // 1799 // result: either 1800 // quotient (= ra idiv rb) 1801 // remainder (= ra irem rb) 1802 1803 assert(ra != scratch && rb != scratch, "reg cannot be scratch"); 1804 1805 int idivq_offset = offset(); 1806 if (! want_remainder) { 1807 sdiv(result, ra, rb); 1808 } else { 1809 sdiv(scratch, ra, rb); 1810 Assembler::msub(result, scratch, rb, ra); 1811 } 1812 1813 return idivq_offset; 1814 } 1815 1816 void MacroAssembler::membar(Membar_mask_bits order_constraint) { 1817 address prev = pc() - NativeMembar::instruction_size; 1818 address last = code()->last_insn(); 1819 if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) { 1820 NativeMembar *bar = NativeMembar_at(prev); 1821 // We are merging two memory barrier instructions. On AArch64 we 1822 // can do this simply by ORing them together. 
1823 bar->set_kind(bar->get_kind() | order_constraint); 1824 BLOCK_COMMENT("merged membar"); 1825 } else { 1826 code()->set_last_insn(pc()); 1827 dmb(Assembler::barrier(order_constraint)); 1828 } 1829 } 1830 1831 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) { 1832 if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) { 1833 merge_ldst(rt, adr, size_in_bytes, is_store); 1834 code()->clear_last_insn(); 1835 return true; 1836 } else { 1837 assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported."); 1838 const unsigned mask = size_in_bytes - 1; 1839 if (adr.getMode() == Address::base_plus_offset && 1840 (adr.offset() & mask) == 0) { // only supports base_plus_offset. 1841 code()->set_last_insn(pc()); 1842 } 1843 return false; 1844 } 1845 } 1846 1847 void MacroAssembler::ldr(Register Rx, const Address &adr) { 1848 // We always try to merge two adjacent loads into one ldp. 1849 if (!try_merge_ldst(Rx, adr, 8, false)) { 1850 Assembler::ldr(Rx, adr); 1851 } 1852 } 1853 1854 void MacroAssembler::ldrw(Register Rw, const Address &adr) { 1855 // We always try to merge two adjacent loads into one ldp. 1856 if (!try_merge_ldst(Rw, adr, 4, false)) { 1857 Assembler::ldrw(Rw, adr); 1858 } 1859 } 1860 1861 void MacroAssembler::str(Register Rx, const Address &adr) { 1862 // We always try to merge two adjacent stores into one stp. 1863 if (!try_merge_ldst(Rx, adr, 8, true)) { 1864 Assembler::str(Rx, adr); 1865 } 1866 } 1867 1868 void MacroAssembler::strw(Register Rw, const Address &adr) { 1869 // We always try to merge two adjacent stores into one stp. 
1870 if (!try_merge_ldst(Rw, adr, 4, true)) { 1871 Assembler::strw(Rw, adr); 1872 } 1873 } 1874 1875 // MacroAssembler routines found actually to be needed 1876 1877 void MacroAssembler::push(Register src) 1878 { 1879 str(src, Address(pre(esp, -1 * wordSize))); 1880 } 1881 1882 void MacroAssembler::pop(Register dst) 1883 { 1884 ldr(dst, Address(post(esp, 1 * wordSize))); 1885 } 1886 1887 // Note: load_unsigned_short used to be called load_unsigned_word. 1888 int MacroAssembler::load_unsigned_short(Register dst, Address src) { 1889 int off = offset(); 1890 ldrh(dst, src); 1891 return off; 1892 } 1893 1894 int MacroAssembler::load_unsigned_byte(Register dst, Address src) { 1895 int off = offset(); 1896 ldrb(dst, src); 1897 return off; 1898 } 1899 1900 int MacroAssembler::load_signed_short(Register dst, Address src) { 1901 int off = offset(); 1902 ldrsh(dst, src); 1903 return off; 1904 } 1905 1906 int MacroAssembler::load_signed_byte(Register dst, Address src) { 1907 int off = offset(); 1908 ldrsb(dst, src); 1909 return off; 1910 } 1911 1912 int MacroAssembler::load_signed_short32(Register dst, Address src) { 1913 int off = offset(); 1914 ldrshw(dst, src); 1915 return off; 1916 } 1917 1918 int MacroAssembler::load_signed_byte32(Register dst, Address src) { 1919 int off = offset(); 1920 ldrsbw(dst, src); 1921 return off; 1922 } 1923 1924 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) { 1925 switch (size_in_bytes) { 1926 case 8: ldr(dst, src); break; 1927 case 4: ldrw(dst, src); break; 1928 case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break; 1929 case 1: is_signed ? 
load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break; 1930 default: ShouldNotReachHere(); 1931 } 1932 } 1933 1934 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) { 1935 switch (size_in_bytes) { 1936 case 8: str(src, dst); break; 1937 case 4: strw(src, dst); break; 1938 case 2: strh(src, dst); break; 1939 case 1: strb(src, dst); break; 1940 default: ShouldNotReachHere(); 1941 } 1942 } 1943 1944 void MacroAssembler::decrementw(Register reg, int value) 1945 { 1946 if (value < 0) { incrementw(reg, -value); return; } 1947 if (value == 0) { return; } 1948 if (value < (1 << 12)) { subw(reg, reg, value); return; } 1949 /* else */ { 1950 guarantee(reg != rscratch2, "invalid dst for register decrement"); 1951 movw(rscratch2, (unsigned)value); 1952 subw(reg, reg, rscratch2); 1953 } 1954 } 1955 1956 void MacroAssembler::decrement(Register reg, int value) 1957 { 1958 if (value < 0) { increment(reg, -value); return; } 1959 if (value == 0) { return; } 1960 if (value < (1 << 12)) { sub(reg, reg, value); return; } 1961 /* else */ { 1962 assert(reg != rscratch2, "invalid dst for register decrement"); 1963 mov(rscratch2, (unsigned long)value); 1964 sub(reg, reg, rscratch2); 1965 } 1966 } 1967 1968 void MacroAssembler::decrementw(Address dst, int value) 1969 { 1970 assert(!dst.uses(rscratch1), "invalid dst for address decrement"); 1971 if (dst.getMode() == Address::literal) { 1972 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 1973 lea(rscratch2, dst); 1974 dst = Address(rscratch2); 1975 } 1976 ldrw(rscratch1, dst); 1977 decrementw(rscratch1, value); 1978 strw(rscratch1, dst); 1979 } 1980 1981 void MacroAssembler::decrement(Address dst, int value) 1982 { 1983 assert(!dst.uses(rscratch1), "invalid address for decrement"); 1984 if (dst.getMode() == Address::literal) { 1985 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 1986 lea(rscratch2, dst); 1987 dst = 
Address(rscratch2); 1988 } 1989 ldr(rscratch1, dst); 1990 decrement(rscratch1, value); 1991 str(rscratch1, dst); 1992 } 1993 1994 void MacroAssembler::incrementw(Register reg, int value) 1995 { 1996 if (value < 0) { decrementw(reg, -value); return; } 1997 if (value == 0) { return; } 1998 if (value < (1 << 12)) { addw(reg, reg, value); return; } 1999 /* else */ { 2000 assert(reg != rscratch2, "invalid dst for register increment"); 2001 movw(rscratch2, (unsigned)value); 2002 addw(reg, reg, rscratch2); 2003 } 2004 } 2005 2006 void MacroAssembler::increment(Register reg, int value) 2007 { 2008 if (value < 0) { decrement(reg, -value); return; } 2009 if (value == 0) { return; } 2010 if (value < (1 << 12)) { add(reg, reg, value); return; } 2011 /* else */ { 2012 assert(reg != rscratch2, "invalid dst for register increment"); 2013 movw(rscratch2, (unsigned)value); 2014 add(reg, reg, rscratch2); 2015 } 2016 } 2017 2018 void MacroAssembler::incrementw(Address dst, int value) 2019 { 2020 assert(!dst.uses(rscratch1), "invalid dst for address increment"); 2021 if (dst.getMode() == Address::literal) { 2022 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 2023 lea(rscratch2, dst); 2024 dst = Address(rscratch2); 2025 } 2026 ldrw(rscratch1, dst); 2027 incrementw(rscratch1, value); 2028 strw(rscratch1, dst); 2029 } 2030 2031 void MacroAssembler::increment(Address dst, int value) 2032 { 2033 assert(!dst.uses(rscratch1), "invalid dst for address increment"); 2034 if (dst.getMode() == Address::literal) { 2035 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 2036 lea(rscratch2, dst); 2037 dst = Address(rscratch2); 2038 } 2039 ldr(rscratch1, dst); 2040 increment(rscratch1, value); 2041 str(rscratch1, dst); 2042 } 2043 2044 2045 void MacroAssembler::pusha() { 2046 push(0x7fffffff, sp); 2047 } 2048 2049 void MacroAssembler::popa() { 2050 pop(0x7fffffff, sp); 2051 } 2052 2053 // Push lots of registers in the bit set supplied. 
Don't push sp. 2054 // Return the number of words pushed 2055 int MacroAssembler::push(unsigned int bitset, Register stack) { 2056 int words_pushed = 0; 2057 2058 // Scan bitset to accumulate register pairs 2059 unsigned char regs[32]; 2060 int count = 0; 2061 for (int reg = 0; reg <= 30; reg++) { 2062 if (1 & bitset) 2063 regs[count++] = reg; 2064 bitset >>= 1; 2065 } 2066 regs[count++] = zr->encoding_nocheck(); 2067 count &= ~1; // Only push an even nuber of regs 2068 2069 if (count) { 2070 stp(as_Register(regs[0]), as_Register(regs[1]), 2071 Address(pre(stack, -count * wordSize))); 2072 words_pushed += 2; 2073 } 2074 for (int i = 2; i < count; i += 2) { 2075 stp(as_Register(regs[i]), as_Register(regs[i+1]), 2076 Address(stack, i * wordSize)); 2077 words_pushed += 2; 2078 } 2079 2080 assert(words_pushed == count, "oops, pushed != count"); 2081 2082 return count; 2083 } 2084 2085 int MacroAssembler::pop(unsigned int bitset, Register stack) { 2086 int words_pushed = 0; 2087 2088 // Scan bitset to accumulate register pairs 2089 unsigned char regs[32]; 2090 int count = 0; 2091 for (int reg = 0; reg <= 30; reg++) { 2092 if (1 & bitset) 2093 regs[count++] = reg; 2094 bitset >>= 1; 2095 } 2096 regs[count++] = zr->encoding_nocheck(); 2097 count &= ~1; 2098 2099 for (int i = 2; i < count; i += 2) { 2100 ldp(as_Register(regs[i]), as_Register(regs[i+1]), 2101 Address(stack, i * wordSize)); 2102 words_pushed += 2; 2103 } 2104 if (count) { 2105 ldp(as_Register(regs[0]), as_Register(regs[1]), 2106 Address(post(stack, count * wordSize))); 2107 words_pushed += 2; 2108 } 2109 2110 assert(words_pushed == count, "oops, pushed != count"); 2111 2112 return count; 2113 } 2114 #ifdef ASSERT 2115 void MacroAssembler::verify_heapbase(const char* msg) { 2116 #if 0 2117 assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed"); 2118 assert (Universe::heap() != NULL, "java heap should be initialized"); 2119 if (CheckCompressedOops) { 2120 Label ok; 2121 push(1 << 
rscratch1->encoding(), sp); // cmpptr trashes rscratch1 2122 cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr())); 2123 br(Assembler::EQ, ok); 2124 stop(msg); 2125 bind(ok); 2126 pop(1 << rscratch1->encoding(), sp); 2127 } 2128 #endif 2129 } 2130 #endif 2131 2132 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) { 2133 Label done, not_weak; 2134 cbz(value, done); // Use NULL as-is. 2135 2136 STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u); 2137 tbz(r0, 0, not_weak); // Test for jweak tag. 2138 2139 // Resolve jweak. 2140 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value, 2141 Address(value, -JNIHandles::weak_tag_value), tmp, thread); 2142 verify_oop(value); 2143 b(done); 2144 2145 bind(not_weak); 2146 // Resolve (untagged) jobject. 2147 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread); 2148 verify_oop(value); 2149 bind(done); 2150 } 2151 2152 void MacroAssembler::stop(const char* msg) { 2153 address ip = pc(); 2154 pusha(); 2155 mov(c_rarg0, (address)msg); 2156 mov(c_rarg1, (address)ip); 2157 mov(c_rarg2, sp); 2158 mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 2159 // call(c_rarg3); 2160 blrt(c_rarg3, 3, 0, 1); 2161 hlt(0); 2162 } 2163 2164 void MacroAssembler::warn(const char* msg) { 2165 pusha(); 2166 mov(c_rarg0, (address)msg); 2167 mov(lr, CAST_FROM_FN_PTR(address, warning)); 2168 blrt(lr, 1, 0, MacroAssembler::ret_type_void); 2169 popa(); 2170 } 2171 2172 void MacroAssembler::unimplemented(const char* what) { 2173 const char* buf = NULL; 2174 { 2175 ResourceMark rm; 2176 stringStream ss; 2177 ss.print("unimplemented: %s", what); 2178 buf = code_string(ss.as_string()); 2179 } 2180 stop(buf); 2181 } 2182 2183 // If a constant does not fit in an immediate field, generate some 2184 // number of MOV instructions and then perform the operation. 
2185 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm, 2186 add_sub_imm_insn insn1, 2187 add_sub_reg_insn insn2) { 2188 assert(Rd != zr, "Rd = zr and not setting flags?"); 2189 if (operand_valid_for_add_sub_immediate((int)imm)) { 2190 (this->*insn1)(Rd, Rn, imm); 2191 } else { 2192 if (uabs(imm) < (1 << 24)) { 2193 (this->*insn1)(Rd, Rn, imm & -(1 << 12)); 2194 (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1)); 2195 } else { 2196 assert_different_registers(Rd, Rn); 2197 mov(Rd, (uint64_t)imm); 2198 (this->*insn2)(Rd, Rn, Rd, LSL, 0); 2199 } 2200 } 2201 } 2202 2203 // Seperate vsn which sets the flags. Optimisations are more restricted 2204 // because we must set the flags correctly. 2205 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm, 2206 add_sub_imm_insn insn1, 2207 add_sub_reg_insn insn2) { 2208 if (operand_valid_for_add_sub_immediate((int)imm)) { 2209 (this->*insn1)(Rd, Rn, imm); 2210 } else { 2211 assert_different_registers(Rd, Rn); 2212 assert(Rd != zr, "overflow in immediate operand"); 2213 mov(Rd, (uint64_t)imm); 2214 (this->*insn2)(Rd, Rn, Rd, LSL, 0); 2215 } 2216 } 2217 2218 2219 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) { 2220 if (increment.is_register()) { 2221 add(Rd, Rn, increment.as_register()); 2222 } else { 2223 add(Rd, Rn, increment.as_constant()); 2224 } 2225 } 2226 2227 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) { 2228 if (increment.is_register()) { 2229 addw(Rd, Rn, increment.as_register()); 2230 } else { 2231 addw(Rd, Rn, increment.as_constant()); 2232 } 2233 } 2234 2235 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) { 2236 if (decrement.is_register()) { 2237 sub(Rd, Rn, decrement.as_register()); 2238 } else { 2239 sub(Rd, Rn, decrement.as_constant()); 2240 } 2241 } 2242 2243 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) { 2244 if 
(decrement.is_register()) { 2245 subw(Rd, Rn, decrement.as_register()); 2246 } else { 2247 subw(Rd, Rn, decrement.as_constant()); 2248 } 2249 } 2250 2251 void MacroAssembler::reinit_heapbase() 2252 { 2253 if (UseCompressedOops) { 2254 if (Universe::is_fully_initialized()) { 2255 mov(rheapbase, Universe::narrow_ptrs_base()); 2256 } else { 2257 lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr())); 2258 ldr(rheapbase, Address(rheapbase)); 2259 } 2260 } 2261 } 2262 2263 // this simulates the behaviour of the x86 cmpxchg instruction using a 2264 // load linked/store conditional pair. we use the acquire/release 2265 // versions of these instructions so that we flush pending writes as 2266 // per Java semantics. 2267 2268 // n.b the x86 version assumes the old value to be compared against is 2269 // in rax and updates rax with the value located in memory if the 2270 // cmpxchg fails. we supply a register for the old value explicitly 2271 2272 // the aarch64 load linked/store conditional instructions do not 2273 // accept an offset. so, unlike x86, we must provide a plain register 2274 // to identify the memory word to be compared/exchanged rather than a 2275 // register+offset Address. 
2276 2277 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp, 2278 Label &succeed, Label *fail) { 2279 // oldv holds comparison value 2280 // newv holds value to write in exchange 2281 // addr identifies memory word to compare against/update 2282 if (UseLSE) { 2283 mov(tmp, oldv); 2284 casal(Assembler::xword, oldv, newv, addr); 2285 cmp(tmp, oldv); 2286 br(Assembler::EQ, succeed); 2287 membar(AnyAny); 2288 } else { 2289 Label retry_load, nope; 2290 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) 2291 prfm(Address(addr), PSTL1STRM); 2292 bind(retry_load); 2293 // flush and load exclusive from the memory location 2294 // and fail if it is not what we expect 2295 ldaxr(tmp, addr); 2296 cmp(tmp, oldv); 2297 br(Assembler::NE, nope); 2298 // if we store+flush with no intervening write tmp wil be zero 2299 stlxr(tmp, newv, addr); 2300 cbzw(tmp, succeed); 2301 // retry so we only ever return after a load fails to compare 2302 // ensures we don't return a stale value after a failed write. 
2303 b(retry_load); 2304 // if the memory word differs we return it in oldv and signal a fail 2305 bind(nope); 2306 membar(AnyAny); 2307 mov(oldv, tmp); 2308 } 2309 if (fail) 2310 b(*fail); 2311 } 2312 2313 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp, 2314 Label &succeed, Label *fail) { 2315 assert(oopDesc::mark_offset_in_bytes() == 0, "assumption"); 2316 cmpxchgptr(oldv, newv, obj, tmp, succeed, fail); 2317 } 2318 2319 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp, 2320 Label &succeed, Label *fail) { 2321 // oldv holds comparison value 2322 // newv holds value to write in exchange 2323 // addr identifies memory word to compare against/update 2324 // tmp returns 0/1 for success/failure 2325 if (UseLSE) { 2326 mov(tmp, oldv); 2327 casal(Assembler::word, oldv, newv, addr); 2328 cmp(tmp, oldv); 2329 br(Assembler::EQ, succeed); 2330 membar(AnyAny); 2331 } else { 2332 Label retry_load, nope; 2333 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) 2334 prfm(Address(addr), PSTL1STRM); 2335 bind(retry_load); 2336 // flush and load exclusive from the memory location 2337 // and fail if it is not what we expect 2338 ldaxrw(tmp, addr); 2339 cmp(tmp, oldv); 2340 br(Assembler::NE, nope); 2341 // if we store+flush with no intervening write tmp wil be zero 2342 stlxrw(tmp, newv, addr); 2343 cbzw(tmp, succeed); 2344 // retry so we only ever return after a load fails to compare 2345 // ensures we don't return a stale value after a failed write. 2346 b(retry_load); 2347 // if the memory word differs we return it in oldv and signal a fail 2348 bind(nope); 2349 membar(AnyAny); 2350 mov(oldv, tmp); 2351 } 2352 if (fail) 2353 b(*fail); 2354 } 2355 2356 // A generic CAS; success or failure is in the EQ flag. A weak CAS 2357 // doesn't retry and may fail spuriously. If the oldval is wanted, 2358 // Pass a register for the result, otherwise pass noreg. 
2359 2360 // Clobbers rscratch1 2361 void MacroAssembler::cmpxchg(Register addr, Register expected, 2362 Register new_val, 2363 enum operand_size size, 2364 bool acquire, bool release, 2365 bool weak, 2366 Register result) { 2367 if (result == noreg) result = rscratch1; 2368 BLOCK_COMMENT("cmpxchg {"); 2369 if (UseLSE) { 2370 mov(result, expected); 2371 lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true); 2372 compare_eq(result, expected, size); 2373 } else { 2374 Label retry_load, done; 2375 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) 2376 prfm(Address(addr), PSTL1STRM); 2377 bind(retry_load); 2378 load_exclusive(result, addr, size, acquire); 2379 compare_eq(result, expected, size); 2380 br(Assembler::NE, done); 2381 store_exclusive(rscratch1, new_val, addr, size, release); 2382 if (weak) { 2383 cmpw(rscratch1, 0u); // If the store fails, return NE to our caller. 2384 } else { 2385 cbnzw(rscratch1, retry_load); 2386 } 2387 bind(done); 2388 } 2389 BLOCK_COMMENT("} cmpxchg"); 2390 } 2391 2392 // A generic comparison. Only compares for equality, clobbers rscratch1. 2393 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) { 2394 if (size == xword) { 2395 cmp(rm, rn); 2396 } else if (size == word) { 2397 cmpw(rm, rn); 2398 } else if (size == halfword) { 2399 eorw(rscratch1, rm, rn); 2400 ands(zr, rscratch1, 0xffff); 2401 } else if (size == byte) { 2402 eorw(rscratch1, rm, rn); 2403 ands(zr, rscratch1, 0xff); 2404 } else { 2405 ShouldNotReachHere(); 2406 } 2407 } 2408 2409 2410 static bool different(Register a, RegisterOrConstant b, Register c) { 2411 if (b.is_constant()) 2412 return a != c; 2413 else 2414 return a != b.as_register() && a != c && b.as_register() != c; 2415 } 2416 2417 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz) \ 2418 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \ 2419 if (UseLSE) { \ 2420 prev = prev->is_valid() ? 
prev : zr; \ 2421 if (incr.is_register()) { \ 2422 AOP(sz, incr.as_register(), prev, addr); \ 2423 } else { \ 2424 mov(rscratch2, incr.as_constant()); \ 2425 AOP(sz, rscratch2, prev, addr); \ 2426 } \ 2427 return; \ 2428 } \ 2429 Register result = rscratch2; \ 2430 if (prev->is_valid()) \ 2431 result = different(prev, incr, addr) ? prev : rscratch2; \ 2432 \ 2433 Label retry_load; \ 2434 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) \ 2435 prfm(Address(addr), PSTL1STRM); \ 2436 bind(retry_load); \ 2437 LDXR(result, addr); \ 2438 OP(rscratch1, result, incr); \ 2439 STXR(rscratch2, rscratch1, addr); \ 2440 cbnzw(rscratch2, retry_load); \ 2441 if (prev->is_valid() && prev != result) { \ 2442 IOP(prev, rscratch1, incr); \ 2443 } \ 2444 } 2445 2446 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword) 2447 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word) 2448 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword) 2449 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word) 2450 2451 #undef ATOMIC_OP 2452 2453 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz) \ 2454 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \ 2455 if (UseLSE) { \ 2456 prev = prev->is_valid() ? prev : zr; \ 2457 AOP(sz, newv, prev, addr); \ 2458 return; \ 2459 } \ 2460 Register result = rscratch2; \ 2461 if (prev->is_valid()) \ 2462 result = different(prev, newv, addr) ? 
prev : rscratch2; \ 2463 \ 2464 Label retry_load; \ 2465 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) \ 2466 prfm(Address(addr), PSTL1STRM); \ 2467 bind(retry_load); \ 2468 LDXR(result, addr); \ 2469 STXR(rscratch1, newv, addr); \ 2470 cbnzw(rscratch1, retry_load); \ 2471 if (prev->is_valid() && prev != result) \ 2472 mov(prev, result); \ 2473 } 2474 2475 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword) 2476 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word) 2477 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword) 2478 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word) 2479 2480 #undef ATOMIC_XCHG 2481 2482 #ifndef PRODUCT 2483 extern "C" void findpc(intptr_t x); 2484 #endif 2485 2486 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) 2487 { 2488 // In order to get locks to work, we need to fake a in_VM state 2489 if (ShowMessageBoxOnError ) { 2490 JavaThread* thread = JavaThread::current(); 2491 JavaThreadState saved_state = thread->thread_state(); 2492 thread->set_thread_state(_thread_in_vm); 2493 #ifndef PRODUCT 2494 if (CountBytecodes || TraceBytecodes || StopInterpreterAt) { 2495 ttyLocker ttyl; 2496 BytecodeCounter::print(); 2497 } 2498 #endif 2499 if (os::message_box(msg, "Execution stopped, print registers?")) { 2500 ttyLocker ttyl; 2501 tty->print_cr(" pc = 0x%016lx", pc); 2502 #ifndef PRODUCT 2503 tty->cr(); 2504 findpc(pc); 2505 tty->cr(); 2506 #endif 2507 tty->print_cr(" r0 = 0x%016lx", regs[0]); 2508 tty->print_cr(" r1 = 0x%016lx", regs[1]); 2509 tty->print_cr(" r2 = 0x%016lx", regs[2]); 2510 tty->print_cr(" r3 = 0x%016lx", regs[3]); 2511 tty->print_cr(" r4 = 0x%016lx", regs[4]); 2512 tty->print_cr(" r5 = 0x%016lx", regs[5]); 2513 tty->print_cr(" r6 = 0x%016lx", regs[6]); 2514 tty->print_cr(" r7 = 0x%016lx", regs[7]); 2515 tty->print_cr(" r8 = 0x%016lx", regs[8]); 2516 tty->print_cr(" r9 = 0x%016lx", regs[9]); 2517 tty->print_cr("r10 = 0x%016lx", regs[10]); 2518 tty->print_cr("r11 = 0x%016lx", 
regs[11]); 2519 tty->print_cr("r12 = 0x%016lx", regs[12]); 2520 tty->print_cr("r13 = 0x%016lx", regs[13]); 2521 tty->print_cr("r14 = 0x%016lx", regs[14]); 2522 tty->print_cr("r15 = 0x%016lx", regs[15]); 2523 tty->print_cr("r16 = 0x%016lx", regs[16]); 2524 tty->print_cr("r17 = 0x%016lx", regs[17]); 2525 tty->print_cr("r18 = 0x%016lx", regs[18]); 2526 tty->print_cr("r19 = 0x%016lx", regs[19]); 2527 tty->print_cr("r20 = 0x%016lx", regs[20]); 2528 tty->print_cr("r21 = 0x%016lx", regs[21]); 2529 tty->print_cr("r22 = 0x%016lx", regs[22]); 2530 tty->print_cr("r23 = 0x%016lx", regs[23]); 2531 tty->print_cr("r24 = 0x%016lx", regs[24]); 2532 tty->print_cr("r25 = 0x%016lx", regs[25]); 2533 tty->print_cr("r26 = 0x%016lx", regs[26]); 2534 tty->print_cr("r27 = 0x%016lx", regs[27]); 2535 tty->print_cr("r28 = 0x%016lx", regs[28]); 2536 tty->print_cr("r30 = 0x%016lx", regs[30]); 2537 tty->print_cr("r31 = 0x%016lx", regs[31]); 2538 BREAKPOINT; 2539 } 2540 ThreadStateTransition::transition(thread, _thread_in_vm, saved_state); 2541 } else { 2542 ttyLocker ttyl; 2543 ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", 2544 msg); 2545 assert(false, "DEBUG MESSAGE: %s", msg); 2546 } 2547 } 2548 2549 #ifdef BUILTIN_SIM 2550 // routine to generate an x86 prolog for a stub function which 2551 // bootstraps into the generated ARM code which directly follows the 2552 // stub 2553 // 2554 // the argument encodes the number of general and fp registers 2555 // passed by the caller and the callng convention (currently just 2556 // the number of general registers and assumes C argument passing) 2557 2558 extern "C" { 2559 int aarch64_stub_prolog_size(); 2560 void aarch64_stub_prolog(); 2561 void aarch64_prolog(); 2562 } 2563 2564 void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type, 2565 address *prolog_ptr) 2566 { 2567 int calltype = (((ret_type & 0x3) << 8) | 2568 ((fp_arg_count & 0xf) << 4) | 2569 (gp_arg_count & 0xf)); 2570 2571 // the 
addresses for the x86 to ARM entry code we need to use 2572 address start = pc(); 2573 // printf("start = %lx\n", start); 2574 int byteCount = aarch64_stub_prolog_size(); 2575 // printf("byteCount = %x\n", byteCount); 2576 int instructionCount = (byteCount + 3)/ 4; 2577 // printf("instructionCount = %x\n", instructionCount); 2578 for (int i = 0; i < instructionCount; i++) { 2579 nop(); 2580 } 2581 2582 memcpy(start, (void*)aarch64_stub_prolog, byteCount); 2583 2584 // write the address of the setup routine and the call format at the 2585 // end of into the copied code 2586 u_int64_t *patch_end = (u_int64_t *)(start + byteCount); 2587 if (prolog_ptr) 2588 patch_end[-2] = (u_int64_t)prolog_ptr; 2589 patch_end[-1] = calltype; 2590 } 2591 #endif 2592 2593 void MacroAssembler::push_call_clobbered_registers() { 2594 int step = 4 * wordSize; 2595 push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp); 2596 sub(sp, sp, step); 2597 mov(rscratch1, -step); 2598 // Push v0-v7, v16-v31. 2599 for (int i = 31; i>= 4; i -= 4) { 2600 if (i <= v7->encoding() || i >= v16->encoding()) 2601 st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1), 2602 as_FloatRegister(i), T1D, Address(post(sp, rscratch1))); 2603 } 2604 st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2), 2605 as_FloatRegister(3), T1D, Address(sp)); 2606 } 2607 2608 void MacroAssembler::pop_call_clobbered_registers() { 2609 for (int i = 0; i < 32; i += 4) { 2610 if (i <= v7->encoding() || i >= v16->encoding()) 2611 ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), 2612 as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize))); 2613 } 2614 2615 pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp); 2616 } 2617 2618 void MacroAssembler::push_CPU_state(bool save_vectors) { 2619 int step = (save_vectors ? 
8 : 4) * wordSize; 2620 push(0x3fffffff, sp); // integer registers except lr & sp 2621 mov(rscratch1, -step); 2622 sub(sp, sp, step); 2623 for (int i = 28; i >= 4; i -= 4) { 2624 st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), 2625 as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1))); 2626 } 2627 st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp); 2628 } 2629 2630 void MacroAssembler::pop_CPU_state(bool restore_vectors) { 2631 int step = (restore_vectors ? 8 : 4) * wordSize; 2632 for (int i = 0; i <= 28; i += 4) 2633 ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), 2634 as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step))); 2635 pop(0x3fffffff, sp); // integer registers except lr & sp 2636 } 2637 2638 /** 2639 * Helpers for multiply_to_len(). 2640 */ 2641 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo, 2642 Register src1, Register src2) { 2643 adds(dest_lo, dest_lo, src1); 2644 adc(dest_hi, dest_hi, zr); 2645 adds(dest_lo, dest_lo, src2); 2646 adc(final_dest_hi, dest_hi, zr); 2647 } 2648 2649 // Generate an address from (r + r1 extend offset). "size" is the 2650 // size of the operand. The result may be in rscratch2. 2651 Address MacroAssembler::offsetted_address(Register r, Register r1, 2652 Address::extend ext, int offset, int size) { 2653 if (offset || (ext.shift() % size != 0)) { 2654 lea(rscratch2, Address(r, r1, ext)); 2655 return Address(rscratch2, offset); 2656 } else { 2657 return Address(r, r1, ext); 2658 } 2659 } 2660 2661 Address MacroAssembler::spill_address(int size, int offset, Register tmp) 2662 { 2663 assert(offset >= 0, "spill to negative address?"); 2664 // Offset reachable ? 
2665 // Not aligned - 9 bits signed offset 2666 // Aligned - 12 bits unsigned offset shifted 2667 Register base = sp; 2668 if ((offset & (size-1)) && offset >= (1<<8)) { 2669 add(tmp, base, offset & ((1<<12)-1)); 2670 base = tmp; 2671 offset &= -1<<12; 2672 } 2673 2674 if (offset >= (1<<12) * size) { 2675 add(tmp, base, offset & (((1<<12)-1)<<12)); 2676 base = tmp; 2677 offset &= ~(((1<<12)-1)<<12); 2678 } 2679 2680 return Address(base, offset); 2681 } 2682 2683 // Checks whether offset is aligned. 2684 // Returns true if it is, else false. 2685 bool MacroAssembler::merge_alignment_check(Register base, 2686 size_t size, 2687 long cur_offset, 2688 long prev_offset) const { 2689 if (AvoidUnalignedAccesses) { 2690 if (base == sp) { 2691 // Checks whether low offset if aligned to pair of registers. 2692 long pair_mask = size * 2 - 1; 2693 long offset = prev_offset > cur_offset ? cur_offset : prev_offset; 2694 return (offset & pair_mask) == 0; 2695 } else { // If base is not sp, we can't guarantee the access is aligned. 2696 return false; 2697 } 2698 } else { 2699 long mask = size - 1; 2700 // Load/store pair instruction only supports element size aligned offset. 2701 return (cur_offset & mask) == 0 && (prev_offset & mask) == 0; 2702 } 2703 } 2704 2705 // Checks whether current and previous loads/stores can be merged. 2706 // Returns true if it can be merged, else false. 
2707 bool MacroAssembler::ldst_can_merge(Register rt, 2708 const Address &adr, 2709 size_t cur_size_in_bytes, 2710 bool is_store) const { 2711 address prev = pc() - NativeInstruction::instruction_size; 2712 address last = code()->last_insn(); 2713 2714 if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) { 2715 return false; 2716 } 2717 2718 if (adr.getMode() != Address::base_plus_offset || prev != last) { 2719 return false; 2720 } 2721 2722 NativeLdSt* prev_ldst = NativeLdSt_at(prev); 2723 size_t prev_size_in_bytes = prev_ldst->size_in_bytes(); 2724 2725 assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging."); 2726 assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging."); 2727 2728 if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) { 2729 return false; 2730 } 2731 2732 long max_offset = 63 * prev_size_in_bytes; 2733 long min_offset = -64 * prev_size_in_bytes; 2734 2735 assert(prev_ldst->is_not_pre_post_index(), "pre-index or post-index is not supported to be merged."); 2736 2737 // Only same base can be merged. 2738 if (adr.base() != prev_ldst->base()) { 2739 return false; 2740 } 2741 2742 long cur_offset = adr.offset(); 2743 long prev_offset = prev_ldst->offset(); 2744 size_t diff = abs(cur_offset - prev_offset); 2745 if (diff != prev_size_in_bytes) { 2746 return false; 2747 } 2748 2749 // Following cases can not be merged: 2750 // ldr x2, [x2, #8] 2751 // ldr x3, [x2, #16] 2752 // or: 2753 // ldr x2, [x3, #8] 2754 // ldr x2, [x3, #16] 2755 // If t1 and t2 is the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL. 2756 if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) { 2757 return false; 2758 } 2759 2760 long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset; 2761 // Offset range must be in ldp/stp instruction's range. 
2762 if (low_offset > max_offset || low_offset < min_offset) { 2763 return false; 2764 } 2765 2766 if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) { 2767 return true; 2768 } 2769 2770 return false; 2771 } 2772 2773 // Merge current load/store with previous load/store into ldp/stp. 2774 void MacroAssembler::merge_ldst(Register rt, 2775 const Address &adr, 2776 size_t cur_size_in_bytes, 2777 bool is_store) { 2778 2779 assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store) == true, "cur and prev must be able to be merged."); 2780 2781 Register rt_low, rt_high; 2782 address prev = pc() - NativeInstruction::instruction_size; 2783 NativeLdSt* prev_ldst = NativeLdSt_at(prev); 2784 2785 long offset; 2786 2787 if (adr.offset() < prev_ldst->offset()) { 2788 offset = adr.offset(); 2789 rt_low = rt; 2790 rt_high = prev_ldst->target(); 2791 } else { 2792 offset = prev_ldst->offset(); 2793 rt_low = prev_ldst->target(); 2794 rt_high = rt; 2795 } 2796 2797 Address adr_p = Address(prev_ldst->base(), offset); 2798 // Overwrite previous generated binary. 2799 code_section()->set_end(prev); 2800 2801 const int sz = prev_ldst->size_in_bytes(); 2802 assert(sz == 8 || sz == 4, "only supports 64/32bit merging."); 2803 if (!is_store) { 2804 BLOCK_COMMENT("merged ldr pair"); 2805 if (sz == 8) { 2806 ldp(rt_low, rt_high, adr_p); 2807 } else { 2808 ldpw(rt_low, rt_high, adr_p); 2809 } 2810 } else { 2811 BLOCK_COMMENT("merged str pair"); 2812 if (sz == 8) { 2813 stp(rt_low, rt_high, adr_p); 2814 } else { 2815 stpw(rt_low, rt_high, adr_p); 2816 } 2817 } 2818 } 2819 2820 /** 2821 * Multiply 64 bit by 64 bit first loop. 
*/
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
                                           Register y, Register y_idx, Register z,
                                           Register carry, Register product,
                                           Register idx, Register kdx) {
  //
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;
  //

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  // Step xstart back once more; if it goes negative only a single 32-bit
  // word of x is left and it is loaded at L_one_x instead.
  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_one_x);

  lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
  ldr(x_xstart, Address(rscratch1));
  ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian

  bind(L_first_loop);
  // idx is decremented twice per iteration (two ints per 64-bit load); a
  // negative result after the second decrement means one int of y remains.
  subsw(idx, idx, 1);
  br(Assembler::MI, L_first_loop_exit);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_one_y);
  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(y_idx, Address(rscratch1));
  ror(y_idx, y_idx, 32); // convert big-endian to little-endian
  bind(L_multiply);

  // AArch64 has a multiply-accumulate instruction that we can't use
  // here because it has no way to process carries, so we have to use
  // separate add and adc instructions.  Bah.
  umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
  mul(product, x_xstart, y_idx);
  adds(product, product, carry);
  adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product

  subw(kdx, kdx, 2);
  ror(product, product, 32); // back to big-endian
  str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));

  b(L_first_loop);

  // Only one 32-bit word of y is left: load just that word.
  bind(L_one_y);
  ldrw(y_idx, Address(y,  0));
  b(L_multiply);

  // Only one 32-bit word of x is left: load just that word.
  bind(L_one_x);
  ldrw(x_xstart, Address(x,  0));
  b(L_first_loop);

  bind(L_first_loop_exit);
}

/**
 * Multiply 128 bit by 128. Unrolled inner loop.
 *
 * Clobbers rscratch1 and rscratch2 in addition to the registers passed in.
 */
void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
                                             Register carry, Register carry2,
                                             Register idx, Register jdx,
                                             Register yz_idx1, Register yz_idx2,
                                             Register tmp, Register tmp3, Register tmp4,
                                             Register tmp6, Register product_hi) {

  //   jlong carry, x[], y[], z[];
  //   int kdx = ystart+1;
  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
  //     jlong carry2  = (jlong)(tmp3 >>> 64);
  //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
  //     carry  = (jlong)(tmp4 >>> 64);
  //     z[kdx+idx+1] = (jlong)tmp3;
  //     z[kdx+idx] = (jlong)tmp4;
  //   }
  //   idx += 2;
  //   if (idx > 0) {
  //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
  //     z[kdx+idx] = (jlong)yz_idx1;
  //     carry  = (jlong)(yz_idx1 >>> 64);
  //   }
  //

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;

  // jdx = idx / 4: number of iterations that consume four ints each.
  lsrw(jdx, idx, 2);

  bind(L_third_loop);

  subsw(jdx, jdx, 1);
  br(Assembler::MI, L_third_loop_exit);
  subw(idx, idx, 4);

  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));

  ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));

  lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
  ror(yz_idx2, yz_idx2, 32);

  ldp(rscratch2, rscratch1, Address(tmp6, 0));

  mul(tmp3, product_hi, yz_idx1); //  yz_idx1 * product_hi -> tmp4:tmp3
  umulh(tmp4, product_hi, yz_idx1);

  ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
  ror(rscratch2, rscratch2, 32);

  mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
  umulh(carry2, product_hi, yz_idx2);

  // propagate sum of both multiplications into carry:tmp4:tmp3
  adds(tmp3, tmp3, carry);
  adc(tmp4, tmp4, zr);
  adds(tmp3, tmp3, rscratch1);
  adcs(tmp4, tmp4, tmp);
  adc(carry, carry2, zr);
  adds(tmp4, tmp4, rscratch2);
  adc(carry, carry, zr);

  ror(tmp3, tmp3, 32); // convert little-endian to big-endian
  ror(tmp4, tmp4, 32);
  stp(tmp4, tmp3, Address(tmp6, 0));

  b(L_third_loop);
  bind (L_third_loop_exit);

  // Handle the remaining idx % 4 ints: first a pair (if any), then a single.
  andw (idx, idx, 0x3);
  cbz(idx, L_post_third_loop_done);

  Label L_check_1;
  subsw(idx, idx, 2);
  br(Assembler::MI, L_check_1);

  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(yz_idx1, Address(rscratch1, 0));
  ror(yz_idx1, yz_idx1, 32);
  mul(tmp3, product_hi, yz_idx1); //  yz_idx1 * product_hi -> tmp4:tmp3
  umulh(tmp4, product_hi, yz_idx1);
  lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
  ldr(yz_idx2, Address(rscratch1, 0));
  ror(yz_idx2, yz_idx2, 32);

  add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);

  ror(tmp3, tmp3, 32);
  str(tmp3, Address(rscratch1, 0));

  bind (L_check_1);

  andw (idx, idx, 0x1);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_post_third_loop_done);
  ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  mul(tmp3, tmp4, product_hi); //  tmp4 * product_hi -> carry2:tmp3
  umulh(carry2, tmp4, product_hi);
  ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  add2_with_carry(carry2, tmp3, tmp4, carry);

  strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
  extr(carry, carry2, tmp3, 32);

  bind(L_post_third_loop_done);
}

/**
 * Code for BigInteger::multiplyToLen() intrinsic.
 *
 * r0: x
 * r1: xlen
 * r2: y
 * r3: ylen
 * r4: z
 * r5: zlen
 * r10: tmp1
 * r11: tmp2
 * r12: tmp3
 * r13: tmp4
 * r14: tmp5
 * r15: tmp6
 * r16: tmp7
 *
 * NOTE(review): the last parameter is named product_hi here while the list
 * above calls r16 "tmp7" -- presumably the same register; confirm at the
 * call site.
 */
void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6, Register product_hi) {

  // NOTE(review): product_hi is not part of this distinctness check.
  assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product  = xlen;
  const Register x_xstart = zlen;  // reuse register

  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;
  //

  movw(idx, ylen);      // idx = ylen;
  movw(kdx, zlen);      // kdx = xlen+ylen;
  mov(carry, zr);       // carry = 0;

  Label L_done;

  movw(xstart, xlen);
  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_done);

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);

  Label L_second_loop;
  cbzw(kdx, L_second_loop);

  Label L_carry;
  subw(kdx, kdx, 1);
  cbzw(kdx, L_carry);

  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
  lsr(carry, carry, 32);
  subw(kdx, kdx, 1);

  bind(L_carry);
  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));

  // Second and third (nested) loops.
  //
  // for (int i = xstart-1; i >= 0; i--) { // Second loop
  //   carry = 0;
  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                    (z[k] & LONG_MASK) + carry;
  //     z[k] = (int)product;
  //     carry = product >>> 32;
  //   }
  //   z[i] = (int)carry;
  // }
  //
  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi

  const Register jdx = tmp1;

  bind(L_second_loop);
  mov(carry, zr);                // carry = 0;
  movw(jdx, ylen);               // j = ystart+1

  subsw(xstart, xstart, 1);      // i = xstart-1;
  br(Assembler::MI, L_done);

  // Reserve four stack words and save z in the lowest one; ylen, x and
  // xstart are saved into the remaining three further down.
  str(z, Address(pre(sp, -4 * wordSize)));

  Label L_last_x;
  lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
  subsw(xstart, xstart, 1);       // i = xstart-1;
  br(Assembler::MI, L_last_x);

  lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
  ldr(product_hi, Address(rscratch1));
  ror(product_hi, product_hi, 32);  // convert big-endian to little-endian

  Label L_third_loop_prologue;
  bind(L_third_loop_prologue);

  str(ylen, Address(sp, wordSize));
  stp(x, xstart, Address(sp, 2 * wordSize));
  // x and ylen are passed as scratch registers (carry2 / jdx) here, which is
  // why they were saved just above and are reloaded right after the call.
  multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
                          tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
  ldp(z, ylen, Address(post(sp, 2 * wordSize)));
  ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen

  addw(tmp3, xlen, 1);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  subsw(tmp3, tmp3, 1);
  br(Assembler::MI, L_done);

  lsr(carry, carry, 32);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  b(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  ldrw(product_hi, Address(x,  0));
  b(L_third_loop_prologue);

  bind(L_done);
}

// Code for BigInteger::mulAdd intrinsic
// out     = r0
// in      = r1
// offset  = r2  (already out.length-offset)
// len     = r3
// k       = r4
//
// pseudo code from java implementation:
// carry = 0;
// offset = out.length-offset - 1;
// for (int j=len-1; j >= 0; j--) {
//     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
//     out[offset--] = (int)product;
//     carry = product >>> 32;
// }
// return (int)carry;
//
// On exit `out` holds the carry (zero when len == 0); in, offset and len are
// modified, and rscratch1/rscratch2 are clobbered.
void MacroAssembler::mul_add(Register out, Register in, Register offset,
      Register len, Register k) {
    Label LOOP, END;
    // pre-loop
    cmp(len, zr); // cmp, not cbz/cbnz: to use condition twice => less branches
    csel(out, zr, out, Assembler::EQ);
    br(Assembler::EQ, END);
    add(in, in, len, LSL, 2); // in[j+1] address
    add(offset, out, offset, LSL, 2); // out[offset + 1] address
    mov(out, zr); // used to keep carry now
    BIND(LOOP);
    ldrw(rscratch1, Address(pre(in, -4)));
    madd(rscratch1, rscratch1, k, out);
    ldrw(rscratch2, Address(pre(offset, -4)));
    add(rscratch1, rscratch1, rscratch2);
    strw(rscratch1, Address(offset));
    lsr(out, rscratch1, 32);
    subs(len, len, 1);
    br(Assembler::NE, LOOP);
    BIND(END);
}

/**
 * Emits code to update CRC-32 with a byte value according to constants in table
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]val       Register containing the byte to fold into the CRC.
 * @param [in]table     Register containing the table of crc constants.
3176 * 3177 * uint32_t crc; 3178 * val = crc_table[(val ^ crc) & 0xFF]; 3179 * crc = val ^ (crc >> 8); 3180 * 3181 */ 3182 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 3183 eor(val, val, crc); 3184 andr(val, val, 0xff); 3185 ldrw(val, Address(table, val, Address::lsl(2))); 3186 eor(crc, val, crc, Assembler::LSR, 8); 3187 } 3188 3189 /** 3190 * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3 3191 * 3192 * @param [in,out]crc Register containing the crc. 3193 * @param [in]v Register containing the 32-bit to fold into the CRC. 3194 * @param [in]table0 Register containing table 0 of crc constants. 3195 * @param [in]table1 Register containing table 1 of crc constants. 3196 * @param [in]table2 Register containing table 2 of crc constants. 3197 * @param [in]table3 Register containing table 3 of crc constants. 3198 * 3199 * uint32_t crc; 3200 * v = crc ^ v 3201 * crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24] 3202 * 3203 */ 3204 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp, 3205 Register table0, Register table1, Register table2, Register table3, 3206 bool upper) { 3207 eor(v, crc, v, upper ? LSR:LSL, upper ? 
32:0); 3208 uxtb(tmp, v); 3209 ldrw(crc, Address(table3, tmp, Address::lsl(2))); 3210 ubfx(tmp, v, 8, 8); 3211 ldrw(tmp, Address(table2, tmp, Address::lsl(2))); 3212 eor(crc, crc, tmp); 3213 ubfx(tmp, v, 16, 8); 3214 ldrw(tmp, Address(table1, tmp, Address::lsl(2))); 3215 eor(crc, crc, tmp); 3216 ubfx(tmp, v, 24, 8); 3217 ldrw(tmp, Address(table0, tmp, Address::lsl(2))); 3218 eor(crc, crc, tmp); 3219 } 3220 3221 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf, 3222 Register len, Register tmp0, Register tmp1, Register tmp2, 3223 Register tmp3) { 3224 Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit; 3225 assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3); 3226 3227 mvnw(crc, crc); 3228 3229 subs(len, len, 128); 3230 br(Assembler::GE, CRC_by64_pre); 3231 BIND(CRC_less64); 3232 adds(len, len, 128-32); 3233 br(Assembler::GE, CRC_by32_loop); 3234 BIND(CRC_less32); 3235 adds(len, len, 32-4); 3236 br(Assembler::GE, CRC_by4_loop); 3237 adds(len, len, 4); 3238 br(Assembler::GT, CRC_by1_loop); 3239 b(L_exit); 3240 3241 BIND(CRC_by32_loop); 3242 ldp(tmp0, tmp1, Address(post(buf, 16))); 3243 subs(len, len, 32); 3244 crc32x(crc, crc, tmp0); 3245 ldr(tmp2, Address(post(buf, 8))); 3246 crc32x(crc, crc, tmp1); 3247 ldr(tmp3, Address(post(buf, 8))); 3248 crc32x(crc, crc, tmp2); 3249 crc32x(crc, crc, tmp3); 3250 br(Assembler::GE, CRC_by32_loop); 3251 cmn(len, 32); 3252 br(Assembler::NE, CRC_less32); 3253 b(L_exit); 3254 3255 BIND(CRC_by4_loop); 3256 ldrw(tmp0, Address(post(buf, 4))); 3257 subs(len, len, 4); 3258 crc32w(crc, crc, tmp0); 3259 br(Assembler::GE, CRC_by4_loop); 3260 adds(len, len, 4); 3261 br(Assembler::LE, L_exit); 3262 BIND(CRC_by1_loop); 3263 ldrb(tmp0, Address(post(buf, 1))); 3264 subs(len, len, 1); 3265 crc32b(crc, crc, tmp0); 3266 br(Assembler::GT, CRC_by1_loop); 3267 b(L_exit); 3268 3269 BIND(CRC_by64_pre); 3270 sub(buf, buf, 8); 3271 ldp(tmp0, tmp1, Address(buf, 
8)); 3272 crc32x(crc, crc, tmp0); 3273 ldr(tmp2, Address(buf, 24)); 3274 crc32x(crc, crc, tmp1); 3275 ldr(tmp3, Address(buf, 32)); 3276 crc32x(crc, crc, tmp2); 3277 ldr(tmp0, Address(buf, 40)); 3278 crc32x(crc, crc, tmp3); 3279 ldr(tmp1, Address(buf, 48)); 3280 crc32x(crc, crc, tmp0); 3281 ldr(tmp2, Address(buf, 56)); 3282 crc32x(crc, crc, tmp1); 3283 ldr(tmp3, Address(pre(buf, 64))); 3284 3285 b(CRC_by64_loop); 3286 3287 align(CodeEntryAlignment); 3288 BIND(CRC_by64_loop); 3289 subs(len, len, 64); 3290 crc32x(crc, crc, tmp2); 3291 ldr(tmp0, Address(buf, 8)); 3292 crc32x(crc, crc, tmp3); 3293 ldr(tmp1, Address(buf, 16)); 3294 crc32x(crc, crc, tmp0); 3295 ldr(tmp2, Address(buf, 24)); 3296 crc32x(crc, crc, tmp1); 3297 ldr(tmp3, Address(buf, 32)); 3298 crc32x(crc, crc, tmp2); 3299 ldr(tmp0, Address(buf, 40)); 3300 crc32x(crc, crc, tmp3); 3301 ldr(tmp1, Address(buf, 48)); 3302 crc32x(crc, crc, tmp0); 3303 ldr(tmp2, Address(buf, 56)); 3304 crc32x(crc, crc, tmp1); 3305 ldr(tmp3, Address(pre(buf, 64))); 3306 br(Assembler::GE, CRC_by64_loop); 3307 3308 // post-loop 3309 crc32x(crc, crc, tmp2); 3310 crc32x(crc, crc, tmp3); 3311 3312 sub(len, len, 64); 3313 add(buf, buf, 8); 3314 cmn(len, 128); 3315 br(Assembler::NE, CRC_less64); 3316 BIND(L_exit); 3317 mvnw(crc, crc); 3318 } 3319 3320 /** 3321 * @param crc register containing existing CRC (32-bit) 3322 * @param buf register pointing to input byte buffer (byte*) 3323 * @param len register containing number of bytes 3324 * @param table register that will contain address of CRC table 3325 * @param tmp scratch register 3326 */ 3327 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, 3328 Register table0, Register table1, Register table2, Register table3, 3329 Register tmp, Register tmp2, Register tmp3) { 3330 Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit; 3331 unsigned long offset; 3332 3333 if (UseCRC32) { 3334 kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, 
table3); 3335 return; 3336 } 3337 3338 mvnw(crc, crc); 3339 3340 adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset); 3341 if (offset) add(table0, table0, offset); 3342 add(table1, table0, 1*256*sizeof(juint)); 3343 add(table2, table0, 2*256*sizeof(juint)); 3344 add(table3, table0, 3*256*sizeof(juint)); 3345 3346 if (UseNeon) { 3347 cmp(len, (u1)64); 3348 br(Assembler::LT, L_by16); 3349 eor(v16, T16B, v16, v16); 3350 3351 Label L_fold; 3352 3353 add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants 3354 3355 ld1(v0, v1, T2D, post(buf, 32)); 3356 ld1r(v4, T2D, post(tmp, 8)); 3357 ld1r(v5, T2D, post(tmp, 8)); 3358 ld1r(v6, T2D, post(tmp, 8)); 3359 ld1r(v7, T2D, post(tmp, 8)); 3360 mov(v16, T4S, 0, crc); 3361 3362 eor(v0, T16B, v0, v16); 3363 sub(len, len, 64); 3364 3365 BIND(L_fold); 3366 pmull(v22, T8H, v0, v5, T8B); 3367 pmull(v20, T8H, v0, v7, T8B); 3368 pmull(v23, T8H, v0, v4, T8B); 3369 pmull(v21, T8H, v0, v6, T8B); 3370 3371 pmull2(v18, T8H, v0, v5, T16B); 3372 pmull2(v16, T8H, v0, v7, T16B); 3373 pmull2(v19, T8H, v0, v4, T16B); 3374 pmull2(v17, T8H, v0, v6, T16B); 3375 3376 uzp1(v24, T8H, v20, v22); 3377 uzp2(v25, T8H, v20, v22); 3378 eor(v20, T16B, v24, v25); 3379 3380 uzp1(v26, T8H, v16, v18); 3381 uzp2(v27, T8H, v16, v18); 3382 eor(v16, T16B, v26, v27); 3383 3384 ushll2(v22, T4S, v20, T8H, 8); 3385 ushll(v20, T4S, v20, T4H, 8); 3386 3387 ushll2(v18, T4S, v16, T8H, 8); 3388 ushll(v16, T4S, v16, T4H, 8); 3389 3390 eor(v22, T16B, v23, v22); 3391 eor(v18, T16B, v19, v18); 3392 eor(v20, T16B, v21, v20); 3393 eor(v16, T16B, v17, v16); 3394 3395 uzp1(v17, T2D, v16, v20); 3396 uzp2(v21, T2D, v16, v20); 3397 eor(v17, T16B, v17, v21); 3398 3399 ushll2(v20, T2D, v17, T4S, 16); 3400 ushll(v16, T2D, v17, T2S, 16); 3401 3402 eor(v20, T16B, v20, v22); 3403 eor(v16, T16B, v16, v18); 3404 3405 uzp1(v17, T2D, v20, v16); 3406 uzp2(v21, T2D, v20, v16); 3407 eor(v28, T16B, v17, v21); 3408 3409 pmull(v22, T8H, v1, v5, T8B); 3410 pmull(v20, T8H, 
v1, v7, T8B); 3411 pmull(v23, T8H, v1, v4, T8B); 3412 pmull(v21, T8H, v1, v6, T8B); 3413 3414 pmull2(v18, T8H, v1, v5, T16B); 3415 pmull2(v16, T8H, v1, v7, T16B); 3416 pmull2(v19, T8H, v1, v4, T16B); 3417 pmull2(v17, T8H, v1, v6, T16B); 3418 3419 ld1(v0, v1, T2D, post(buf, 32)); 3420 3421 uzp1(v24, T8H, v20, v22); 3422 uzp2(v25, T8H, v20, v22); 3423 eor(v20, T16B, v24, v25); 3424 3425 uzp1(v26, T8H, v16, v18); 3426 uzp2(v27, T8H, v16, v18); 3427 eor(v16, T16B, v26, v27); 3428 3429 ushll2(v22, T4S, v20, T8H, 8); 3430 ushll(v20, T4S, v20, T4H, 8); 3431 3432 ushll2(v18, T4S, v16, T8H, 8); 3433 ushll(v16, T4S, v16, T4H, 8); 3434 3435 eor(v22, T16B, v23, v22); 3436 eor(v18, T16B, v19, v18); 3437 eor(v20, T16B, v21, v20); 3438 eor(v16, T16B, v17, v16); 3439 3440 uzp1(v17, T2D, v16, v20); 3441 uzp2(v21, T2D, v16, v20); 3442 eor(v16, T16B, v17, v21); 3443 3444 ushll2(v20, T2D, v16, T4S, 16); 3445 ushll(v16, T2D, v16, T2S, 16); 3446 3447 eor(v20, T16B, v22, v20); 3448 eor(v16, T16B, v16, v18); 3449 3450 uzp1(v17, T2D, v20, v16); 3451 uzp2(v21, T2D, v20, v16); 3452 eor(v20, T16B, v17, v21); 3453 3454 shl(v16, T2D, v28, 1); 3455 shl(v17, T2D, v20, 1); 3456 3457 eor(v0, T16B, v0, v16); 3458 eor(v1, T16B, v1, v17); 3459 3460 subs(len, len, 32); 3461 br(Assembler::GE, L_fold); 3462 3463 mov(crc, 0); 3464 mov(tmp, v0, T1D, 0); 3465 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3466 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3467 mov(tmp, v0, T1D, 1); 3468 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3469 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3470 mov(tmp, v1, T1D, 0); 3471 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3472 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3473 mov(tmp, v1, T1D, 1); 3474 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3475 update_word_crc32(crc, tmp, 
tmp2, table0, table1, table2, table3, true); 3476 3477 add(len, len, 32); 3478 } 3479 3480 BIND(L_by16); 3481 subs(len, len, 16); 3482 br(Assembler::GE, L_by16_loop); 3483 adds(len, len, 16-4); 3484 br(Assembler::GE, L_by4_loop); 3485 adds(len, len, 4); 3486 br(Assembler::GT, L_by1_loop); 3487 b(L_exit); 3488 3489 BIND(L_by4_loop); 3490 ldrw(tmp, Address(post(buf, 4))); 3491 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3); 3492 subs(len, len, 4); 3493 br(Assembler::GE, L_by4_loop); 3494 adds(len, len, 4); 3495 br(Assembler::LE, L_exit); 3496 BIND(L_by1_loop); 3497 subs(len, len, 1); 3498 ldrb(tmp, Address(post(buf, 1))); 3499 update_byte_crc32(crc, tmp, table0); 3500 br(Assembler::GT, L_by1_loop); 3501 b(L_exit); 3502 3503 align(CodeEntryAlignment); 3504 BIND(L_by16_loop); 3505 subs(len, len, 16); 3506 ldp(tmp, tmp3, Address(post(buf, 16))); 3507 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3508 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3509 update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false); 3510 update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true); 3511 br(Assembler::GE, L_by16_loop); 3512 adds(len, len, 16-4); 3513 br(Assembler::GE, L_by4_loop); 3514 adds(len, len, 4); 3515 br(Assembler::GT, L_by1_loop); 3516 BIND(L_exit); 3517 mvnw(crc, crc); 3518 } 3519 3520 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf, 3521 Register len, Register tmp0, Register tmp1, Register tmp2, 3522 Register tmp3) { 3523 Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit; 3524 assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3); 3525 3526 subs(len, len, 128); 3527 br(Assembler::GE, CRC_by64_pre); 3528 BIND(CRC_less64); 3529 adds(len, len, 128-32); 3530 br(Assembler::GE, CRC_by32_loop); 3531 BIND(CRC_less32); 3532 adds(len, len, 32-4); 3533 br(Assembler::GE, 
CRC_by4_loop); 3534 adds(len, len, 4); 3535 br(Assembler::GT, CRC_by1_loop); 3536 b(L_exit); 3537 3538 BIND(CRC_by32_loop); 3539 ldp(tmp0, tmp1, Address(post(buf, 16))); 3540 subs(len, len, 32); 3541 crc32cx(crc, crc, tmp0); 3542 ldr(tmp2, Address(post(buf, 8))); 3543 crc32cx(crc, crc, tmp1); 3544 ldr(tmp3, Address(post(buf, 8))); 3545 crc32cx(crc, crc, tmp2); 3546 crc32cx(crc, crc, tmp3); 3547 br(Assembler::GE, CRC_by32_loop); 3548 cmn(len, 32); 3549 br(Assembler::NE, CRC_less32); 3550 b(L_exit); 3551 3552 BIND(CRC_by4_loop); 3553 ldrw(tmp0, Address(post(buf, 4))); 3554 subs(len, len, 4); 3555 crc32cw(crc, crc, tmp0); 3556 br(Assembler::GE, CRC_by4_loop); 3557 adds(len, len, 4); 3558 br(Assembler::LE, L_exit); 3559 BIND(CRC_by1_loop); 3560 ldrb(tmp0, Address(post(buf, 1))); 3561 subs(len, len, 1); 3562 crc32cb(crc, crc, tmp0); 3563 br(Assembler::GT, CRC_by1_loop); 3564 b(L_exit); 3565 3566 BIND(CRC_by64_pre); 3567 sub(buf, buf, 8); 3568 ldp(tmp0, tmp1, Address(buf, 8)); 3569 crc32cx(crc, crc, tmp0); 3570 ldr(tmp2, Address(buf, 24)); 3571 crc32cx(crc, crc, tmp1); 3572 ldr(tmp3, Address(buf, 32)); 3573 crc32cx(crc, crc, tmp2); 3574 ldr(tmp0, Address(buf, 40)); 3575 crc32cx(crc, crc, tmp3); 3576 ldr(tmp1, Address(buf, 48)); 3577 crc32cx(crc, crc, tmp0); 3578 ldr(tmp2, Address(buf, 56)); 3579 crc32cx(crc, crc, tmp1); 3580 ldr(tmp3, Address(pre(buf, 64))); 3581 3582 b(CRC_by64_loop); 3583 3584 align(CodeEntryAlignment); 3585 BIND(CRC_by64_loop); 3586 subs(len, len, 64); 3587 crc32cx(crc, crc, tmp2); 3588 ldr(tmp0, Address(buf, 8)); 3589 crc32cx(crc, crc, tmp3); 3590 ldr(tmp1, Address(buf, 16)); 3591 crc32cx(crc, crc, tmp0); 3592 ldr(tmp2, Address(buf, 24)); 3593 crc32cx(crc, crc, tmp1); 3594 ldr(tmp3, Address(buf, 32)); 3595 crc32cx(crc, crc, tmp2); 3596 ldr(tmp0, Address(buf, 40)); 3597 crc32cx(crc, crc, tmp3); 3598 ldr(tmp1, Address(buf, 48)); 3599 crc32cx(crc, crc, tmp0); 3600 ldr(tmp2, Address(buf, 56)); 3601 crc32cx(crc, crc, tmp1); 3602 ldr(tmp3, Address(pre(buf, 
64))); 3603 br(Assembler::GE, CRC_by64_loop); 3604 3605 // post-loop 3606 crc32cx(crc, crc, tmp2); 3607 crc32cx(crc, crc, tmp3); 3608 3609 sub(len, len, 64); 3610 add(buf, buf, 8); 3611 cmn(len, 128); 3612 br(Assembler::NE, CRC_less64); 3613 BIND(L_exit); 3614 } 3615 3616 /** 3617 * @param crc register containing existing CRC (32-bit) 3618 * @param buf register pointing to input byte buffer (byte*) 3619 * @param len register containing number of bytes 3620 * @param table register that will contain address of CRC table 3621 * @param tmp scratch register 3622 */ 3623 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len, 3624 Register table0, Register table1, Register table2, Register table3, 3625 Register tmp, Register tmp2, Register tmp3) { 3626 kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3); 3627 } 3628 3629 3630 SkipIfEqual::SkipIfEqual( 3631 MacroAssembler* masm, const bool* flag_addr, bool value) { 3632 _masm = masm; 3633 unsigned long offset; 3634 _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset); 3635 _masm->ldrb(rscratch1, Address(rscratch1, offset)); 3636 _masm->cbzw(rscratch1, _label); 3637 } 3638 3639 SkipIfEqual::~SkipIfEqual() { 3640 _masm->bind(_label); 3641 } 3642 3643 void MacroAssembler::addptr(const Address &dst, int32_t src) { 3644 Address adr; 3645 switch(dst.getMode()) { 3646 case Address::base_plus_offset: 3647 // This is the expected mode, although we allow all the other 3648 // forms below. 
3649 adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord); 3650 break; 3651 default: 3652 lea(rscratch2, dst); 3653 adr = Address(rscratch2); 3654 break; 3655 } 3656 ldr(rscratch1, adr); 3657 add(rscratch1, rscratch1, src); 3658 str(rscratch1, adr); 3659 } 3660 3661 void MacroAssembler::cmpptr(Register src1, Address src2) { 3662 unsigned long offset; 3663 adrp(rscratch1, src2, offset); 3664 ldr(rscratch1, Address(rscratch1, offset)); 3665 cmp(src1, rscratch1); 3666 } 3667 3668 void MacroAssembler::cmpoop(Register obj1, Register obj2) { 3669 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3670 bs->obj_equals(this, obj1, obj2); 3671 } 3672 3673 void MacroAssembler::load_klass(Register dst, Register src) { 3674 if (UseCompressedClassPointers) { 3675 ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes())); 3676 decode_klass_not_null(dst); 3677 } else { 3678 ldr(dst, Address(src, oopDesc::klass_offset_in_bytes())); 3679 } 3680 } 3681 3682 // ((OopHandle)result).resolve(); 3683 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) { 3684 // OopHandle::resolve is an indirection. 
3685 access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg); 3686 } 3687 3688 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) { 3689 const int mirror_offset = in_bytes(Klass::java_mirror_offset()); 3690 ldr(dst, Address(rmethod, Method::const_offset())); 3691 ldr(dst, Address(dst, ConstMethod::constants_offset())); 3692 ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes())); 3693 ldr(dst, Address(dst, mirror_offset)); 3694 resolve_oop_handle(dst, tmp); 3695 } 3696 3697 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) { 3698 if (UseCompressedClassPointers) { 3699 ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes())); 3700 if (Universe::narrow_klass_base() == NULL) { 3701 cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift()); 3702 return; 3703 } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0 3704 && Universe::narrow_klass_shift() == 0) { 3705 // Only the bottom 32 bits matter 3706 cmpw(trial_klass, tmp); 3707 return; 3708 } 3709 decode_klass_not_null(tmp); 3710 } else { 3711 ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes())); 3712 } 3713 cmp(trial_klass, tmp); 3714 } 3715 3716 void MacroAssembler::load_prototype_header(Register dst, Register src) { 3717 load_klass(dst, src); 3718 ldr(dst, Address(dst, Klass::prototype_header_offset())); 3719 } 3720 3721 void MacroAssembler::store_klass(Register dst, Register src) { 3722 // FIXME: Should this be a store release? concurrent gcs assumes 3723 // klass length is valid if klass field is not null. 
3724 if (UseCompressedClassPointers) { 3725 encode_klass_not_null(src); 3726 strw(src, Address(dst, oopDesc::klass_offset_in_bytes())); 3727 } else { 3728 str(src, Address(dst, oopDesc::klass_offset_in_bytes())); 3729 } 3730 } 3731 3732 void MacroAssembler::store_klass_gap(Register dst, Register src) { 3733 if (UseCompressedClassPointers) { 3734 // Store to klass gap in destination 3735 strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes())); 3736 } 3737 } 3738 3739 // Algorithm must match CompressedOops::encode. 3740 void MacroAssembler::encode_heap_oop(Register d, Register s) { 3741 #ifdef ASSERT 3742 verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?"); 3743 #endif 3744 verify_oop(s, "broken oop in encode_heap_oop"); 3745 if (Universe::narrow_oop_base() == NULL) { 3746 if (Universe::narrow_oop_shift() != 0) { 3747 assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3748 lsr(d, s, LogMinObjAlignmentInBytes); 3749 } else { 3750 mov(d, s); 3751 } 3752 } else { 3753 subs(d, s, rheapbase); 3754 csel(d, d, zr, Assembler::HS); 3755 lsr(d, d, LogMinObjAlignmentInBytes); 3756 3757 /* Old algorithm: is this any worse? 
3758 Label nonnull; 3759 cbnz(r, nonnull); 3760 sub(r, r, rheapbase); 3761 bind(nonnull); 3762 lsr(r, r, LogMinObjAlignmentInBytes); 3763 */ 3764 } 3765 } 3766 3767 void MacroAssembler::encode_heap_oop_not_null(Register r) { 3768 #ifdef ASSERT 3769 verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?"); 3770 if (CheckCompressedOops) { 3771 Label ok; 3772 cbnz(r, ok); 3773 stop("null oop passed to encode_heap_oop_not_null"); 3774 bind(ok); 3775 } 3776 #endif 3777 verify_oop(r, "broken oop in encode_heap_oop_not_null"); 3778 if (Universe::narrow_oop_base() != NULL) { 3779 sub(r, r, rheapbase); 3780 } 3781 if (Universe::narrow_oop_shift() != 0) { 3782 assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3783 lsr(r, r, LogMinObjAlignmentInBytes); 3784 } 3785 } 3786 3787 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) { 3788 #ifdef ASSERT 3789 verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?"); 3790 if (CheckCompressedOops) { 3791 Label ok; 3792 cbnz(src, ok); 3793 stop("null oop passed to encode_heap_oop_not_null2"); 3794 bind(ok); 3795 } 3796 #endif 3797 verify_oop(src, "broken oop in encode_heap_oop_not_null2"); 3798 3799 Register data = src; 3800 if (Universe::narrow_oop_base() != NULL) { 3801 sub(dst, src, rheapbase); 3802 data = dst; 3803 } 3804 if (Universe::narrow_oop_shift() != 0) { 3805 assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3806 lsr(dst, data, LogMinObjAlignmentInBytes); 3807 data = dst; 3808 } 3809 if (data == src) 3810 mov(dst, src); 3811 } 3812 3813 void MacroAssembler::decode_heap_oop(Register d, Register s) { 3814 #ifdef ASSERT 3815 verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?"); 3816 #endif 3817 if (Universe::narrow_oop_base() == NULL) { 3818 if (Universe::narrow_oop_shift() != 0 || d != s) { 3819 lsl(d, s, Universe::narrow_oop_shift()); 3820 } 3821 } else { 
3822 Label done; 3823 if (d != s) 3824 mov(d, s); 3825 cbz(s, done); 3826 add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes); 3827 bind(done); 3828 } 3829 verify_oop(d, "broken oop in decode_heap_oop"); 3830 } 3831 3832 void MacroAssembler::decode_heap_oop_not_null(Register r) { 3833 assert (UseCompressedOops, "should only be used for compressed headers"); 3834 assert (Universe::heap() != NULL, "java heap should be initialized"); 3835 // Cannot assert, unverified entry point counts instructions (see .ad file) 3836 // vtableStubs also counts instructions in pd_code_size_limit. 3837 // Also do not verify_oop as this is called by verify_oop. 3838 if (Universe::narrow_oop_shift() != 0) { 3839 assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3840 if (Universe::narrow_oop_base() != NULL) { 3841 add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes); 3842 } else { 3843 add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes); 3844 } 3845 } else { 3846 assert (Universe::narrow_oop_base() == NULL, "sanity"); 3847 } 3848 } 3849 3850 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) { 3851 assert (UseCompressedOops, "should only be used for compressed headers"); 3852 assert (Universe::heap() != NULL, "java heap should be initialized"); 3853 // Cannot assert, unverified entry point counts instructions (see .ad file) 3854 // vtableStubs also counts instructions in pd_code_size_limit. 3855 // Also do not verify_oop as this is called by verify_oop. 
3856 if (Universe::narrow_oop_shift() != 0) { 3857 assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3858 if (Universe::narrow_oop_base() != NULL) { 3859 add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes); 3860 } else { 3861 add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes); 3862 } 3863 } else { 3864 assert (Universe::narrow_oop_base() == NULL, "sanity"); 3865 if (dst != src) { 3866 mov(dst, src); 3867 } 3868 } 3869 } 3870 3871 void MacroAssembler::encode_klass_not_null(Register dst, Register src) { 3872 if (Universe::narrow_klass_base() == NULL) { 3873 if (Universe::narrow_klass_shift() != 0) { 3874 assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); 3875 lsr(dst, src, LogKlassAlignmentInBytes); 3876 } else { 3877 if (dst != src) mov(dst, src); 3878 } 3879 return; 3880 } 3881 3882 if (use_XOR_for_compressed_class_base) { 3883 if (Universe::narrow_klass_shift() != 0) { 3884 eor(dst, src, (uint64_t)Universe::narrow_klass_base()); 3885 lsr(dst, dst, LogKlassAlignmentInBytes); 3886 } else { 3887 eor(dst, src, (uint64_t)Universe::narrow_klass_base()); 3888 } 3889 return; 3890 } 3891 3892 if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0 3893 && Universe::narrow_klass_shift() == 0) { 3894 movw(dst, src); 3895 return; 3896 } 3897 3898 #ifdef ASSERT 3899 verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?"); 3900 #endif 3901 3902 Register rbase = dst; 3903 if (dst == src) rbase = rheapbase; 3904 mov(rbase, (uint64_t)Universe::narrow_klass_base()); 3905 sub(dst, src, rbase); 3906 if (Universe::narrow_klass_shift() != 0) { 3907 assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); 3908 lsr(dst, dst, LogKlassAlignmentInBytes); 3909 } 3910 if (dst == src) reinit_heapbase(); 3911 } 3912 3913 void MacroAssembler::encode_klass_not_null(Register r) { 3914 encode_klass_not_null(r, r); 3915 } 3916 3917 void 
MacroAssembler::decode_klass_not_null(Register dst, Register src) { 3918 Register rbase = dst; 3919 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 3920 3921 if (Universe::narrow_klass_base() == NULL) { 3922 if (Universe::narrow_klass_shift() != 0) { 3923 assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); 3924 lsl(dst, src, LogKlassAlignmentInBytes); 3925 } else { 3926 if (dst != src) mov(dst, src); 3927 } 3928 return; 3929 } 3930 3931 if (use_XOR_for_compressed_class_base) { 3932 if (Universe::narrow_klass_shift() != 0) { 3933 lsl(dst, src, LogKlassAlignmentInBytes); 3934 eor(dst, dst, (uint64_t)Universe::narrow_klass_base()); 3935 } else { 3936 eor(dst, src, (uint64_t)Universe::narrow_klass_base()); 3937 } 3938 return; 3939 } 3940 3941 if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0 3942 && Universe::narrow_klass_shift() == 0) { 3943 if (dst != src) 3944 movw(dst, src); 3945 movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32); 3946 return; 3947 } 3948 3949 // Cannot assert, unverified entry point counts instructions (see .ad file) 3950 // vtableStubs also counts instructions in pd_code_size_limit. 3951 // Also do not verify_oop as this is called by verify_oop. 
3952 if (dst == src) rbase = rheapbase; 3953 mov(rbase, (uint64_t)Universe::narrow_klass_base()); 3954 if (Universe::narrow_klass_shift() != 0) { 3955 assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); 3956 add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes); 3957 } else { 3958 add(dst, rbase, src); 3959 } 3960 if (dst == src) reinit_heapbase(); 3961 } 3962 3963 void MacroAssembler::decode_klass_not_null(Register r) { 3964 decode_klass_not_null(r, r); 3965 } 3966 3967 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) { 3968 #ifdef ASSERT 3969 { 3970 ThreadInVMfromUnknown tiv; 3971 assert (UseCompressedOops, "should only be used for compressed oops"); 3972 assert (Universe::heap() != NULL, "java heap should be initialized"); 3973 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 3974 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop"); 3975 } 3976 #endif 3977 int oop_index = oop_recorder()->find_index(obj); 3978 InstructionMark im(this); 3979 RelocationHolder rspec = oop_Relocation::spec(oop_index); 3980 code_section()->relocate(inst_mark(), rspec); 3981 movz(dst, 0xDEAD, 16); 3982 movk(dst, 0xBEEF); 3983 } 3984 3985 void MacroAssembler::set_narrow_klass(Register dst, Klass* k) { 3986 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 3987 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 3988 int index = oop_recorder()->find_index(k); 3989 assert(! 
Universe::heap()->is_in_reserved(k), "should not be an oop"); 3990 3991 InstructionMark im(this); 3992 RelocationHolder rspec = metadata_Relocation::spec(index); 3993 code_section()->relocate(inst_mark(), rspec); 3994 narrowKlass nk = Klass::encode_klass(k); 3995 movz(dst, (nk >> 16), 16); 3996 movk(dst, nk & 0xffff); 3997 } 3998 3999 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, 4000 Register dst, Address src, 4001 Register tmp1, Register thread_tmp) { 4002 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 4003 decorators = AccessInternal::decorator_fixup(decorators); 4004 bool as_raw = (decorators & AS_RAW) != 0; 4005 if (as_raw) { 4006 bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp); 4007 } else { 4008 bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp); 4009 } 4010 } 4011 4012 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, 4013 Address dst, Register src, 4014 Register tmp1, Register thread_tmp) { 4015 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 4016 decorators = AccessInternal::decorator_fixup(decorators); 4017 bool as_raw = (decorators & AS_RAW) != 0; 4018 if (as_raw) { 4019 bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp); 4020 } else { 4021 bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp); 4022 } 4023 } 4024 4025 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) { 4026 // Use stronger ACCESS_WRITE|ACCESS_READ by default. 
4027 if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) { 4028 decorators |= ACCESS_READ | ACCESS_WRITE; 4029 } 4030 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 4031 return bs->resolve(this, decorators, obj); 4032 } 4033 4034 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1, 4035 Register thread_tmp, DecoratorSet decorators) { 4036 access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp); 4037 } 4038 4039 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1, 4040 Register thread_tmp, DecoratorSet decorators) { 4041 access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp); 4042 } 4043 4044 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1, 4045 Register thread_tmp, DecoratorSet decorators) { 4046 access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp); 4047 } 4048 4049 // Used for storing NULLs. 4050 void MacroAssembler::store_heap_oop_null(Address dst) { 4051 access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg); 4052 } 4053 4054 Address MacroAssembler::allocate_metadata_address(Metadata* obj) { 4055 assert(oop_recorder() != NULL, "this assembler needs a Recorder"); 4056 int index = oop_recorder()->allocate_metadata_index(obj); 4057 RelocationHolder rspec = metadata_Relocation::spec(index); 4058 return Address((address)obj, rspec); 4059 } 4060 4061 // Move an oop into a register. immediate is true if we want 4062 // immediate instrcutions, i.e. we are not going to patch this 4063 // instruction while the code is being executed by another thread. In 4064 // that case we can use move immediates rather than the constant pool. 
4065 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) { 4066 int oop_index; 4067 if (obj == NULL) { 4068 oop_index = oop_recorder()->allocate_oop_index(obj); 4069 } else { 4070 #ifdef ASSERT 4071 { 4072 ThreadInVMfromUnknown tiv; 4073 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop"); 4074 } 4075 #endif 4076 oop_index = oop_recorder()->find_index(obj); 4077 } 4078 RelocationHolder rspec = oop_Relocation::spec(oop_index); 4079 if (! immediate) { 4080 address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address 4081 ldr_constant(dst, Address(dummy, rspec)); 4082 } else 4083 mov(dst, Address((address)obj, rspec)); 4084 } 4085 4086 // Move a metadata address into a register. 4087 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) { 4088 int oop_index; 4089 if (obj == NULL) { 4090 oop_index = oop_recorder()->allocate_metadata_index(obj); 4091 } else { 4092 oop_index = oop_recorder()->find_index(obj); 4093 } 4094 RelocationHolder rspec = metadata_Relocation::spec(oop_index); 4095 mov(dst, Address((address)obj, rspec)); 4096 } 4097 4098 Address MacroAssembler::constant_oop_address(jobject obj) { 4099 #ifdef ASSERT 4100 { 4101 ThreadInVMfromUnknown tiv; 4102 assert(oop_recorder() != NULL, "this assembler needs an OopRecorder"); 4103 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop"); 4104 } 4105 #endif 4106 int oop_index = oop_recorder()->find_index(obj); 4107 return Address((address)obj, oop_Relocation::spec(oop_index)); 4108 } 4109 4110 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes. 
4111 void MacroAssembler::tlab_allocate(Register obj, 4112 Register var_size_in_bytes, 4113 int con_size_in_bytes, 4114 Register t1, 4115 Register t2, 4116 Label& slow_case) { 4117 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 4118 bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case); 4119 } 4120 4121 // Defines obj, preserves var_size_in_bytes 4122 void MacroAssembler::eden_allocate(Register obj, 4123 Register var_size_in_bytes, 4124 int con_size_in_bytes, 4125 Register t1, 4126 Label& slow_case) { 4127 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 4128 bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case); 4129 } 4130 4131 // Zero words; len is in bytes 4132 // Destroys all registers except addr 4133 // len must be a nonzero multiple of wordSize 4134 void MacroAssembler::zero_memory(Register addr, Register len, Register t1) { 4135 assert_different_registers(addr, len, t1, rscratch1, rscratch2); 4136 4137 #ifdef ASSERT 4138 { Label L; 4139 tst(len, BytesPerWord - 1); 4140 br(Assembler::EQ, L); 4141 stop("len is not a multiple of BytesPerWord"); 4142 bind(L); 4143 } 4144 #endif 4145 4146 #ifndef PRODUCT 4147 block_comment("zero memory"); 4148 #endif 4149 4150 Label loop; 4151 Label entry; 4152 4153 // Algorithm: 4154 // 4155 // scratch1 = cnt & 7; 4156 // cnt -= scratch1; 4157 // p += scratch1; 4158 // switch (scratch1) { 4159 // do { 4160 // cnt -= 8; 4161 // p[-8] = 0; 4162 // case 7: 4163 // p[-7] = 0; 4164 // case 6: 4165 // p[-6] = 0; 4166 // // ... 
4167 // case 1: 4168 // p[-1] = 0; 4169 // case 0: 4170 // p += 8; 4171 // } while (cnt); 4172 // } 4173 4174 const int unroll = 8; // Number of str(zr) instructions we'll unroll 4175 4176 lsr(len, len, LogBytesPerWord); 4177 andr(rscratch1, len, unroll - 1); // tmp1 = cnt % unroll 4178 sub(len, len, rscratch1); // cnt -= unroll 4179 // t1 always points to the end of the region we're about to zero 4180 add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord); 4181 adr(rscratch2, entry); 4182 sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2); 4183 br(rscratch2); 4184 bind(loop); 4185 sub(len, len, unroll); 4186 for (int i = -unroll; i < 0; i++) 4187 Assembler::str(zr, Address(t1, i * wordSize)); 4188 bind(entry); 4189 add(t1, t1, unroll * wordSize); 4190 cbnz(len, loop); 4191 } 4192 4193 void MacroAssembler::verify_tlab() { 4194 #ifdef ASSERT 4195 if (UseTLAB && VerifyOops) { 4196 Label next, ok; 4197 4198 stp(rscratch2, rscratch1, Address(pre(sp, -16))); 4199 4200 ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset()))); 4201 ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset()))); 4202 cmp(rscratch2, rscratch1); 4203 br(Assembler::HS, next); 4204 STOP("assert(top >= start)"); 4205 should_not_reach_here(); 4206 4207 bind(next); 4208 ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset()))); 4209 ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset()))); 4210 cmp(rscratch2, rscratch1); 4211 br(Assembler::HS, ok); 4212 STOP("assert(top <= end)"); 4213 should_not_reach_here(); 4214 4215 bind(ok); 4216 ldp(rscratch2, rscratch1, Address(post(sp, 16))); 4217 } 4218 #endif 4219 } 4220 4221 // Writes to stack successive pages until offset reached to check for 4222 // stack overflow + shadow pages. This clobbers tmp. 
4223 void MacroAssembler::bang_stack_size(Register size, Register tmp) { 4224 assert_different_registers(tmp, size, rscratch1); 4225 mov(tmp, sp); 4226 // Bang stack for total size given plus shadow page size. 4227 // Bang one page at a time because large size can bang beyond yellow and 4228 // red zones. 4229 Label loop; 4230 mov(rscratch1, os::vm_page_size()); 4231 bind(loop); 4232 lea(tmp, Address(tmp, -os::vm_page_size())); 4233 subsw(size, size, rscratch1); 4234 str(size, Address(tmp)); 4235 br(Assembler::GT, loop); 4236 4237 // Bang down shadow pages too. 4238 // At this point, (tmp-0) is the last address touched, so don't 4239 // touch it again. (It was touched as (tmp-pagesize) but then tmp 4240 // was post-decremented.) Skip this address by starting at i=1, and 4241 // touch a few more pages below. N.B. It is important to touch all 4242 // the way down to and including i=StackShadowPages. 4243 for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) { 4244 // this could be any sized move but this is can be a debugging crumb 4245 // so the bigger the better. 4246 lea(tmp, Address(tmp, -os::vm_page_size())); 4247 str(size, Address(tmp)); 4248 } 4249 } 4250 4251 4252 // Move the address of the polling page into dest. 4253 void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) { 4254 if (SafepointMechanism::uses_thread_local_poll()) { 4255 ldr(dest, Address(rthread, Thread::polling_page_offset())); 4256 } else { 4257 unsigned long off; 4258 adrp(dest, Address(page, rtype), off); 4259 assert(off == 0, "polling page must be page aligned"); 4260 } 4261 } 4262 4263 // Move the address of the polling page into r, then read the polling 4264 // page. 4265 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) { 4266 get_polling_page(r, page, rtype); 4267 return read_polling_page(r, rtype); 4268 } 4269 4270 // Read the polling page. 
// The address of the polling page must
// already be in r.
// Emits a 32-bit load from [r] into zr, tagged with a relocation of type
// rtype, and returns the instruction mark taken before the load was emitted.
address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
  InstructionMark im(this);
  code_section()->relocate(inst_mark(), rtype);
  ldrw(zr, Address(r, 0));
  return inst_mark();
}

// Load the 4K-page address of the literal address dest into reg1 and return
// the low 12 bits of the target in byte_offset.
//
// A single _adrp is used only if the target page lies within 2^20 pages of
// both ends of the code cache. Otherwise _adrp is applied to an address built
// from the low 32 bits of the target and bits 32..47 of the current pc, and a
// movk then inserts bits 32..47 of the real target.
//
// NOTE(review): the local 'rtype' is computed but not used in this function.
void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
  relocInfo::relocType rtype = dest.rspec().reloc()->type();
  // Page numbers (address >> 12) of the code cache bounds and of the target.
  unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
  unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
  unsigned long dest_page = (unsigned long)dest.target() >> 12;
  long offset_low = dest_page - low_page;
  long offset_high = dest_page - high_page;

  assert(is_valid_AArch64_address(dest.target()), "bad address");
  assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");

  InstructionMark im(this);
  code_section()->relocate(inst_mark(), dest.rspec());
  // 8143067: Ensure that the adrp can reach the dest from anywhere within
  // the code cache so that if it is relocated we know it will still reach
  if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
    _adrp(reg1, dest.target());
  } else {
    unsigned long target = (unsigned long)dest.target();
    // Same low 32 bits as the target, bits 32..47 taken from the current pc.
    unsigned long adrp_target
      = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);

    _adrp(reg1, (address)adrp_target);
    // Replace bits 32..47 with those of the real target.
    movk(reg1, target >> 32, 32);
  }
  byte_offset = (unsigned long)dest.target() & 0xfff;
}

// Load the card table's byte_map_base into reg. adrp (plus an add for any
// in-page offset) is used when the value passes is_valid_AArch64_address;
// otherwise the value is loaded as a plain 64-bit immediate.
void MacroAssembler::load_byte_map_base(Register reg) {
  CardTable::CardValue* byte_map_base =
    ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();

  if (is_valid_AArch64_address((address)byte_map_base)) {
    // Strictly speaking the byte_map_base isn't an address at all,
    // and it might even be negative.
    unsigned long offset;
    adrp(reg, ExternalAddress((address)byte_map_base), offset);
    // We expect offset to be zero with most collectors.
    if (offset != 0) {
      add(reg, reg, offset);
    }
  } else {
    mov(reg, (uint64_t)byte_map_base);
  }
}

// Allocate a stack frame of framesize bytes and save rfp/lr in its top two
// words. Three sequences are used depending on framesize:
//  - below (1 << 9) + 2 words: drop sp once, store the pair at an offset;
//  - below (1 << 12) + 2 words: push the pair, then drop sp by an immediate;
//  - otherwise: push the pair, then drop sp by a value held in rscratch1.
// NOTE(review): the thresholds presumably track the stp offset and sub
// immediate encoding ranges - confirm against the A64 encodings.
// rfp is only updated when PreserveFramePointer is set.
void MacroAssembler::build_frame(int framesize) {
  assert(framesize > 0, "framesize must be > 0");
  if (framesize < ((1 << 9) + 2 * wordSize)) {
    sub(sp, sp, framesize);
    stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
    if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
  } else {
    stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
    if (PreserveFramePointer) mov(rfp, sp);
    if (framesize < ((1 << 12) + 2 * wordSize))
      sub(sp, sp, framesize - 2 * wordSize);
    else {
      mov(rscratch1, framesize - 2 * wordSize);
      sub(sp, sp, rscratch1);
    }
  }
}

// Inverse of build_frame: reload rfp/lr and release framesize bytes of stack,
// using the same three size classes. The largest case clobbers rscratch1.
void MacroAssembler::remove_frame(int framesize) {
  assert(framesize > 0, "framesize must be > 0");
  if (framesize < ((1 << 9) + 2 * wordSize)) {
    ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
    add(sp, sp, framesize);
  } else {
    if (framesize < ((1 << 12) + 2 * wordSize))
      add(sp, sp, framesize - 2 * wordSize);
    else {
      mov(rscratch1, framesize - 2 * wordSize);
      add(sp, sp, rscratch1);
    }
    ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
  }
}

#ifdef COMPILER2
// Pointer to a MacroAssembler load instruction taking (Rt, address); used to
// select byte/halfword/word/doubleword loads according to string encoding.
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

// Search for str1 in str2 and return index or -1
//
//   str2, cnt2  source string and its length in characters
//   str1, cnt1  pattern string and its length in characters
//   tmp1..tmp6  temporaries (tmp5/tmp6 may be zr in some variants, see below)
//   icnt1       pattern length when it is a compile-time constant (1..4 are
//               handled specially below), or -1 when it is only known at run
//               time in cnt1
//   result      receives the index of the first match, or -1
//   ae          StrIntrinsicNode encoding pair (LL, UU, UL, LU) selecting
//               Latin1 (1 byte) or UTF-16 (2 byte) characters per string
//
// rscratch1, rscratch2 and (on the run-time-length path) v0 are used as
// additional temporaries; the Boyer-Moore path also uses 256 bytes of stack.
void MacroAssembler::string_indexof(Register str2, Register str1,
                                    Register cnt2, Register cnt1,
                                    Register tmp1, Register tmp2,
                                    Register tmp3, Register tmp4,
                                    Register tmp5, Register tmp6,
                                    int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  // Per-string character width: shift 0 / size 1 for Latin1, shift 1 / size 2
  // for UTF-16, plus the matching single-character load instruction.
  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    // result_tmp = cnt2 - cnt1: the last source index at which a match can start.
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

// The Boyer Moore algorithm is based on the description here:-
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with 2 shift rules. The 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:-
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has a few Java-specific optimizations.
//
// #define ASIZE 256
//
//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
//       int i, j;
//       unsigned c;
//       unsigned char bc[ASIZE];
//
//       /* Preprocessing */
//       for (i = 0; i < ASIZE; ++i)
//          bc[i] = m;
//       for (i = 0; i < m - 1; ) {
//          c = x[i];
//          ++i;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef PATTERN_STRING_IS_LATIN1
//          bc[c] = m - i;
//          #else
//          if (c < ASIZE) bc[c] = m - i;
//          #endif
//       }
//
//       /* Searching */
//       j = 0;
//       while (j <= n - m) {
//          c = y[i+j];
//          if (x[m-1] == c)
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//          if (i < 0) return j;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c< 256) always true. Remove branch
//          j += bc[y[j+m-1]];
//          #endif
//          #ifndef PATTERN_STRING_IS_UTF
//          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += 1
//          #endif
//          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
//          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += m
//          #endif
//       }
//    }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >=8, so, we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
    // UL case. We'll re-read last character in inner pre-loop code to have
    // single outer pre-loop load
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // amount of bytes stored per instruction
    // Reserve the ASIZE-byte bad-character table on the stack and fill every
    // entry with cnt1 (v0 was set to 16 copies of cnt1 by the dup above).
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);

      sub(cnt1tmp, cnt1, 1);
      mov(tmp5, str2);          // keep the original str2 to compute the result index
      add(str2end, str2, result_tmp, LSL, str2_chr_shift);
      sub(ch2, cnt1, 1);
      mov(tmp3, str1);
    // Fill the table: for each of the first cnt1-1 pattern characters store
    // its distance from the pattern end (characters >= ASIZE are skipped).
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        subs(zr, ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);

      add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
      if (str1_isL == str2_isL) {
        // load last 8 bytes (8LL/4UU symbols)
        ldr(tmp6, Address(tmp6, -wordSize));
      } else {
        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
        // convert Latin1 to UTF. We'll have to wait until load completed, but
        // it's still faster than per-character loads+checks
        lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
        ubfx(ch1, tmp6, 8, 8); // str1[N-2]
        ubfx(ch2, tmp6, 16, 8); // str1[N-3]
        andr(tmp6, tmp6, 0xFF); // str1[N-4]
        orr(ch2, ch1, ch2, LSL, 16);
        orr(tmp6, tmp6, tmp3, LSL, 48);
        orr(tmp6, tmp6, ch2, LSL, 16);
      }
    // Outer search loop: one iteration per candidate position in str2.
    BIND(BMLOOPSTR2);
      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
      if (str1_isL == str2_isL) {
        // re-init tmp3. It's for free because it's executed in parallel with
        // load above. Alternative is to initialize it before loop, but it'll
        // affect performance on in-order systems with 2 or more ld/st pipelines
        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
      }
      if (!isL) { // UU/UL case
        lsl(ch2, cnt1tmp, 1); // offset in bytes
      }
      cmp(tmp3, skipch);
      br(NE, BMSKIP);
      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
      mov(ch1, tmp6);
      if (isL) {
        b(BMLOOPSTR1_AFTER_LOAD);
      } else {
        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
        b(BMLOOPSTR1_CMP);
      }
    // Inner loop: compare the remaining characters, moving towards index 0.
    BIND(BMLOOPSTR1);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
      cmp(ch1, ch2);
      br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
      if (!isL) {
        // if we've met UTF symbol while searching Latin1 pattern, then we can
        // skip cnt1 symbols
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
        subs(zr, skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
      add(str2, str2, result_tmp, LSL, str2_chr_shift);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      add(sp, sp, ASIZE);   // release the bad-character table
      b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
      cmp(ch1, ch2);
      br(NE, BMSKIP);
    BIND(BMMATCH);
      // Match: index = (str2 - original str2), converted from bytes to chars.
      sub(result, str2, tmp5);
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE);   // release the bad-character table
      b(DONE);

    // Patterns rejected for Boyer-Moore above: lengths below 16 go to the
    // inline medium linear search, the rest call an out-of-line stub.
    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = NULL;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
       assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
    }
    trampoline_call(stub);
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

        cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
        br(LT, DOSHORT);
      // Generic linear search. Both strings are addressed from their ends
      // with negative byte offsets (cnt1_neg / cnt2_neg) that count up to 0.
      BIND(LINEAR_MEDIUM);
        (this->*str1_load_1chr)(first, Address(str1));
        lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
        sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      // Scan str2 for the first pattern character.
      BIND(FIRST_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmp(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      // First character matched: compare the rest of the pattern.
      BIND(STR1_LOOP);
        adds(cnt1tmp, cnt1_neg, str1_chr_size);
        add(cnt2tmp, cnt2_neg, str2_chr_size);
        br(GE, MATCH);

      BIND(STR1_NEXT);
        (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        adds(cnt1tmp, cnt1tmp, str1_chr_size);
        add(cnt2tmp, cnt2tmp, str2_chr_size);
        br(LT, STR1_NEXT);
        b(MATCH);

      // Short run-time pattern: with equal encodings dispatch on length to
      // DO1 (< 2), DO2 (fall through, == 2) or DO3 (> 2).
      BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    // Constant pattern length 4: compare all four characters with one load.
    if (icnt1 == 4) {
      Label CH1_LOOP;

        (this->*load_4chr)(ch1, str1);
        sub(result_tmp, cnt2, 4);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(CH1_LOOP);
        (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
      }

    // Pattern length 2: compare both characters with one load.
    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      Label CH1_LOOP;

      BIND(DO2);
        (this->*load_2chr)(ch1, str1);
        if (icnt1 == 2) {
          sub(result_tmp, cnt2, 2);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(CH1_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    // Pattern length 3: match the first two characters with one load, then
    // check the third separately.
    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
        (this->*load_2chr)(first, str1);
        (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
        if (icnt1 == 3) {
          sub(result_tmp, cnt2, 3);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(FIRST_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmpw(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        b(MATCH);
    }

    // Pattern length 1: search for a single character, 8 bytes at a time
    // when the source is long enough.
    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
        (this->*str1_load_1chr)(ch1, str1);
        cmp(cnt2, (u1)8);
        br(LT, DO1_SHORT);

        sub(result_tmp, cnt2, 8/str2_chr_size);
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
        mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

        // Replicate the search character into every lane of ch1.
        if (str2_isL) {
          orr(ch1, ch1, ch1, LSL, 8);
        }
        orr(ch1, ch1, ch1, LSL, 16);
        orr(ch1, ch1, ch1, LSL, 32);
      BIND(CH1_LOOP);
        ldr(ch2, Address(str2, cnt2_neg));
        // After the eor a matching lane is zero; the sub/orr/bics sequence
        // sets the top bit of the first such lane.
        eor(ch2, ch1, ch2);
        sub(tmp1, ch2, tmp3);
        orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
        bics(tmp1, tmp1, tmp2);
        br(NE, HAS_ZERO);
        adds(cnt2_neg, cnt2_neg, 8);
        br(LT, CH1_LOOP);

        // Fewer than 8 bytes remain past the last full step: run one more
        // iteration on the final 8 bytes of the source (offset 0).
        cmp(cnt2_neg, (u1)8);
        mov(cnt2_neg, 0);
        br(LT, CH1_LOOP);
        b(NOMATCH);

      BIND(HAS_ZERO);
        // Byte-reverse and count leading zeros to find the first flagged
        // lane; >> 3 converts the bit position to a byte offset.
        rev(tmp1, tmp1);
        clz(tmp1, tmp1);
        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
        b(MATCH);

      // Source shorter than 8: plain one-character-at-a-time loop.
      BIND(DO1_SHORT);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmpw(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // cnt2_neg is a (non-positive) byte offset from the position recorded in
    // result_tmp; convert it to characters and add.
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

// Member-function-pointer types used to select load / extend instructions.
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                         Register ch, Register result,
                                         Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register
cnt1_neg = cnt1; 4803 Register ch1 = rscratch1; 4804 Register result_tmp = rscratch2; 4805 4806 cmp(cnt1, (u1)4); 4807 br(LT, DO1_SHORT); 4808 4809 orr(ch, ch, ch, LSL, 16); 4810 orr(ch, ch, ch, LSL, 32); 4811 4812 sub(cnt1, cnt1, 4); 4813 mov(result_tmp, cnt1); 4814 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 4815 sub(cnt1_neg, zr, cnt1, LSL, 1); 4816 4817 mov(tmp3, 0x0001000100010001); 4818 4819 BIND(CH1_LOOP); 4820 ldr(ch1, Address(str1, cnt1_neg)); 4821 eor(ch1, ch, ch1); 4822 sub(tmp1, ch1, tmp3); 4823 orr(tmp2, ch1, 0x7fff7fff7fff7fff); 4824 bics(tmp1, tmp1, tmp2); 4825 br(NE, HAS_ZERO); 4826 adds(cnt1_neg, cnt1_neg, 8); 4827 br(LT, CH1_LOOP); 4828 4829 cmp(cnt1_neg, (u1)8); 4830 mov(cnt1_neg, 0); 4831 br(LT, CH1_LOOP); 4832 b(NOMATCH); 4833 4834 BIND(HAS_ZERO); 4835 rev(tmp1, tmp1); 4836 clz(tmp1, tmp1); 4837 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 4838 b(MATCH); 4839 4840 BIND(DO1_SHORT); 4841 mov(result_tmp, cnt1); 4842 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 4843 sub(cnt1_neg, zr, cnt1, LSL, 1); 4844 BIND(DO1_LOOP); 4845 ldrh(ch1, Address(str1, cnt1_neg)); 4846 cmpw(ch, ch1); 4847 br(EQ, MATCH); 4848 adds(cnt1_neg, cnt1_neg, 2); 4849 br(LT, DO1_LOOP); 4850 BIND(NOMATCH); 4851 mov(result, -1); 4852 b(DONE); 4853 BIND(MATCH); 4854 add(result, result_tmp, cnt1_neg, ASR, 1); 4855 BIND(DONE); 4856 } 4857 4858 // Compare strings. 
// Emit code that compares two strings character by character.
//
// ae selects the encodings of the operands (StrIntrinsicNode::LL, UU, LU
// or UL): an 'L' operand is read one byte per character, a 'U' operand two
// bytes per character.
// cnt1 and cnt2 arrive as byte counts and are converted to character counts
// below.
// result receives the difference of the first pair of characters that
// differ or, when no difference is found within the shorter length, the
// difference of the character counts.
// str1, str2, cnt1, cnt2, tmp1, tmp2, rscratch1 and rscratch2 are written
// by the emitted code.  vtmp3 is not referenced in this function body.
// NOTE(review): vtmp3 is presumably reserved for the long-string stubs -
// confirm.
void MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  // Minimum common length (in characters) from which the out-of-line stub
  // is used instead of the inline loop.
  const u1 STUB_THRESHOLD = 64 + 8;
  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  // Number of characters compared per 8-byte step.
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  // vtmpZ is zeroed and interleaved (zip1) with 1-byte characters to widen
  // them to 2-byte characters in the mixed-encoding cases.
  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  // result keeps cnt1 - cnt2 unless a differing character is found.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      // Same address: nothing can differ, result already holds the
      // length difference.
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, STUB_THRESHOLD);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      // Point both strings at their last full word and turn cnt2 into a
      // negative byte offset that counts up towards zero.
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      // str1 is 1 byte/char: load 4 bytes and widen them to 4 chars.
      ldrs(vtmp, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, STUB_THRESHOLD);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      // cnt1 / cnt2 become the negative byte offsets into str1 / str2.
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      // str2 is 1 byte/char: load 4 bytes and widen them to 4 chars.
      ldrs(vtmp, Address(str2));
      cmp(cnt2, STUB_THRESHOLD);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    // Advance the str2 byte offset past the word that is already loaded.
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFFERENCE);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFFERENCE);
    bind(TAIL);
    // Check the word loaded by the last iteration before the final one.
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFFERENCE);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFFERENCE);
    // rev + clz yields the bit position of the lowest-addressed differing
    // byte; the andr rounds it down to a character boundary.
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  // Long strings: hand over to the stub generated for this encoding pair.
  bind(STUB);
    RuntimeAddress stub = NULL;
    switch(ae) {
      case StrIntrinsicNode::LL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
        break;
      case StrIntrinsicNode::UU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
        break;
      case StrIntrinsicNode::LU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
        break;
      case StrIntrinsicNode::UL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
        break;
      default:
        ShouldNotReachHere();
     }
    assert(stub.target() != NULL, "compare_long_string stub has not been generated");
    trampoline_call(stub);
    b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // arrange code to do most branches while loading and loading next characters
  // while comparing previous
  // The loop is unrolled twice: the (tmp1, cnt1) pair and the
  // (tmp2, rscratch1) pair alternately hold the characters being compared.
  // cnt1 is free to be reused here because the length difference already
  // lives in result.
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}
#endif // COMPILER2

// This method checks if provided byte array contains byte with highest bit set.
//
// ary1:   address of the first byte; advanced by the inline loop.
// len:    number of bytes; clobbered.
// result: set to 1 if a byte with its top bit set was found, 0 otherwise
//         (on the inline paths; the stubs are expected to do the same).
// rscratch1 and rscratch2 are used as scratch registers.
void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
    // Simple and most common case of aligned small array which is not at the
    // end of memory page is placed here. All other cases are in stub.
    Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
    const uint64_t UPPER_BIT_MASK=0x8080808080808080;
    assert_different_registers(ary1, len, result);

    // len <= 0: for len == 0 the flags read "equal", so SET_RESULT
    // produces 0.
    cmpw(len, 0);
    br(LE, SET_RESULT);
    cmpw(len, 4 * wordSize);
    br(GE, STUB_LONG); // size >= 4 * wordSize then go to stub

    // Shift the in-page offset bits of ary1 to the top of the register;
    // adding 4 * wordSize (shifted the same way) then carries out exactly
    // when the next 4 * wordSize bytes would reach the end of the page.
    int shift = 64 - exact_log2(os::vm_page_size());
    lsl(rscratch1, ary1, shift);
    mov(rscratch2, (size_t)(4 * wordSize) << shift);
    adds(rscratch2, rscratch1, rscratch2);  // At end of page?
    br(CS, STUB); // at the end of page then go to stub
    subs(len, len, wordSize);
    br(LT, END);

  BIND(LOOP);
    ldr(rscratch1, Address(post(ary1, wordSize)));
    tst(rscratch1, UPPER_BIT_MASK);
    br(NE, SET_RESULT);
    subs(len, len, wordSize);
    br(GE, LOOP);
    // len == -wordSize means every byte was consumed by the loop; the
    // "equal" flags make SET_RESULT produce 0.
    cmpw(len, -wordSize);
    br(EQ, SET_RESULT);

  BIND(END);
    // Fewer than wordSize bytes remain (len is negative here).  Load a
    // full word - the page check above keeps the over-read inside the
    // page - and shift the bytes beyond the array out of the top.
    ldr(result, Address(ary1));
    sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
    lslv(result, result, len);
    tst(result, UPPER_BIT_MASK);
    b(SET_RESULT);

  BIND(STUB);
    RuntimeAddress has_neg =  RuntimeAddress(StubRoutines::aarch64::has_negatives());
    assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
    trampoline_call(has_neg);
    b(DONE);

  BIND(STUB_LONG);
    RuntimeAddress has_neg_long =  RuntimeAddress(
            StubRoutines::aarch64::has_negatives_long());
    assert(has_neg_long.target() != NULL, "has_negatives stub has not been generated");
    trampoline_call(has_neg_long);
    b(DONE);

  BIND(SET_RESULT);
    cset(result, NE); // set true or false

  BIND(DONE);
}

// Emit code that compares two arrays for equality.
//
// a1, a2:    array references (may be null); modified by the emitted code.
// elem_size: element size in bytes, 1 or 2.
// result:    set to true if both references are identical, or both arrays
//            have the same length and contents; false otherwise.
// cnt1, tmp3, tmp4, tmp5, rscratch1 and rscratch2 are used as scratch
// registers.
void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
                                   Register tmp4, Register tmp5, Register result,
                                   Register cnt1, int elem_size) {
  Label DONE, SAME;
  Register tmp1 = rscratch1;
  Register tmp2 = rscratch2;
  Register cnt2 = tmp2;  // cnt2 only used in array length compare
  int elem_per_word = wordSize/elem_size;
  int log_elem_size = exact_log2(elem_size);
  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset
    = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
  // NOTE(review): despite the name, this value is compared against the
  // element count in cnt1 below - confirm the intended unit.
  int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);

  assert(elem_size == 1 || elem_size == 2, "must be char or byte");
  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);

#ifndef PRODUCT
  {
    const char kind = (elem_size == 2) ? 'U' : 'L';
    char comment[64];
    snprintf(comment, sizeof comment, "array_equals%c{", kind);
    BLOCK_COMMENT(comment);
  }
#endif

  // if (a1 == a2)
  //     return true;
  cmpoop(a1, a2); // May have read barriers for a1 and a2.
  br(EQ, SAME);

  if (UseSimpleArrayEquals) {
    Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
    // if (a1 == null || a2 == null)
    //     return false;
    // a1 & a2 == 0 means (some-pointer is null) or
    // (very-rare-or-even-probably-impossible-pointer-values)
    // so, we can save one branch in most cases
    tst(a1, a2);
    mov(result, false);
    br(EQ, A_MIGHT_BE_NULL);
    // if (a1.length != a2.length)
    //      return false;
    bind(A_IS_NOT_NULL);
    ldrw(cnt1, Address(a1, length_offset));
    ldrw(cnt2, Address(a2, length_offset));
    eorw(tmp5, cnt1, cnt2);
    cbnzw(tmp5, DONE);
    lea(a1, Address(a1, base_offset));
    lea(a2, Address(a2, base_offset));
    // Check for short strings, i.e. smaller than wordSize.
    subs(cnt1, cnt1, elem_per_word);
    br(Assembler::LT, SHORT);
    // Main 8 byte comparison loop.
    bind(NEXT_WORD); {
      ldr(tmp1, Address(post(a1, wordSize)));
      ldr(tmp2, Address(post(a2, wordSize)));
      subs(cnt1, cnt1, elem_per_word);
      eor(tmp5, tmp1, tmp2);
      cbnz(tmp5, DONE);
    } br(GT, NEXT_WORD);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
    // length == 4.
    // Convert the (non-positive) element offset to a byte offset.
    if (log_elem_size > 0)
      lsl(cnt1, cnt1, log_elem_size);
    ldr(tmp3, Address(a1, cnt1));
    ldr(tmp4, Address(a2, cnt1));
    eor(tmp5, tmp3, tmp4);
    cbnz(tmp5, DONE);
    b(SAME);
    bind(A_MIGHT_BE_NULL);
    // in case both a1 and a2 are not-null, proceed with loads
    cbz(a1, DONE);
    cbz(a2, DONE);
    b(A_IS_NOT_NULL);
    bind(SHORT);

    // cnt1 holds length - elem_per_word here; the tested bits still give
    // the number of remaining bytes.
    tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
    {
      ldrw(tmp1, Address(post(a1, 4)));
      ldrw(tmp2, Address(post(a2, 4)));
      eorw(tmp5, tmp1, tmp2);
      cbnzw(tmp5, DONE);
    }
    bind(TAIL03);
    tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
    {
      ldrh(tmp3, Address(post(a1, 2)));
      ldrh(tmp4, Address(post(a2, 2)));
      eorw(tmp5, tmp3, tmp4);
      cbnzw(tmp5, DONE);
    }
    bind(TAIL01);
    if (elem_size == 1) { // Only needed when comparing byte arrays.
      tbz(cnt1, 0, SAME); // 0-1 bytes left.
      {
        ldrb(tmp1, a1);
        ldrb(tmp2, a2);
        eorw(tmp5, tmp1, tmp2);
        cbnzw(tmp5, DONE);
      }
    }
  } else {
    Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
        CSET_EQ, LAST_CHECK;
    mov(result, false);
    cbz(a1, DONE);
    ldrw(cnt1, Address(a1, length_offset));
    cbz(a2, DONE);
    ldrw(cnt2, Address(a2, length_offset));
    // on most CPUs a2 is still "locked"(surprisingly) in ldrw and it's
    // faster to perform another branch before comparing a1 and a2
    cmp(cnt1, (u1)elem_per_word);
    br(LE, SHORT); // short or same
    ldr(tmp3, Address(pre(a1, base_offset)));
    subs(zr, cnt1, stubBytesThreshold);
    br(GE, STUB);
    ldr(tmp4, Address(pre(a2, base_offset)));
    // tmp5 = -(length in bits); the register-specified shifts below
    // (lslv) use it to discard the bytes of the final word that lie
    // beyond the end of the array.
    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
    cmp(cnt2, cnt1);
    br(NE, DONE);

    // Main 16 byte comparison loop with 2 exits
    bind(NEXT_DWORD); {
      ldr(tmp1, Address(pre(a1, wordSize)));
      ldr(tmp2, Address(pre(a2, wordSize)));
      subs(cnt1, cnt1, 2 * elem_per_word);
      br(LE, TAIL);
      eor(tmp4, tmp3, tmp4);
      cbnz(tmp4, DONE);
      ldr(tmp3, Address(pre(a1, wordSize)));
      ldr(tmp4, Address(pre(a2, wordSize)));
      cmp(cnt1, (u1)elem_per_word);
      br(LE, TAIL2);
      cmp(tmp1, tmp2);
    } br(EQ, NEXT_DWORD);
    b(DONE);

    bind(TAIL);
    eor(tmp4, tmp3, tmp4);
    eor(tmp2, tmp1, tmp2);
    lslv(tmp2, tmp2, tmp5);
    orr(tmp5, tmp4, tmp2);
    cmp(tmp5, zr);
    b(CSET_EQ);

    bind(TAIL2);
    eor(tmp2, tmp1, tmp2);
    cbnz(tmp2, DONE);
    b(LAST_CHECK);

    bind(STUB);
    ldr(tmp4, Address(pre(a2, base_offset)));
    cmp(cnt2, cnt1);
    br(NE, DONE);
    if (elem_size == 2) { // convert to byte counter
      lsl(cnt1, cnt1, 1);
    }
    eor(tmp5, tmp3, tmp4);
    cbnz(tmp5, DONE);
    RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
    assert(stub.target() != NULL, "array_equals_long stub has not been generated");
    trampoline_call(stub);
    b(DONE);

    // NOTE(review): no branch in this function targets EARLY_OUT, so the
    // two instructions below look unreachable - confirm before relying on
    // them.
    bind(EARLY_OUT);
    // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
    // so, if a2 == null => return false(0), else return true, so we can return a2
    mov(result, a2);
    b(DONE);
    bind(SHORT);
    cmp(cnt2, cnt1);
    br(NE, DONE);
    cbz(cnt1, SAME);
    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
    ldr(tmp3, Address(a1, base_offset));
    ldr(tmp4, Address(a2, base_offset));
    bind(LAST_CHECK);
    eor(tmp4, tmp3, tmp4);
    lslv(tmp5, tmp4, tmp5);
    cmp(tmp5, zr);
    bind(CSET_EQ);
    cset(result, EQ);
    b(DONE);
  }

  bind(SAME);
  mov(result, true);
  // That's it.
  bind(DONE);

  BLOCK_COMMENT("} array_equals");
}

// Compare Strings

// For Strings we're passed the address of the first characters in a1
// and a2 and the length in cnt1.
// elem_size is the element size in bytes: either 1 or 2.
// There are two implementations.
For arrays >= 8 bytes, all 5348 // comparisons (including the final one, which may overlap) are 5349 // performed 8 bytes at a time. For strings < 8 bytes, we compare a 5350 // halfword, then a short, and then a byte. 5351 5352 void MacroAssembler::string_equals(Register a1, Register a2, 5353 Register result, Register cnt1, int elem_size) 5354 { 5355 Label SAME, DONE, SHORT, NEXT_WORD; 5356 Register tmp1 = rscratch1; 5357 Register tmp2 = rscratch2; 5358 Register cnt2 = tmp2; // cnt2 only used in array length compare 5359 5360 assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte"); 5361 assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2); 5362 5363 #ifndef PRODUCT 5364 { 5365 const char kind = (elem_size == 2) ? 'U' : 'L'; 5366 char comment[64]; 5367 snprintf(comment, sizeof comment, "{string_equals%c", kind); 5368 BLOCK_COMMENT(comment); 5369 } 5370 #endif 5371 5372 mov(result, false); 5373 5374 // Check for short strings, i.e. smaller than wordSize. 5375 subs(cnt1, cnt1, wordSize); 5376 br(Assembler::LT, SHORT); 5377 // Main 8 byte comparison loop. 5378 bind(NEXT_WORD); { 5379 ldr(tmp1, Address(post(a1, wordSize))); 5380 ldr(tmp2, Address(post(a2, wordSize))); 5381 subs(cnt1, cnt1, wordSize); 5382 eor(tmp1, tmp1, tmp2); 5383 cbnz(tmp1, DONE); 5384 } br(GT, NEXT_WORD); 5385 // Last longword. In the case where length == 4 we compare the 5386 // same longword twice, but that's still faster than another 5387 // conditional branch. 5388 // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when 5389 // length == 4. 5390 ldr(tmp1, Address(a1, cnt1)); 5391 ldr(tmp2, Address(a2, cnt1)); 5392 eor(tmp2, tmp1, tmp2); 5393 cbnz(tmp2, DONE); 5394 b(SAME); 5395 5396 bind(SHORT); 5397 Label TAIL03, TAIL01; 5398 5399 tbz(cnt1, 2, TAIL03); // 0-7 bytes left. 
5400 { 5401 ldrw(tmp1, Address(post(a1, 4))); 5402 ldrw(tmp2, Address(post(a2, 4))); 5403 eorw(tmp1, tmp1, tmp2); 5404 cbnzw(tmp1, DONE); 5405 } 5406 bind(TAIL03); 5407 tbz(cnt1, 1, TAIL01); // 0-3 bytes left. 5408 { 5409 ldrh(tmp1, Address(post(a1, 2))); 5410 ldrh(tmp2, Address(post(a2, 2))); 5411 eorw(tmp1, tmp1, tmp2); 5412 cbnzw(tmp1, DONE); 5413 } 5414 bind(TAIL01); 5415 if (elem_size == 1) { // Only needed when comparing 1-byte elements 5416 tbz(cnt1, 0, SAME); // 0-1 bytes left. 5417 { 5418 ldrb(tmp1, a1); 5419 ldrb(tmp2, a2); 5420 eorw(tmp1, tmp1, tmp2); 5421 cbnzw(tmp1, DONE); 5422 } 5423 } 5424 // Arrays are equal. 5425 bind(SAME); 5426 mov(result, true); 5427 5428 // That's it. 5429 bind(DONE); 5430 BLOCK_COMMENT("} string_equals"); 5431 } 5432 5433 5434 // The size of the blocks erased by the zero_blocks stub. We must 5435 // handle anything smaller than this ourselves in zero_words(). 5436 const int MacroAssembler::zero_words_block_size = 8; 5437 5438 // zero_words() is used by C2 ClearArray patterns. It is as small as 5439 // possible, handling small word counts locally and delegating 5440 // anything larger to the zero_blocks stub. It is expanded many times 5441 // in compiled code, so it is important to keep it short. 5442 5443 // ptr: Address of a buffer to be zeroed. 5444 // cnt: Count in HeapWords. 5445 // 5446 // ptr, cnt, rscratch1, and rscratch2 are clobbered. 
5447 void MacroAssembler::zero_words(Register ptr, Register cnt) 5448 { 5449 assert(is_power_of_2(zero_words_block_size), "adjust this"); 5450 assert(ptr == r10 && cnt == r11, "mismatch in register usage"); 5451 5452 BLOCK_COMMENT("zero_words {"); 5453 cmp(cnt, (u1)zero_words_block_size); 5454 Label around; 5455 br(LO, around); 5456 { 5457 RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks()); 5458 assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated"); 5459 if (StubRoutines::aarch64::complete()) { 5460 trampoline_call(zero_blocks); 5461 } else { 5462 bl(zero_blocks); 5463 } 5464 } 5465 bind(around); 5466 for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) { 5467 Label l; 5468 tbz(cnt, exact_log2(i), l); 5469 for (int j = 0; j < i; j += 2) { 5470 stp(zr, zr, post(ptr, 16)); 5471 } 5472 bind(l); 5473 } 5474 { 5475 Label l; 5476 tbz(cnt, 0, l); 5477 str(zr, Address(ptr)); 5478 bind(l); 5479 } 5480 BLOCK_COMMENT("} zero_words"); 5481 } 5482 5483 // base: Address of a buffer to be zeroed, 8 bytes aligned. 5484 // cnt: Immediate count in HeapWords. 
// Largest size (in bytes) for which zero_words(base, immediate) emits a
// fully unrolled store sequence instead of a loop.
#define SmallArraySize (18 * BytesPerLong)
void MacroAssembler::zero_words(Register base, u_int64_t cnt)
{
  BLOCK_COMMENT("zero_words {");
  int i = cnt & 1;  // store any odd word to start
  if (i) str(zr, Address(base));

  if (cnt <= SmallArraySize / BytesPerLong) {
    // Small count: straight-line pair stores, base is left untouched.
    for (; i < (int)cnt; i += 2)
      stp(zr, zr, Address(base, i * wordSize));
  } else {
    const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
    // Words that do not fill a whole unrolled iteration are zeroed up
    // front with straight-line stores.
    int remainder = cnt % (2 * unroll);
    for (; i < remainder; i += 2)
      stp(zr, zr, Address(base, i * wordSize));

    // rscratch1 and rscratch2 are clobbered on this path.
    Label loop;
    Register cnt_reg = rscratch1;
    Register loop_base = rscratch2;
    cnt = cnt - remainder;
    mov(cnt_reg, cnt);
    // adjust base and prebias by -2 * wordSize so we can pre-increment
    add(loop_base, base, (remainder - 2) * wordSize);
    bind(loop);
    sub(cnt_reg, cnt_reg, 2 * unroll);
    for (i = 1; i < unroll; i++)
      stp(zr, zr, Address(loop_base, 2 * i * wordSize));
    // The last store of the iteration also advances loop_base.
    stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
    cbnz(cnt_reg, loop);
  }
  BLOCK_COMMENT("} zero_words");
}

// Zero blocks of memory by using DC ZVA.
//
// Aligns the base address first sufficiently for DC ZVA, then uses
// DC ZVA repeatedly for every full block.  cnt is the size to be
// zeroed in HeapWords.  Returns the count of words left to be zeroed
// in cnt.
//
// NOTE: This is intended to be used in the zero_blocks() stub.  If
// you want to use it elsewhere, note that cnt must be >= 2*zva_length.
//
// rscratch1 and rscratch2 are clobbered.
void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
  Register tmp = rscratch1;
  Register tmp2 = rscratch2;
  int zva_length = VM_Version::zva_length();
  Label initial_table_end, loop_zva;
  Label fini;

  // Base must be 16 byte aligned. If not just return and let caller handle it
  tst(base, 0x0f);
  br(Assembler::NE, fini);
  // Align base with ZVA length.
  neg(tmp, base);
  andr(tmp, tmp, zva_length - 1);

  // tmp: the number of bytes to be filled to align the base with ZVA length.
  add(base, base, tmp);
  sub(cnt, cnt, tmp, Assembler::ASR, 3);
  // Computed branch into the table of stp instructions below.  Each stp
  // is one 4-byte instruction that zeroes 16 bytes, so the entry point
  // lies tmp / 4 bytes before initial_table_end.
  adr(tmp2, initial_table_end);
  sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
  br(tmp2);

  // Stores at negative offsets from the already aligned base; only the
  // last tmp / 16 of them are executed.
  for (int i = -zva_length + 16; i < 0; i += 16)
    stp(zr, zr, Address(base, i));
  bind(initial_table_end);

  // Pre-decrement by one block so that the GE test below ends the loop
  // once less than a full block is left.
  sub(cnt, cnt, zva_length >> 3);
  bind(loop_zva);
  dc(Assembler::ZVA, base);
  subs(cnt, cnt, zva_length >> 3);
  add(base, base, zva_length);
  br(Assembler::GE, loop_zva);
  add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
  bind(fini);
}

// base:   Address of a buffer to be filled, 8 bytes aligned.
// cnt:    Count in 8-byte unit.
// value:  Value to be filled with.
// base will point to the end of the buffer after filling.
// cnt, rscratch1 and rscratch2 are clobbered.
void MacroAssembler::fill_words(Register base, Register cnt, Register value)
{
//  Algorithm:
//
//    scratch1 = cnt & 7;
//    cnt -= scratch1;
//    p += scratch1;
//    switch (scratch1) {
//      do {
//        cnt -= 8;
//          p[-8] = v;
//        case 7:
//          p[-7] = v;
//        case 6:
//          p[-6] = v;
//          // ...
//        case 1:
//          p[-1] = v;
//        case 0:
//          p += 8;
//      } while (cnt);
//    }
//
//  The emitted code follows this scheme with paired stores: the unrolled
//  body consists of stp instructions that write two words each, and an odd
//  leading / trailing word is handled by a single str.

  assert_different_registers(base, cnt, value, rscratch1, rscratch2);

  Label fini, skip, entry, loop;
  const int unroll = 8; // Number of stp instructions we'll unroll

  cbz(cnt, fini);
  // If base is not 16-byte aligned, store one word first.
  tbz(base, 3, skip);
  str(value, Address(post(base, 8)));
  sub(cnt, cnt, 1);
  bind(skip);

  // rscratch1 = even number of words (0..14) to be written by a partial
  // first pass through the unrolled stores.
  andr(rscratch1, cnt, (unroll-1) * 2);
  sub(cnt, cnt, rscratch1);
  add(base, base, rscratch1, Assembler::LSL, 3);
  // Computed branch into the unrolled body: one 4-byte stp per two words,
  // i.e. two bytes of code per word, hence the LSL 1.
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
  br(rscratch2);

  bind(loop);
  add(base, base, unroll * 16);
  for (int i = -unroll; i < 0; i++)
    stp(value, value, Address(base, i * 16));
  bind(entry);
  subs(cnt, cnt, unroll * 2);
  br(Assembler::GE, loop);

  // Bit 0 of cnt is unaffected by the even-sized subtractions above: it
  // tells whether one last single word remains.
  tbz(cnt, 0, fini);
  str(value, Address(post(base, 8)));
  bind(fini);
}

// Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
// java/lang/StringUTF16.compress.
// Emit code that narrows 16-bit characters from src into bytes at dst,
// stopping at the first character whose high byte is non-zero.
//
// src, dst: source / destination addresses; advanced by the emitted code.
// len:      number of characters; counted down, 0 on exit if every
//           character was stored.
// result:   number of characters that were processed before stopping.
// rscratch1, rscratch2, Vtmp1..Vtmp4 and the fixed vector registers v4 and
// v5 are used as scratch registers.
// NOTE(review): v4/v5 are hard-coded rather than passed in - callers
// presumably have to treat them as clobbered; confirm.
void MacroAssembler::encode_iso_array(Register src, Register dst,
                      Register len, Register result,
                      FloatRegister Vtmp1, FloatRegister Vtmp2,
                      FloatRegister Vtmp3, FloatRegister Vtmp4)
{
    Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
        NEXT_32_START, NEXT_32_PRFM_START;
    Register tmp1 = rscratch1, tmp2 = rscratch2;

      mov(result, len); // Save initial len

#ifndef BUILTIN_SIM
      cmp(len, (u1)8); // handle shortest strings first
      br(LT, LOOP_1);
      cmp(len, (u1)32);
      br(LT, NEXT_8);
      // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
      // to convert chars to bytes
      if (SoftwarePrefetchHintDistance >= 0) {
        ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        // Use the prefetching variant of the 32-character loop only while
        // enough input remains beyond the prefetch distance.
        subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
        br(LE, NEXT_32_START);
        b(NEXT_32_PRFM_START);
        BIND(NEXT_32_PRFM);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_PRFM_START);
          prfm(Address(src, SoftwarePrefetchHintDistance));
          // OR the inputs together so that one uzp2 gathers every high
          // byte; uzp1 gathers the low bytes that will be stored.
          orr(v4, T16B, Vtmp1, Vtmp2);
          orr(v5, T16B, Vtmp3, Vtmp4);
          uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
          uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
          uzp2(v5, T16B, v4, v5); // high bytes
          umov(tmp2, v5, D, 1);
          fmovd(tmp1, v5);
          orr(tmp1, tmp1, tmp2);
          // Some high byte is set: redo this chunk in smaller steps to
          // locate the offending character.
          cbnz(tmp1, LOOP_8);
          stpq(Vtmp1, Vtmp3, dst);
          sub(len, len, 32);
          add(dst, dst, 32);
          add(src, src, 64);
          subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
          br(GE, NEXT_32_PRFM);
          cmp(len, (u1)32);
          br(LT, LOOP_8);
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_START);
      } else {
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
      }
      prfm(Address(src, SoftwarePrefetchHintDistance));
      uzp1(v4, T16B, Vtmp1, Vtmp2);
      uzp1(v5, T16B, Vtmp3, Vtmp4);
      orr(Vtmp1, T16B, Vtmp1, Vtmp2);
      orr(Vtmp3, T16B, Vtmp3, Vtmp4);
      uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
      umov(tmp2, Vtmp1, D, 1);
      fmovd(tmp1, Vtmp1);
      orr(tmp1, tmp1, tmp2);
      cbnz(tmp1, LOOP_8);
      stpq(v4, v5, dst);
      sub(len, len, 32);
      add(dst, dst, 32);
      add(src, src, 64);
      cmp(len, (u1)32);
      br(GE, NEXT_32);
      cbz(len, DONE);

    // 8 characters at a time.
    BIND(LOOP_8);
      cmp(len, (u1)8);
      br(LT, LOOP_1);
    BIND(NEXT_8);
      ld1(Vtmp1, T8H, src);
      uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
      uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
      fmovd(tmp1, Vtmp3);
      cbnz(tmp1, NEXT_1);
      strd(Vtmp2, dst);

      sub(len, len, 8);
      add(dst, dst, 8);
      add(src, src, 16);
      cmp(len, (u1)8);
      br(GE, NEXT_8);

    BIND(LOOP_1);
#endif
    // One character at a time; this is also where the exact stopping
    // position is determined after a failed SIMD chunk.
    cbz(len, DONE);
    BIND(NEXT_1);
      ldrh(tmp1, Address(post(src, 2)));
      tst(tmp1, 0xff00);
      br(NE, SET_RESULT);
      strb(tmp1, Address(post(dst, 1)));
      subs(len, len, 1);
      br(GT, NEXT_1);

    BIND(SET_RESULT);
      sub(result, result, len); // Return index where we stopped
                                // Return len == 0 if we processed all
                                // characters
    BIND(DONE);
}


// Inflate byte[] array to char[].
//
// src, dst: source / destination addresses; advanced by the emitted code.
// len:      number of bytes to inflate; clobbered.
// vtmp1 is zeroed and interleaved (zip1) with the source bytes to widen
// them; vtmp2, vtmp3 and tmp4 are scratch registers.
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
                                        FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                        Register tmp4) {
  Label big, done, after_init, to_stub;

  assert_different_registers(src, dst, len, tmp4, rscratch1);

  fmovd(vtmp1, zr);
  // tmp4 = number of whole 8-byte groups.
  lsrw(tmp4, len, 3);
  bind(after_init);
  cbnzw(tmp4, big);
  // Short string: less than 8 bytes.
  {
    Label loop, tiny;

    cmpw(len, 4);
    br(LT, tiny);
    // Use SIMD to do 4 bytes.
    ldrs(vtmp2, post(src, 4));
    zip1(vtmp3, T8B, vtmp2, vtmp1);
    subw(len, len, 4);
    strd(vtmp3, post(dst, 8));

    cbzw(len, done);

    // Do the remaining bytes one at a time.
    bind(loop);
    ldrb(tmp4, post(src, 1));
    strh(tmp4, post(dst, 2));
    subw(len, len, 1);

    bind(tiny);
    cbnz(len, loop);

    b(done);
  }

  if (SoftwarePrefetchHintDistance >= 0) {
    // Large inputs are handled by a stub; control then re-enters at
    // after_init.
    // NOTE(review): this relies on the stub leaving src, dst, len and
    // tmp4 in a state the code at after_init can continue from - confirm
    // against the stub.
    bind(to_stub);
      RuntimeAddress stub =  RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
      assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
      trampoline_call(stub);
      b(after_init);
  }

  // Unpack the bytes 8 at a time.
  bind(big);
  {
    Label loop, around, loop_last, loop_start;

    if (SoftwarePrefetchHintDistance >= 0) {
      const int large_loop_threshold = (64 + 16)/8;
      // The first 8 bytes are loaded before the size check, so they are
      // already in vtmp2 whichever way the branch goes.
      ldrd(vtmp2, post(src, 8));
      andw(len, len, 7);
      cmp(tmp4, (u1)large_loop_threshold);
      br(GE, to_stub);
      b(loop_start);

      // Two 8-byte groups per iteration, with the next load overlapping
      // the previous store.
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      bind(loop_start);
      subs(tmp4, tmp4, 1);
      br(EQ, loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      ldrd(vtmp3, post(src, 8));
      st1(vtmp2, T8H, post(dst, 16));
      subs(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp3, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      br(NE, loop);
      b(around);
      bind(loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      st1(vtmp2, T8H, post(dst, 16));
      bind(around);
      cbz(len, done);
    } else {
      andw(len, len, 7);
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      sub(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp2, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      cbnz(tmp4, loop);
    }
  }

  // Do the tail of up to 8 bytes.
  // The final 8 source bytes are re-read ending exactly at the end of the
  // input, so part of what is stored overlaps output written above.
  add(src, src, len);
  ldrd(vtmp3, Address(src, -8));
  add(dst, dst, len, ext::uxtw, 1);
  zip1(vtmp3, T16B, vtmp3, vtmp1);
  strq(vtmp3, Address(dst, -16));

  bind(done);
}

// Compress char[] array to byte[].
// Emits encode_iso_array() followed by an all-or-nothing reduction of its
// result: result is kept as produced by encode_iso_array() when len ended up
// as 0 (every char was narrowed), and is set to 0 otherwise.
// src, dst, len and the four SIMD temporaries are passed straight through.
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
                                         FloatRegister tmp1Reg, FloatRegister tmp2Reg,
                                         FloatRegister tmp3Reg, FloatRegister tmp4Reg,
                                         Register result) {
  encode_iso_array(src, dst, len, result,
                   tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
  // len != 0 here means encode_iso_array stopped early.
  cmp(len, zr);
  csel(result, result, zr, EQ);
}

// get_thread() can be called anywhere inside generated code so we
// need to save whatever non-callee save context might get clobbered
// by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
// the call setup code.
//
// aarch64_get_thread_helper() clobbers only r0, r1, and flags.
//
// The helper's return value is taken from c_rarg0 and left in dst.
void MacroAssembler::get_thread(Register dst) {
  // r0, r1 and lr are preserved around the call. dst is removed from the set
  // so that the pop at the end does not overwrite the value placed in it.
  RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
  push(saved_regs, sp);

  // lr doubles as the register holding the call target.
  mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
  // NOTE(review): the (1, 0, 1) arguments presumably describe the callee's
  // argument/return signature to blrt - confirm against blrt's declaration.
  blrt(lr, 1, 0, 1);
  if (dst != c_rarg0) {
    mov(dst, c_rarg0);
  }

  pop(saved_regs, sp);
}