/*
 * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "jvm.h"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/cardTableBarrierSet.hpp"
#include "interpreter/interpreter.hpp"
#include "compiler/disassembler.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/accessDecorators.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/jniHandles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/thread.hpp"
#ifdef COMPILER1
#include "c1/c1_LIRAssembler.hpp"
#endif
#ifdef COMPILER2
#include "oops/oop.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/node.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
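//
// Illustrative background (a sketch, not code emitted here): AArch64
// cannot hold a 64-bit address in a single instruction, so patchable
// address material is laid down as fixed-shape sequences, e.g. the
// PC-relative pair
//
//   adrp Rx, target_page          // bits [47:12], +/-4GB reach
//   add  Rx, Rx, #offset_in_page  // bits [11:0]
//
// or the wide-constant triple emitted by movptr() for a 48-bit address
//
//   movz Rx, #bits_15_0
//   movk Rx, #bits_31_16, lsl #16
//   movk Rx, #bits_47_32, lsl #32
//
// The patcher below recognizes each shape from its encoding and rewrites
// the immediate fields in place.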
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target-branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      u_int64_t dest = (u_int64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
115 // 116 unsigned insn2 = ((unsigned*)branch)[1]; 117 if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 && 118 Instruction_aarch64::extract(insn, 4, 0) == 119 Instruction_aarch64::extract(insn2, 9, 5)) { 120 // Load/store register (unsigned immediate) 121 unsigned size = Instruction_aarch64::extract(insn2, 31, 30); 122 Instruction_aarch64::patch(branch + sizeof (unsigned), 123 21, 10, offset_lo >> size); 124 guarantee(((dest >> size) << size) == dest, "misaligned target"); 125 instructions = 2; 126 } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 && 127 Instruction_aarch64::extract(insn, 4, 0) == 128 Instruction_aarch64::extract(insn2, 4, 0)) { 129 // add (immediate) 130 Instruction_aarch64::patch(branch + sizeof (unsigned), 131 21, 10, offset_lo); 132 instructions = 2; 133 } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 && 134 Instruction_aarch64::extract(insn, 4, 0) == 135 Instruction_aarch64::extract(insn2, 4, 0)) { 136 // movk #imm16<<32 137 Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32); 138 long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L); 139 long pc_page = (long)branch >> 12; 140 long adr_page = (long)dest >> 12; 141 offset = adr_page - pc_page; 142 instructions = 2; 143 } 144 } 145 int offset_lo = offset & 3; 146 offset >>= 2; 147 Instruction_aarch64::spatch(branch, 23, 5, offset); 148 Instruction_aarch64::patch(branch, 30, 29, offset_lo); 149 } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) { 150 u_int64_t dest = (u_int64_t)target; 151 // Move wide constant 152 assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch"); 153 assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch"); 154 Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff); 155 Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff); 156 Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff); 157 assert(target_addr_for_insn(branch) == target, "should be"); 158 instructions = 3; 159 } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 && 160 Instruction_aarch64::extract(insn, 4, 0) == 0b11111) { 161 // nothing to do 162 assert(target == 0, "did not expect to relocate target for polling page load"); 163 } else { 164 ShouldNotReachHere(); 165 } 166 return instructions * NativeInstruction::instruction_size; 167 } 168 169 int MacroAssembler::patch_oop(address insn_addr, address o) { 170 int instructions; 171 unsigned insn = *(unsigned*)insn_addr; 172 assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch"); 173 174 // OOPs are either narrow (32 bits) or wide (48 bits). We encode 175 // narrow OOPs by setting the upper 16 bits in the first 176 // instruction. 
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = CompressedOops::encode((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
248 // 249 unsigned insn2 = ((unsigned*)insn_addr)[1]; 250 if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 && 251 Instruction_aarch64::extract(insn, 4, 0) == 252 Instruction_aarch64::extract(insn2, 9, 5)) { 253 // Load/store register (unsigned immediate) 254 unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10); 255 unsigned int size = Instruction_aarch64::extract(insn2, 31, 30); 256 return address(target_page + (byte_offset << size)); 257 } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 && 258 Instruction_aarch64::extract(insn, 4, 0) == 259 Instruction_aarch64::extract(insn2, 4, 0)) { 260 // add (immediate) 261 unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10); 262 return address(target_page + byte_offset); 263 } else { 264 if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 && 265 Instruction_aarch64::extract(insn, 4, 0) == 266 Instruction_aarch64::extract(insn2, 4, 0)) { 267 target_page = (target_page & 0xffffffff) | 268 ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32); 269 } 270 return (address)target_page; 271 } 272 } else { 273 ShouldNotReachHere(); 274 } 275 } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) { 276 u_int32_t *insns = (u_int32_t *)insn_addr; 277 // Move wide constant: movz, movk, movk. See movptr(). 278 assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch"); 279 assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch"); 280 return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5)) 281 + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16) 282 + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32)); 283 } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 && 284 Instruction_aarch64::extract(insn, 4, 0) == 0b11111) { 285 return 0; 286 } else { 287 ShouldNotReachHere(); 288 } 289 return address(((uint64_t)insn_addr + (offset << 2))); 290 } 291 292 void MacroAssembler::safepoint_poll(Label& slow_path) { 293 if (SafepointMechanism::uses_thread_local_poll()) { 294 ldr(rscratch1, Address(rthread, Thread::polling_page_offset())); 295 tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path); 296 } else { 297 unsigned long offset; 298 adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset); 299 ldrw(rscratch1, Address(rscratch1, offset)); 300 assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code"); 301 cbnz(rscratch1, slow_path); 302 } 303 } 304 305 // Just like safepoint_poll, but use an acquiring load for thread- 306 // local polling. 307 // 308 // We need an acquire here to ensure that any subsequent load of the 309 // global SafepointSynchronize::_state flag is ordered after this load 310 // of the local Thread::_polling page. We don't want this poll to 311 // return false (i.e. not safepointing) and a later poll of the global 312 // SafepointSynchronize::_state spuriously to return true. 313 // 314 // This is to avoid a race when we're in a native->Java transition 315 // racing the code which wakes up from a safepoint. 
316 // 317 void MacroAssembler::safepoint_poll_acquire(Label& slow_path) { 318 if (SafepointMechanism::uses_thread_local_poll()) { 319 lea(rscratch1, Address(rthread, Thread::polling_page_offset())); 320 ldar(rscratch1, rscratch1); 321 tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path); 322 } else { 323 safepoint_poll(slow_path); 324 } 325 } 326 327 void MacroAssembler::reset_last_Java_frame(bool clear_fp) { 328 // we must set sp to zero to clear frame 329 str(zr, Address(rthread, JavaThread::last_Java_sp_offset())); 330 331 // must clear fp, so that compiled frames are not confused; it is 332 // possible that we need it only for debugging 333 if (clear_fp) { 334 str(zr, Address(rthread, JavaThread::last_Java_fp_offset())); 335 } 336 337 // Always clear the pc because it could have been set by make_walkable() 338 str(zr, Address(rthread, JavaThread::last_Java_pc_offset())); 339 } 340 341 // Calls to C land 342 // 343 // When entering C land, the rfp, & resp of the last Java frame have to be recorded 344 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp 345 // has to be reset to 0. This is required to allow proper stack traversal. 346 void MacroAssembler::set_last_Java_frame(Register last_java_sp, 347 Register last_java_fp, 348 Register last_java_pc, 349 Register scratch) { 350 351 if (last_java_pc->is_valid()) { 352 str(last_java_pc, Address(rthread, 353 JavaThread::frame_anchor_offset() 354 + JavaFrameAnchor::last_Java_pc_offset())); 355 } 356 357 // determine last_java_sp register 358 if (last_java_sp == sp) { 359 mov(scratch, sp); 360 last_java_sp = scratch; 361 } else if (!last_java_sp->is_valid()) { 362 last_java_sp = esp; 363 } 364 365 str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset())); 366 367 // last_java_fp is optional 368 if (last_java_fp->is_valid()) { 369 str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset())); 370 } 371 } 372 373 void MacroAssembler::set_last_Java_frame(Register last_java_sp, 374 Register last_java_fp, 375 address last_java_pc, 376 Register scratch) { 377 assert(last_java_pc != NULL, "must provide a valid PC"); 378 379 adr(scratch, last_java_pc); 380 str(scratch, Address(rthread, 381 JavaThread::frame_anchor_offset() 382 + JavaFrameAnchor::last_Java_pc_offset())); 383 384 set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch); 385 } 386 387 void MacroAssembler::set_last_Java_frame(Register last_java_sp, 388 Register last_java_fp, 389 Label &L, 390 Register scratch) { 391 if (L.is_bound()) { 392 set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch); 393 } else { 394 InstructionMark im(this); 395 L.add_patch_at(code(), locator()); 396 set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, scratch); 397 } 398 } 399 400 void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) { 401 assert(ReservedCodeCacheSize < 4*G, "branch out of range"); 402 assert(CodeCache::find_blob(entry.target()) != NULL, 403 "destination of far call not found in code cache"); 404 if (far_branches()) { 405 unsigned long offset; 406 // We can use ADRP here because we know that the total size of 407 // the code cache cannot exceed 2Gb. 
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    blr(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}

void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    br(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}

void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;

  ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
  cmp(sp, rscratch1);
  br(Assembler::LO, no_reserved_zone_enabling);

  enter();   // LR and FP are live.
  lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
  mov(c_rarg0, rthread);
  blr(rscratch1);
  leave();

  // We have already removed our own frame.
  // throw_delayed_StackOverflowError will think that it's been
  // called by our caller.
  lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
  br(rscratch1);
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(tmp_reg, (u1)markOopDesc::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
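  //
  // Mark word layout assumed here when biased (a sketch; the
  // authoritative definition lives in markOopDesc):
  //   [ JavaThread* owner | epoch | age | 1 (biased_lock) | 01 (lock) ]
  // XOR-ing the mark with (prototype header | thread) therefore leaves
  // only the age bits set when both the owner and the epoch match.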
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go into the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1, rscratch2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
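  //
  // In effect (a sketch of the check below):
  //   if ((mark & biased_lock_mask) == biased_lock_pattern) goto done;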
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(temp_reg, (u1)markOopDesc::biased_lock_pattern);
  br(Assembler::EQ, done);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg ) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg ) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg ) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg ) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result,   "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline.  If the code cache is small
// trampolines won't be emitted.
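//
// Background (a sketch): a direct BL reaches only +/-128MB, so when the
// code cache may span more than that, each call site gets a nearby
// trampoline in the stub section:
//
//   call site:    bl   <trampoline>
//   trampoline:   ldr  rscratch1, 0f   // 64-bit target kept as in-line data
//                 br   rscratch1
//            0f:  .quad <destination>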

address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  // We need a trampoline if branches are far.
  if (far_branches()) {
    bool in_scratch_emit_size = false;
#ifdef COMPILER2
    // We don't want to emit a trampoline if C2 is generating dummy
    // code during its branch shortening phase.
    CompileTask* task = ciEnv::current()->task();
    in_scratch_emit_size =
      (task != NULL && is_c2_compile(task->comp_level()) &&
       Compile::current()->in_scratch_emit_size());
#endif
    if (!in_scratch_emit_size) {
      address stub = emit_trampoline_stub(offset(), entry.target());
      if (stub == NULL) {
        return NULL; // CodeCache is full
      }
    }
  }

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
  if (!far_branches()) {
    bl(entry.target());
  } else {
    bl(pc());
  }
  // just need to return a non-null address
  return pc();
}


// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)

address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  // Max stub size: alignment nop, TrampolineStub.
  address stub = start_a_stub(NativeInstruction::instruction_size
                              + NativeCallTrampolineStub::instruction_size);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call
  // - call
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}

void MacroAssembler::emit_static_call_stub() {
  // CompiledDirectStaticCall::set_to_interpreted knows the
  // exact layout of this stub.

  isb();
  mov_metadata(rmethod, (Metadata*)NULL);

  // Jump to the entry point of the i2c stub.
  movptr(rscratch1, 0);
  br(rscratch1);
}

void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
  tst(x, 0xff);
  cset(x, Assembler::NE);
}

address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
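  // Mirrors get_vm_result: fetch and clear the thread-local slot; no
  // oop verification is needed for a Metadata* result.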
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}


void MacroAssembler:: notify(int type) {
  if (type == bytecode_start) {
    // set_last_Java_frame(esp, rfp, (address)NULL);
    Assembler:: notify(type);
    // reset_last_Java_frame(true);
  } else {
    Assembler:: notify(type);
  }
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step = itableOffsetEntry::size() * wordSize;
  int vte_size = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
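    // itableMethodEntry is assumed to be a single word (one Method*),
    // which is what the assert below pins down and what the lsl #3
    // scaling relies on (a sketch of the layout; see itableMethodEntry).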
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  if (return_method) {
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result,
        form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
  }
}

void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);
  cmp(super_klass, rscratch1); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    subs(zr, super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// scans count pointer sized words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

// scans count 4 byte words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
                                 Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldrw(scratch, post(addr, wordSize));
  cmpw(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  BLOCK_COMMENT("check_klass_subtype_slow_path");

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
  assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)

  RegSet pushed_registers;
  if (!IS_A_TEMP(r2))    pushed_registers += r2;
  if (!IS_A_TEMP(r5))    pushed_registers += r5;

  if (super_klass != r0 || UseCompressedOops) {
    if (!IS_A_TEMP(r0))   pushed_registers += r0;
  }

  push(pushed_registers, sp);

  // Get super_klass value into r0 (even if it was in r5 or r2).
  if (super_klass != r0) {
    mov(r0, super_klass);
  }

#ifndef PRODUCT
  mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
  Address pst_counter_addr(rscratch2);
  ldr(rscratch1, pst_counter_addr);
  add(rscratch1, rscratch1, 1);
  str(rscratch1, pst_counter_addr);
#endif //PRODUCT

  // We will consult the secondary-super array.
  ldr(r5, secondary_supers_addr);
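  // Array<Klass*> layout assumed by the scan below (a sketch): a 32-bit
  // length field followed by the Klass* elements, so the live range is
  // [r5 + base_offset, r5 + base_offset + wordSize * length).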
  // Load the array length.
  ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  add(r5, r5, Array<Klass*>::base_offset_in_bytes());

  cmp(sp, zr); // Clear Z flag; SP is never zero
  // Scan R2 words at [R5] for an occurrence of R0.
  // Set NZ/Z based on last compare.
  repne_scan(r5, r0, r2, rscratch1);

  // Unspill the temp. registers:
  pop(pushed_registers, sp);

  br(Assembler::NE, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  str(super_klass, super_cache_addr);

  if (L_success != &L_fallthrough) {
    b(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}


void MacroAssembler::verify_oop(Register reg, const char* s) {
  if (!VerifyOops) return;

  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s", reg->name(), s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  mov(r0, reg);
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop");
}

void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops) return;

  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s", s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop_addr {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  // addr may contain sp so we will have to adjust it based on the
  // pushes that we just did.
  if (addr.uses(sp)) {
    lea(r0, addr);
    ldr(r0, Address(r0, 4 * wordSize));
  } else {
    ldr(r0, addr);
  }
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop_addr");
}

Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
                                         int extra_slot_offset) {
  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
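  // Sketch of what is computed below: interpreter expression-stack slots
  // are one machine word each, so argument slot N (plus any extra slots)
  // lives at esp + (N + extra) * stackElementSize.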
  int stackElementSize = Interpreter::stackElementSize;
  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
#ifdef ASSERT
  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
  assert(offset1 - offset == stackElementSize, "correct arithmetic");
#endif
  if (arg_slot.is_constant()) {
    return Address(esp, arg_slot.as_constant() * stackElementSize
                   + offset);
  } else {
    add(rscratch1, esp, arg_slot.as_register(),
        ext::uxtx, exact_log2(stackElementSize));
    return Address(rscratch1, offset);
  }
}

void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments,
                                       Label *retaddr) {
  call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr);
}

void MacroAssembler::call_VM_leaf_base1(address entry_point,
                                        int number_of_gp_arguments,
                                        int number_of_fp_arguments,
                                        ret_type type,
                                        Label *retaddr) {
  Label E, L;

  stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));

  // We add 1 to number_of_arguments because the thread in arg0 is
  // not counted
  mov(rscratch1, entry_point);
  blrt(rscratch1, number_of_gp_arguments + 1, number_of_fp_arguments, type);
  if (retaddr)
    bind(*retaddr);

  ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
  maybe_isb();
}

void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  call_VM_leaf_base(entry_point, number_of_arguments);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
                                  Register arg_1, Register arg_2) {
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  pass_arg2(this, arg_2);
  call_VM_leaf_base(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
  assert(arg_0 != c_rarg3, "smashed arg");
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 4);
}

void MacroAssembler::null_check(Register reg, int offset) {
  if (needs_explicit_null_check(offset)) {
    // provoke OS NULL exception if reg = NULL by
    // accessing M[reg] w/o changing any registers
    // NOTE: this is plenty to provoke a segv
    ldr(zr, Address(reg));
  } else {
    // nothing to do, (later) access of M[reg + offset]
    // will provoke OS NULL exception if reg = NULL
  }
}

// MacroAssembler protected routines needed to implement
// public methods

void MacroAssembler::mov(Register r, Address dest) {
  code_section()->relocate(pc(), dest.rspec());
  u_int64_t imm64 = (u_int64_t)dest.target();
  movptr(r, imm64);
}

// Move a constant pointer into r.  In AArch64 mode the virtual
// address space is 48 bits in size, so we only need three
// instructions to create a patchable instruction sequence that can
// reach anywhere.
void MacroAssembler::movptr(Register r, uintptr_t imm64) {
#ifndef PRODUCT
  {
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
    block_comment(buffer);
  }
#endif
  assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
  movz(r, imm64 & 0xffff);
  imm64 >>= 16;
  movk(r, imm64 & 0xffff, 16);
  imm64 >>= 16;
  movk(r, imm64 & 0xffff, 32);
}

// Macro to mov replicated immediate to vector register.
//  Vd will get the following values for different arrangements in T
//   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
//   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
//   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
//   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
//   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
//   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
//   T1D/T2D: invalid
void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
  assert(T != T1D && T != T2D, "invalid arrangement");
  if (T == T8B || T == T16B) {
    assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
    movi(Vd, T, imm32 & 0xff, 0);
    return;
  }
  u_int32_t nimm32 = ~imm32;
  if (T == T4H || T == T8H) {
    assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
    imm32 &= 0xffff;
    nimm32 &= 0xffff;
  }
  u_int32_t x = imm32;
  int movi_cnt = 0;
  int movn_cnt = 0;
  while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
  x = nimm32;
  while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
  if (movn_cnt < movi_cnt) imm32 = nimm32;
  unsigned lsl = 0;
  while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
  if (movn_cnt < movi_cnt)
    mvni(Vd, T, imm32 & 0xff, lsl);
  else
    movi(Vd, T, imm32 & 0xff, lsl);
  imm32 >>= 8; lsl += 8;
  while (imm32) {
    while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
    if (movn_cnt < movi_cnt)
      bici(Vd, T, imm32 & 0xff, lsl);
    else
      orri(Vd, T, imm32 & 0xff, lsl);
    lsl += 8; imm32 >>= 8;
  }
}

void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
{
#ifndef PRODUCT
  {
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
    block_comment(buffer);
  }
#endif
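  // Worked example (illustrative): imm64 == 0x0000dead00000000 has three
  // zero halfwords, so the zero_count == 3 path below emits the single
  // instruction  movz dst, #0xdead, lsl #32.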
(operand_valid_for_logical_immediate(false, imm64)) { 1579 orr(dst, zr, imm64); 1580 } else { 1581 // we can use a combination of MOVZ or MOVN with 1582 // MOVK to build up the constant 1583 u_int64_t imm_h[4]; 1584 int zero_count = 0; 1585 int neg_count = 0; 1586 int i; 1587 for (i = 0; i < 4; i++) { 1588 imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL); 1589 if (imm_h[i] == 0) { 1590 zero_count++; 1591 } else if (imm_h[i] == 0xffffL) { 1592 neg_count++; 1593 } 1594 } 1595 if (zero_count == 4) { 1596 // one MOVZ will do 1597 movz(dst, 0); 1598 } else if (neg_count == 4) { 1599 // one MOVN will do 1600 movn(dst, 0); 1601 } else if (zero_count == 3) { 1602 for (i = 0; i < 4; i++) { 1603 if (imm_h[i] != 0L) { 1604 movz(dst, (u_int32_t)imm_h[i], (i << 4)); 1605 break; 1606 } 1607 } 1608 } else if (neg_count == 3) { 1609 // one MOVN will do 1610 for (int i = 0; i < 4; i++) { 1611 if (imm_h[i] != 0xffffL) { 1612 movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4)); 1613 break; 1614 } 1615 } 1616 } else if (zero_count == 2) { 1617 // one MOVZ and one MOVK will do 1618 for (i = 0; i < 3; i++) { 1619 if (imm_h[i] != 0L) { 1620 movz(dst, (u_int32_t)imm_h[i], (i << 4)); 1621 i++; 1622 break; 1623 } 1624 } 1625 for (;i < 4; i++) { 1626 if (imm_h[i] != 0L) { 1627 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1628 } 1629 } 1630 } else if (neg_count == 2) { 1631 // one MOVN and one MOVK will do 1632 for (i = 0; i < 4; i++) { 1633 if (imm_h[i] != 0xffffL) { 1634 movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4)); 1635 i++; 1636 break; 1637 } 1638 } 1639 for (;i < 4; i++) { 1640 if (imm_h[i] != 0xffffL) { 1641 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1642 } 1643 } 1644 } else if (zero_count == 1) { 1645 // one MOVZ and two MOVKs will do 1646 for (i = 0; i < 4; i++) { 1647 if (imm_h[i] != 0L) { 1648 movz(dst, (u_int32_t)imm_h[i], (i << 4)); 1649 i++; 1650 break; 1651 } 1652 } 1653 for (;i < 4; i++) { 1654 if (imm_h[i] != 0x0L) { 1655 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1656 } 1657 } 1658 } else if (neg_count == 1) { 1659 // one MOVN and two MOVKs will do 1660 for (i = 0; i < 4; i++) { 1661 if (imm_h[i] != 0xffffL) { 1662 movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4)); 1663 i++; 1664 break; 1665 } 1666 } 1667 for (;i < 4; i++) { 1668 if (imm_h[i] != 0xffffL) { 1669 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1670 } 1671 } 1672 } else { 1673 // use a MOVZ and 3 MOVKs (makes it easier to debug) 1674 movz(dst, (u_int32_t)imm_h[0], 0); 1675 for (i = 1; i < 4; i++) { 1676 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1677 } 1678 } 1679 } 1680 } 1681 1682 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32) 1683 { 1684 #ifndef PRODUCT 1685 { 1686 char buffer[64]; 1687 snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32); 1688 block_comment(buffer); 1689 } 1690 #endif 1691 if (operand_valid_for_logical_immediate(true, imm32)) { 1692 orrw(dst, zr, imm32); 1693 } else { 1694 // we can use MOVZ, MOVN or two calls to MOVK to build up the 1695 // constant 1696 u_int32_t imm_h[2]; 1697 imm_h[0] = imm32 & 0xffff; 1698 imm_h[1] = ((imm32 >> 16) & 0xffff); 1699 if (imm_h[0] == 0) { 1700 movzw(dst, imm_h[1], 16); 1701 } else if (imm_h[0] == 0xffff) { 1702 movnw(dst, imm_h[1] ^ 0xffff, 16); 1703 } else if (imm_h[1] == 0) { 1704 movzw(dst, imm_h[0], 0); 1705 } else if (imm_h[1] == 0xffff) { 1706 movnw(dst, imm_h[0] ^ 0xffff, 0); 1707 } else { 1708 // use a MOVZ and MOVK (makes it easier to debug) 1709 movzw(dst, imm_h[0], 0); 1710 movkw(dst, imm_h[1], 16); 1711 } 1712 } 1713 } 1714 1715 // Form an address from base + 
offset in Rd. Rd may or may 1716 // not actually be used: you must use the Address that is returned. 1717 // It is up to you to ensure that the shift provided matches the size 1718 // of your data. 1719 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) { 1720 if (Address::offset_ok_for_immed(byte_offset, shift)) 1721 // It fits; no need for any heroics 1722 return Address(base, byte_offset); 1723 1724 // Don't do anything clever with negative or misaligned offsets 1725 unsigned mask = (1 << shift) - 1; 1726 if (byte_offset < 0 || byte_offset & mask) { 1727 mov(Rd, byte_offset); 1728 add(Rd, base, Rd); 1729 return Address(Rd); 1730 } 1731 1732 // See if we can do this with two 12-bit offsets 1733 { 1734 unsigned long word_offset = byte_offset >> shift; 1735 unsigned long masked_offset = word_offset & 0xfff000; 1736 if (Address::offset_ok_for_immed(word_offset - masked_offset) 1737 && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) { 1738 add(Rd, base, masked_offset << shift); 1739 word_offset -= masked_offset; 1740 return Address(Rd, word_offset << shift); 1741 } 1742 } 1743 1744 // Do it the hard way 1745 mov(Rd, byte_offset); 1746 add(Rd, base, Rd); 1747 return Address(Rd); 1748 } 1749 1750 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) { 1751 if (UseLSE) { 1752 mov(tmp, 1); 1753 ldadd(Assembler::word, tmp, zr, counter_addr); 1754 return; 1755 } 1756 Label retry_load; 1757 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) 1758 prfm(Address(counter_addr), PSTL1STRM); 1759 bind(retry_load); 1760 // flush and load exclusive from the memory location 1761 ldxrw(tmp, counter_addr); 1762 addw(tmp, tmp, 1); 1763 // if we store+flush with no intervening write tmp2 will be zero 1764 stxrw(tmp2, tmp, counter_addr); 1765 cbnzw(tmp2, retry_load); 1766 } 1767 1768 1769 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb, 1770 bool want_remainder, Register scratch) 1771 { 1772 // Full implementation of Java idiv and irem. The function 1773 // returns the (pc) offset of the div instruction - may be needed 1774 // for implicit exceptions. 1775 // 1776 // constraint : ra/rb =/= scratch 1777 // normal case 1778 // 1779 // input : ra: dividend 1780 // rb: divisor 1781 // 1782 // result: either 1783 // quotient (= ra idiv rb) 1784 // remainder (= ra irem rb) 1785 1786 assert(ra != scratch && rb != scratch, "reg cannot be scratch"); 1787 1788 int idivl_offset = offset(); 1789 if (! want_remainder) { 1790 sdivw(result, ra, rb); 1791 } else { 1792 sdivw(scratch, ra, rb); 1793 Assembler::msubw(result, scratch, rb, ra); 1794 } 1795 1796 return idivl_offset; 1797 } 1798 1799 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb, 1800 bool want_remainder, Register scratch) 1801 { 1802 // Full implementation of Java ldiv and lrem. The function 1803 // returns the (pc) offset of the div instruction - may be needed 1804 // for implicit exceptions. 1805 // 1806 // constraint : ra/rb =/= scratch 1807 // normal case 1808 // 1809 // input : ra: dividend 1810 // rb: divisor 1811 // 1812 // result: either 1813 // quotient (= ra idiv rb) 1814 // remainder (= ra irem rb) 1815 1816 assert(ra != scratch && rb != scratch, "reg cannot be scratch"); 1817 1818 int idivq_offset = offset(); 1819 if (!
want_remainder) { 1820 sdiv(result, ra, rb); 1821 } else { 1822 sdiv(scratch, ra, rb); 1823 Assembler::msub(result, scratch, rb, ra); 1824 } 1825 1826 return idivq_offset; 1827 } 1828 1829 void MacroAssembler::membar(Membar_mask_bits order_constraint) { 1830 address prev = pc() - NativeMembar::instruction_size; 1831 address last = code()->last_insn(); 1832 if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) { 1833 NativeMembar *bar = NativeMembar_at(prev); 1834 // We are merging two memory barrier instructions. On AArch64 we 1835 // can do this simply by ORing them together. 1836 bar->set_kind(bar->get_kind() | order_constraint); 1837 BLOCK_COMMENT("merged membar"); 1838 } else { 1839 code()->set_last_insn(pc()); 1840 dmb(Assembler::barrier(order_constraint)); 1841 } 1842 } 1843 1844 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) { 1845 if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) { 1846 merge_ldst(rt, adr, size_in_bytes, is_store); 1847 code()->clear_last_insn(); 1848 return true; 1849 } else { 1850 assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported."); 1851 const unsigned mask = size_in_bytes - 1; 1852 if (adr.getMode() == Address::base_plus_offset && 1853 (adr.offset() & mask) == 0) { // only supports base_plus_offset. 1854 code()->set_last_insn(pc()); 1855 } 1856 return false; 1857 } 1858 } 1859 1860 void MacroAssembler::ldr(Register Rx, const Address &adr) { 1861 // We always try to merge two adjacent loads into one ldp. 1862 if (!try_merge_ldst(Rx, adr, 8, false)) { 1863 Assembler::ldr(Rx, adr); 1864 } 1865 } 1866 1867 void MacroAssembler::ldrw(Register Rw, const Address &adr) { 1868 // We always try to merge two adjacent loads into one ldp. 1869 if (!try_merge_ldst(Rw, adr, 4, false)) { 1870 Assembler::ldrw(Rw, adr); 1871 } 1872 } 1873 1874 void MacroAssembler::str(Register Rx, const Address &adr) { 1875 // We always try to merge two adjacent stores into one stp. 1876 if (!try_merge_ldst(Rx, adr, 8, true)) { 1877 Assembler::str(Rx, adr); 1878 } 1879 } 1880 1881 void MacroAssembler::strw(Register Rw, const Address &adr) { 1882 // We always try to merge two adjacent stores into one stp. 1883 if (!try_merge_ldst(Rw, adr, 4, true)) { 1884 Assembler::strw(Rw, adr); 1885 } 1886 } 1887 1888 // MacroAssembler routines found actually to be needed 1889 1890 void MacroAssembler::push(Register src) 1891 { 1892 str(src, Address(pre(esp, -1 * wordSize))); 1893 } 1894 1895 void MacroAssembler::pop(Register dst) 1896 { 1897 ldr(dst, Address(post(esp, 1 * wordSize))); 1898 } 1899 1900 // Note: load_unsigned_short used to be called load_unsigned_word. 
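// The loads below each return the code-buffer offset of the load
// instruction itself (offset() is sampled just before the load is
// emitted), so callers can record the PC of a potentially faulting
// access. A sketch of typical use (illustrative only; the register
// and displacement here are made up):
//
//   int faulting_off = load_unsigned_short(r2, Address(r3, 12));
//   // faulting_off now identifies this load within the code buffer.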
1901 int MacroAssembler::load_unsigned_short(Register dst, Address src) { 1902 int off = offset(); 1903 ldrh(dst, src); 1904 return off; 1905 } 1906 1907 int MacroAssembler::load_unsigned_byte(Register dst, Address src) { 1908 int off = offset(); 1909 ldrb(dst, src); 1910 return off; 1911 } 1912 1913 int MacroAssembler::load_signed_short(Register dst, Address src) { 1914 int off = offset(); 1915 ldrsh(dst, src); 1916 return off; 1917 } 1918 1919 int MacroAssembler::load_signed_byte(Register dst, Address src) { 1920 int off = offset(); 1921 ldrsb(dst, src); 1922 return off; 1923 } 1924 1925 int MacroAssembler::load_signed_short32(Register dst, Address src) { 1926 int off = offset(); 1927 ldrshw(dst, src); 1928 return off; 1929 } 1930 1931 int MacroAssembler::load_signed_byte32(Register dst, Address src) { 1932 int off = offset(); 1933 ldrsbw(dst, src); 1934 return off; 1935 } 1936 1937 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) { 1938 switch (size_in_bytes) { 1939 case 8: ldr(dst, src); break; 1940 case 4: ldrw(dst, src); break; 1941 case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break; 1942 case 1: is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break; 1943 default: ShouldNotReachHere(); 1944 } 1945 } 1946 1947 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) { 1948 switch (size_in_bytes) { 1949 case 8: str(src, dst); break; 1950 case 4: strw(src, dst); break; 1951 case 2: strh(src, dst); break; 1952 case 1: strb(src, dst); break; 1953 default: ShouldNotReachHere(); 1954 } 1955 } 1956 1957 void MacroAssembler::decrementw(Register reg, int value) 1958 { 1959 if (value < 0) { incrementw(reg, -value); return; } 1960 if (value == 0) { return; } 1961 if (value < (1 << 12)) { subw(reg, reg, value); return; } 1962 /* else */ { 1963 guarantee(reg != rscratch2, "invalid dst for register decrement"); 1964 movw(rscratch2, (unsigned)value); 1965 subw(reg, reg, rscratch2); 1966 } 1967 } 1968 1969 void MacroAssembler::decrement(Register reg, int value) 1970 { 1971 if (value < 0) { increment(reg, -value); return; } 1972 if (value == 0) { return; } 1973 if (value < (1 << 12)) { sub(reg, reg, value); return; } 1974 /* else */ { 1975 assert(reg != rscratch2, "invalid dst for register decrement"); 1976 mov(rscratch2, (unsigned long)value); 1977 sub(reg, reg, rscratch2); 1978 } 1979 } 1980 1981 void MacroAssembler::decrementw(Address dst, int value) 1982 { 1983 assert(!dst.uses(rscratch1), "invalid dst for address decrement"); 1984 if (dst.getMode() == Address::literal) { 1985 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 1986 lea(rscratch2, dst); 1987 dst = Address(rscratch2); 1988 } 1989 ldrw(rscratch1, dst); 1990 decrementw(rscratch1, value); 1991 strw(rscratch1, dst); 1992 } 1993 1994 void MacroAssembler::decrement(Address dst, int value) 1995 { 1996 assert(!dst.uses(rscratch1), "invalid address for decrement"); 1997 if (dst.getMode() == Address::literal) { 1998 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 1999 lea(rscratch2, dst); 2000 dst = Address(rscratch2); 2001 } 2002 ldr(rscratch1, dst); 2003 decrement(rscratch1, value); 2004 str(rscratch1, dst); 2005 } 2006 2007 void MacroAssembler::incrementw(Register reg, int value) 2008 { 2009 if (value < 0) { decrementw(reg, -value); return; } 2010 if (value == 0) { return; } 2011 if (value < (1 << 12)) { 
addw(reg, reg, value); return; } 2012 /* else */ { 2013 assert(reg != rscratch2, "invalid dst for register increment"); 2014 movw(rscratch2, (unsigned)value); 2015 addw(reg, reg, rscratch2); 2016 } 2017 } 2018 2019 void MacroAssembler::increment(Register reg, int value) 2020 { 2021 if (value < 0) { decrement(reg, -value); return; } 2022 if (value == 0) { return; } 2023 if (value < (1 << 12)) { add(reg, reg, value); return; } 2024 /* else */ { 2025 assert(reg != rscratch2, "invalid dst for register increment"); 2026 movw(rscratch2, (unsigned)value); 2027 add(reg, reg, rscratch2); 2028 } 2029 } 2030 2031 void MacroAssembler::incrementw(Address dst, int value) 2032 { 2033 assert(!dst.uses(rscratch1), "invalid dst for address increment"); 2034 if (dst.getMode() == Address::literal) { 2035 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 2036 lea(rscratch2, dst); 2037 dst = Address(rscratch2); 2038 } 2039 ldrw(rscratch1, dst); 2040 incrementw(rscratch1, value); 2041 strw(rscratch1, dst); 2042 } 2043 2044 void MacroAssembler::increment(Address dst, int value) 2045 { 2046 assert(!dst.uses(rscratch1), "invalid dst for address increment"); 2047 if (dst.getMode() == Address::literal) { 2048 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 2049 lea(rscratch2, dst); 2050 dst = Address(rscratch2); 2051 } 2052 ldr(rscratch1, dst); 2053 increment(rscratch1, value); 2054 str(rscratch1, dst); 2055 } 2056 2057 2058 void MacroAssembler::pusha() { 2059 push(0x7fffffff, sp); 2060 } 2061 2062 void MacroAssembler::popa() { 2063 pop(0x7fffffff, sp); 2064 } 2065 2066 // Push lots of registers in the bit set supplied. Don't push sp. 2067 // Return the number of words pushed 2068 int MacroAssembler::push(unsigned int bitset, Register stack) { 2069 int words_pushed = 0; 2070 2071 // Scan bitset to accumulate register pairs 2072 unsigned char regs[32]; 2073 int count = 0; 2074 for (int reg = 0; reg <= 30; reg++) { 2075 if (1 & bitset) 2076 regs[count++] = reg; 2077 bitset >>= 1; 2078 } 2079 regs[count++] = zr->encoding_nocheck(); 2080 count &= ~1; // Only push an even number of regs 2081 2082 if (count) { 2083 stp(as_Register(regs[0]), as_Register(regs[1]), 2084 Address(pre(stack, -count * wordSize))); 2085 words_pushed += 2; 2086 } 2087 for (int i = 2; i < count; i += 2) { 2088 stp(as_Register(regs[i]), as_Register(regs[i+1]), 2089 Address(stack, i * wordSize)); 2090 words_pushed += 2; 2091 } 2092 2093 assert(words_pushed == count, "oops, pushed != count"); 2094 2095 return count; 2096 } 2097 2098 int MacroAssembler::pop(unsigned int bitset, Register stack) { 2099 int words_pushed = 0; 2100 2101 // Scan bitset to accumulate register pairs 2102 unsigned char regs[32]; 2103 int count = 0; 2104 for (int reg = 0; reg <= 30; reg++) { 2105 if (1 & bitset) 2106 regs[count++] = reg; 2107 bitset >>= 1; 2108 } 2109 regs[count++] = zr->encoding_nocheck(); 2110 count &= ~1; 2111 2112 for (int i = 2; i < count; i += 2) { 2113 ldp(as_Register(regs[i]), as_Register(regs[i+1]), 2114 Address(stack, i * wordSize)); 2115 words_pushed += 2; 2116 } 2117 if (count) { 2118 ldp(as_Register(regs[0]), as_Register(regs[1]), 2119 Address(post(stack, count * wordSize))); 2120 words_pushed += 2; 2121 } 2122 2123 assert(words_pushed == count, "oops, pushed != count"); 2124 2125 return count; 2126 } 2127 #ifdef ASSERT 2128 void MacroAssembler::verify_heapbase(const char* msg) { 2129 #if 0 2130 assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed"); 2131 assert
(Universe::heap() != NULL, "java heap should be initialized"); 2132 if (CheckCompressedOops) { 2133 Label ok; 2134 push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1 2135 cmpptr(rheapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr())); 2136 br(Assembler::EQ, ok); 2137 stop(msg); 2138 bind(ok); 2139 pop(1 << rscratch1->encoding(), sp); 2140 } 2141 #endif 2142 } 2143 #endif 2144 2145 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) { 2146 Label done, not_weak; 2147 cbz(value, done); // Use NULL as-is. 2148 2149 STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u); 2150 tbz(value, 0, not_weak); // Test for jweak tag. 2151 2152 // Resolve jweak. 2153 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value, 2154 Address(value, -JNIHandles::weak_tag_value), tmp, thread); 2155 verify_oop(value); 2156 b(done); 2157 2158 bind(not_weak); 2159 // Resolve (untagged) jobject. 2160 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread); 2161 verify_oop(value); 2162 bind(done); 2163 } 2164 2165 void MacroAssembler::stop(const char* msg) { 2166 address ip = pc(); 2167 pusha(); 2168 mov(c_rarg0, (address)msg); 2169 mov(c_rarg1, (address)ip); 2170 mov(c_rarg2, sp); 2171 mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 2172 // call(c_rarg3); 2173 blrt(c_rarg3, 3, 0, 1); 2174 hlt(0); 2175 } 2176 2177 void MacroAssembler::warn(const char* msg) { 2178 pusha(); 2179 mov(c_rarg0, (address)msg); 2180 mov(lr, CAST_FROM_FN_PTR(address, warning)); 2181 blrt(lr, 1, 0, MacroAssembler::ret_type_void); 2182 popa(); 2183 } 2184 2185 void MacroAssembler::unimplemented(const char* what) { 2186 const char* buf = NULL; 2187 { 2188 ResourceMark rm; 2189 stringStream ss; 2190 ss.print("unimplemented: %s", what); 2191 buf = code_string(ss.as_string()); 2192 } 2193 stop(buf); 2194 } 2195 2196 // If a constant does not fit in an immediate field, generate some 2197 // number of MOV instructions and then perform the operation. 2198 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm, 2199 add_sub_imm_insn insn1, 2200 add_sub_reg_insn insn2) { 2201 assert(Rd != zr, "Rd = zr and not setting flags?"); 2202 if (operand_valid_for_add_sub_immediate((int)imm)) { 2203 (this->*insn1)(Rd, Rn, imm); 2204 } else { 2205 if (uabs(imm) < (1 << 24)) { 2206 (this->*insn1)(Rd, Rn, imm & -(1 << 12)); 2207 (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1)); 2208 } else { 2209 assert_different_registers(Rd, Rn); 2210 mov(Rd, (uint64_t)imm); 2211 (this->*insn2)(Rd, Rn, Rd, LSL, 0); 2212 } 2213 } 2214 } 2215 2216 // Separate version which sets the flags. Optimisations are more restricted 2217 // because we must set the flags correctly.
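// A sketch of the expansion below (illustrative only): with an
// immediate that cannot be encoded,
//
//   adds Rd, Rn, #imm
//
// is emitted as
//
//   mov  Rd, #imm              // one or more MOVZ/MOVN/MOVK
//   adds Rd, Rn, Rd, LSL #0
//
// The split into two 12-bit chunks used by the non-flag-setting
// version above is not possible here, since the first ADDS/SUBS
// would set the flags from a partial result.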
2218 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm, 2219 add_sub_imm_insn insn1, 2220 add_sub_reg_insn insn2) { 2221 if (operand_valid_for_add_sub_immediate((int)imm)) { 2222 (this->*insn1)(Rd, Rn, imm); 2223 } else { 2224 assert_different_registers(Rd, Rn); 2225 assert(Rd != zr, "overflow in immediate operand"); 2226 mov(Rd, (uint64_t)imm); 2227 (this->*insn2)(Rd, Rn, Rd, LSL, 0); 2228 } 2229 } 2230 2231 2232 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) { 2233 if (increment.is_register()) { 2234 add(Rd, Rn, increment.as_register()); 2235 } else { 2236 add(Rd, Rn, increment.as_constant()); 2237 } 2238 } 2239 2240 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) { 2241 if (increment.is_register()) { 2242 addw(Rd, Rn, increment.as_register()); 2243 } else { 2244 addw(Rd, Rn, increment.as_constant()); 2245 } 2246 } 2247 2248 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) { 2249 if (decrement.is_register()) { 2250 sub(Rd, Rn, decrement.as_register()); 2251 } else { 2252 sub(Rd, Rn, decrement.as_constant()); 2253 } 2254 } 2255 2256 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) { 2257 if (decrement.is_register()) { 2258 subw(Rd, Rn, decrement.as_register()); 2259 } else { 2260 subw(Rd, Rn, decrement.as_constant()); 2261 } 2262 } 2263 2264 void MacroAssembler::reinit_heapbase() 2265 { 2266 if (UseCompressedOops) { 2267 if (Universe::is_fully_initialized()) { 2268 mov(rheapbase, CompressedOops::ptrs_base()); 2269 } else { 2270 lea(rheapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr())); 2271 ldr(rheapbase, Address(rheapbase)); 2272 } 2273 } 2274 } 2275 2276 // this simulates the behaviour of the x86 cmpxchg instruction using a 2277 // load linked/store conditional pair. we use the acquire/release 2278 // versions of these instructions so that we flush pending writes as 2279 // per Java semantics. 2280 2281 // n.b. the x86 version assumes the old value to be compared against is 2282 // in rax and updates rax with the value located in memory if the 2283 // cmpxchg fails. we supply a register for the old value explicitly 2284 2285 // the aarch64 load linked/store conditional instructions do not 2286 // accept an offset. so, unlike x86, we must provide a plain register 2287 // to identify the memory word to be compared/exchanged rather than a 2288 // register+offset Address. 2289 2290 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp, 2291 Label &succeed, Label *fail) { 2292 // oldv holds comparison value 2293 // newv holds value to write in exchange 2294 // addr identifies memory word to compare against/update 2295 if (UseLSE) { 2296 mov(tmp, oldv); 2297 casal(Assembler::xword, oldv, newv, addr); 2298 cmp(tmp, oldv); 2299 br(Assembler::EQ, succeed); 2300 membar(AnyAny); 2301 } else { 2302 Label retry_load, nope; 2303 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) 2304 prfm(Address(addr), PSTL1STRM); 2305 bind(retry_load); 2306 // flush and load exclusive from the memory location 2307 // and fail if it is not what we expect 2308 ldaxr(tmp, addr); 2309 cmp(tmp, oldv); 2310 br(Assembler::NE, nope); 2311 // if we store+flush with no intervening write tmp will be zero 2312 stlxr(tmp, newv, addr); 2313 cbzw(tmp, succeed); 2314 // retry so we only ever return after a load fails to compare 2315 // ensures we don't return a stale value after a failed write.
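// (a value mismatch exits via 'nope' above; only a failed
// store-exclusive falls through to this retry)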
2316 b(retry_load); 2317 // if the memory word differs we return it in oldv and signal a fail 2318 bind(nope); 2319 membar(AnyAny); 2320 mov(oldv, tmp); 2321 } 2322 if (fail) 2323 b(*fail); 2324 } 2325 2326 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp, 2327 Label &succeed, Label *fail) { 2328 assert(oopDesc::mark_offset_in_bytes() == 0, "assumption"); 2329 cmpxchgptr(oldv, newv, obj, tmp, succeed, fail); 2330 } 2331 2332 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp, 2333 Label &succeed, Label *fail) { 2334 // oldv holds comparison value 2335 // newv holds value to write in exchange 2336 // addr identifies memory word to compare against/update 2337 // tmp returns 0/1 for success/failure 2338 if (UseLSE) { 2339 mov(tmp, oldv); 2340 casal(Assembler::word, oldv, newv, addr); 2341 cmp(tmp, oldv); 2342 br(Assembler::EQ, succeed); 2343 membar(AnyAny); 2344 } else { 2345 Label retry_load, nope; 2346 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) 2347 prfm(Address(addr), PSTL1STRM); 2348 bind(retry_load); 2349 // flush and load exclusive from the memory location 2350 // and fail if it is not what we expect 2351 ldaxrw(tmp, addr); 2352 cmp(tmp, oldv); 2353 br(Assembler::NE, nope); 2354 // if we store+flush with no intervening write tmp will be zero 2355 stlxrw(tmp, newv, addr); 2356 cbzw(tmp, succeed); 2357 // retry so we only ever return after a load fails to compare 2358 // ensures we don't return a stale value after a failed write. 2359 b(retry_load); 2360 // if the memory word differs we return it in oldv and signal a fail 2361 bind(nope); 2362 membar(AnyAny); 2363 mov(oldv, tmp); 2364 } 2365 if (fail) 2366 b(*fail); 2367 } 2368 2369 // A generic CAS; success or failure is in the EQ flag. A weak CAS 2370 // doesn't retry and may fail spuriously. If the oldval is wanted, 2371 // pass a register for the result, otherwise pass noreg. 2372 2373 // Clobbers rscratch1 2374 void MacroAssembler::cmpxchg(Register addr, Register expected, 2375 Register new_val, 2376 enum operand_size size, 2377 bool acquire, bool release, 2378 bool weak, 2379 Register result) { 2380 if (result == noreg) result = rscratch1; 2381 BLOCK_COMMENT("cmpxchg {"); 2382 if (UseLSE) { 2383 mov(result, expected); 2384 lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true); 2385 compare_eq(result, expected, size); 2386 } else { 2387 Label retry_load, done; 2388 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) 2389 prfm(Address(addr), PSTL1STRM); 2390 bind(retry_load); 2391 load_exclusive(result, addr, size, acquire); 2392 compare_eq(result, expected, size); 2393 br(Assembler::NE, done); 2394 store_exclusive(rscratch1, new_val, addr, size, release); 2395 if (weak) { 2396 cmpw(rscratch1, 0u); // If the store fails, return NE to our caller. 2397 } else { 2398 cbnzw(rscratch1, retry_load); 2399 } 2400 bind(done); 2401 } 2402 BLOCK_COMMENT("} cmpxchg"); 2403 } 2404 2405 // A generic comparison. Only compares for equality, clobbers rscratch1.
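// There is no 16- or 8-bit CMP on AArch64, so for the narrow sizes
// equality is computed by XORing the operands and masking the result
// to the operand width; a sketch of the halfword case emitted below:
//
//   eorw rscratch1, rm, rn
//   ands zr, rscratch1, #0xffff   // EQ set iff the low 16 bits match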
2406 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) { 2407 if (size == xword) { 2408 cmp(rm, rn); 2409 } else if (size == word) { 2410 cmpw(rm, rn); 2411 } else if (size == halfword) { 2412 eorw(rscratch1, rm, rn); 2413 ands(zr, rscratch1, 0xffff); 2414 } else if (size == byte) { 2415 eorw(rscratch1, rm, rn); 2416 ands(zr, rscratch1, 0xff); 2417 } else { 2418 ShouldNotReachHere(); 2419 } 2420 } 2421 2422 2423 static bool different(Register a, RegisterOrConstant b, Register c) { 2424 if (b.is_constant()) 2425 return a != c; 2426 else 2427 return a != b.as_register() && a != c && b.as_register() != c; 2428 } 2429 2430 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz) \ 2431 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \ 2432 if (UseLSE) { \ 2433 prev = prev->is_valid() ? prev : zr; \ 2434 if (incr.is_register()) { \ 2435 AOP(sz, incr.as_register(), prev, addr); \ 2436 } else { \ 2437 mov(rscratch2, incr.as_constant()); \ 2438 AOP(sz, rscratch2, prev, addr); \ 2439 } \ 2440 return; \ 2441 } \ 2442 Register result = rscratch2; \ 2443 if (prev->is_valid()) \ 2444 result = different(prev, incr, addr) ? prev : rscratch2; \ 2445 \ 2446 Label retry_load; \ 2447 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) \ 2448 prfm(Address(addr), PSTL1STRM); \ 2449 bind(retry_load); \ 2450 LDXR(result, addr); \ 2451 OP(rscratch1, result, incr); \ 2452 STXR(rscratch2, rscratch1, addr); \ 2453 cbnzw(rscratch2, retry_load); \ 2454 if (prev->is_valid() && prev != result) { \ 2455 IOP(prev, rscratch1, incr); \ 2456 } \ 2457 } 2458 2459 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword) 2460 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word) 2461 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword) 2462 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word) 2463 2464 #undef ATOMIC_OP 2465 2466 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz) \ 2467 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \ 2468 if (UseLSE) { \ 2469 prev = prev->is_valid() ? prev : zr; \ 2470 AOP(sz, newv, prev, addr); \ 2471 return; \ 2472 } \ 2473 Register result = rscratch2; \ 2474 if (prev->is_valid()) \ 2475 result = different(prev, newv, addr) ? 
prev : rscratch2; \ 2476 \ 2477 Label retry_load; \ 2478 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) \ 2479 prfm(Address(addr), PSTL1STRM); \ 2480 bind(retry_load); \ 2481 LDXR(result, addr); \ 2482 STXR(rscratch1, newv, addr); \ 2483 cbnzw(rscratch1, retry_load); \ 2484 if (prev->is_valid() && prev != result) \ 2485 mov(prev, result); \ 2486 } 2487 2488 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword) 2489 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word) 2490 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword) 2491 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word) 2492 2493 #undef ATOMIC_XCHG 2494 2495 #ifndef PRODUCT 2496 extern "C" void findpc(intptr_t x); 2497 #endif 2498 2499 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) 2500 { 2501 // In order to get locks to work, we need to fake an in_VM state 2502 if (ShowMessageBoxOnError) { 2503 JavaThread* thread = JavaThread::current(); 2504 JavaThreadState saved_state = thread->thread_state(); 2505 thread->set_thread_state(_thread_in_vm); 2506 #ifndef PRODUCT 2507 if (CountBytecodes || TraceBytecodes || StopInterpreterAt) { 2508 ttyLocker ttyl; 2509 BytecodeCounter::print(); 2510 } 2511 #endif 2512 if (os::message_box(msg, "Execution stopped, print registers?")) { 2513 ttyLocker ttyl; 2514 tty->print_cr(" pc = 0x%016lx", pc); 2515 #ifndef PRODUCT 2516 tty->cr(); 2517 findpc(pc); 2518 tty->cr(); 2519 #endif 2520 tty->print_cr(" r0 = 0x%016lx", regs[0]); 2521 tty->print_cr(" r1 = 0x%016lx", regs[1]); 2522 tty->print_cr(" r2 = 0x%016lx", regs[2]); 2523 tty->print_cr(" r3 = 0x%016lx", regs[3]); 2524 tty->print_cr(" r4 = 0x%016lx", regs[4]); 2525 tty->print_cr(" r5 = 0x%016lx", regs[5]); 2526 tty->print_cr(" r6 = 0x%016lx", regs[6]); 2527 tty->print_cr(" r7 = 0x%016lx", regs[7]); 2528 tty->print_cr(" r8 = 0x%016lx", regs[8]); 2529 tty->print_cr(" r9 = 0x%016lx", regs[9]); 2530 tty->print_cr("r10 = 0x%016lx", regs[10]); 2531 tty->print_cr("r11 = 0x%016lx", regs[11]); 2532 tty->print_cr("r12 = 0x%016lx", regs[12]); 2533 tty->print_cr("r13 = 0x%016lx", regs[13]); 2534 tty->print_cr("r14 = 0x%016lx", regs[14]); 2535 tty->print_cr("r15 = 0x%016lx", regs[15]); 2536 tty->print_cr("r16 = 0x%016lx", regs[16]); 2537 tty->print_cr("r17 = 0x%016lx", regs[17]); 2538 tty->print_cr("r18 = 0x%016lx", regs[18]); 2539 tty->print_cr("r19 = 0x%016lx", regs[19]); 2540 tty->print_cr("r20 = 0x%016lx", regs[20]); 2541 tty->print_cr("r21 = 0x%016lx", regs[21]); 2542 tty->print_cr("r22 = 0x%016lx", regs[22]); 2543 tty->print_cr("r23 = 0x%016lx", regs[23]); 2544 tty->print_cr("r24 = 0x%016lx", regs[24]); 2545 tty->print_cr("r25 = 0x%016lx", regs[25]); 2546 tty->print_cr("r26 = 0x%016lx", regs[26]); 2547 tty->print_cr("r27 = 0x%016lx", regs[27]); 2548 tty->print_cr("r28 = 0x%016lx", regs[28]); 2549 tty->print_cr("r30 = 0x%016lx", regs[30]); 2550 tty->print_cr("r31 = 0x%016lx", regs[31]); 2551 BREAKPOINT; 2552 } 2553 ThreadStateTransition::transition(thread, _thread_in_vm, saved_state); 2554 } else { 2555 ttyLocker ttyl; 2556 ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", 2557 msg); 2558 assert(false, "DEBUG MESSAGE: %s", msg); 2559 } 2560 } 2561 2562 #ifdef BUILTIN_SIM 2563 // routine to generate an x86 prolog for a stub function which 2564 // bootstraps into the generated ARM code which directly follows the 2565 // stub 2566 // 2567 // the argument encodes the number of general and fp registers 2568 // passed by the caller and the calling convention (currently just 2569 // the number
of general registers and assumes C argument passing) 2570 2571 extern "C" { 2572 int aarch64_stub_prolog_size(); 2573 void aarch64_stub_prolog(); 2574 void aarch64_prolog(); 2575 } 2576 2577 void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type, 2578 address *prolog_ptr) 2579 { 2580 int calltype = (((ret_type & 0x3) << 8) | 2581 ((fp_arg_count & 0xf) << 4) | 2582 (gp_arg_count & 0xf)); 2583 2584 // the addresses for the x86 to ARM entry code we need to use 2585 address start = pc(); 2586 // printf("start = %lx\n", start); 2587 int byteCount = aarch64_stub_prolog_size(); 2588 // printf("byteCount = %x\n", byteCount); 2589 int instructionCount = (byteCount + 3)/ 4; 2590 // printf("instructionCount = %x\n", instructionCount); 2591 for (int i = 0; i < instructionCount; i++) { 2592 nop(); 2593 } 2594 2595 memcpy(start, (void*)aarch64_stub_prolog, byteCount); 2596 2597 // write the address of the setup routine and the call format at the 2598 // end of the copied code 2599 u_int64_t *patch_end = (u_int64_t *)(start + byteCount); 2600 if (prolog_ptr) 2601 patch_end[-2] = (u_int64_t)prolog_ptr; 2602 patch_end[-1] = calltype; 2603 } 2604 #endif 2605 2606 void MacroAssembler::push_call_clobbered_registers() { 2607 int step = 4 * wordSize; 2608 push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp); 2609 sub(sp, sp, step); 2610 mov(rscratch1, -step); 2611 // Push v0-v7, v16-v31. 2612 for (int i = 31; i >= 4; i -= 4) { 2613 if (i <= v7->encoding() || i >= v16->encoding()) 2614 st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1), 2615 as_FloatRegister(i), T1D, Address(post(sp, rscratch1))); 2616 } 2617 st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2), 2618 as_FloatRegister(3), T1D, Address(sp)); 2619 } 2620 2621 void MacroAssembler::pop_call_clobbered_registers() { 2622 for (int i = 0; i < 32; i += 4) { 2623 if (i <= v7->encoding() || i >= v16->encoding()) 2624 ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), 2625 as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize))); 2626 } 2627 2628 pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp); 2629 } 2630 2631 void MacroAssembler::push_CPU_state(bool save_vectors) { 2632 int step = (save_vectors ? 8 : 4) * wordSize; 2633 push(0x3fffffff, sp); // integer registers except lr & sp 2634 mov(rscratch1, -step); 2635 sub(sp, sp, step); 2636 for (int i = 28; i >= 4; i -= 4) { 2637 st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), 2638 as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1))); 2639 } 2640 st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp); 2641 } 2642 2643 void MacroAssembler::pop_CPU_state(bool restore_vectors) { 2644 int step = (restore_vectors ? 8 : 4) * wordSize; 2645 for (int i = 0; i <= 28; i += 4) 2646 ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), 2647 as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step))); 2648 pop(0x3fffffff, sp); // integer registers except lr & sp 2649 } 2650 2651 /** 2652 * Helpers for multiply_to_len(). 2653 */ 2654 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo, 2655 Register src1, Register src2) { 2656 adds(dest_lo, dest_lo, src1); 2657 adc(dest_hi, dest_hi, zr); 2658 adds(dest_lo, dest_lo, src2); 2659 adc(final_dest_hi, dest_hi, zr); 2660 } 2661 2662 // Generate an address from (r + r1 extend offset). "size" is the 2663 // size of the operand.
The result may be in rscratch2. 2664 Address MacroAssembler::offsetted_address(Register r, Register r1, 2665 Address::extend ext, int offset, int size) { 2666 if (offset || (ext.shift() % size != 0)) { 2667 lea(rscratch2, Address(r, r1, ext)); 2668 return Address(rscratch2, offset); 2669 } else { 2670 return Address(r, r1, ext); 2671 } 2672 } 2673 2674 Address MacroAssembler::spill_address(int size, int offset, Register tmp) 2675 { 2676 assert(offset >= 0, "spill to negative address?"); 2677 // Offset reachable ? 2678 // Not aligned - 9 bits signed offset 2679 // Aligned - 12 bits unsigned offset shifted 2680 Register base = sp; 2681 if ((offset & (size-1)) && offset >= (1<<8)) { 2682 add(tmp, base, offset & ((1<<12)-1)); 2683 base = tmp; 2684 offset &= -1u<<12; 2685 } 2686 2687 if (offset >= (1<<12) * size) { 2688 add(tmp, base, offset & (((1<<12)-1)<<12)); 2689 base = tmp; 2690 offset &= ~(((1<<12)-1)<<12); 2691 } 2692 2693 return Address(base, offset); 2694 } 2695 2696 // Checks whether offset is aligned. 2697 // Returns true if it is, else false. 2698 bool MacroAssembler::merge_alignment_check(Register base, 2699 size_t size, 2700 long cur_offset, 2701 long prev_offset) const { 2702 if (AvoidUnalignedAccesses) { 2703 if (base == sp) { 2704 // Checks whether the low offset is aligned to a pair of registers. 2705 long pair_mask = size * 2 - 1; 2706 long offset = prev_offset > cur_offset ? cur_offset : prev_offset; 2707 return (offset & pair_mask) == 0; 2708 } else { // If base is not sp, we can't guarantee the access is aligned. 2709 return false; 2710 } 2711 } else { 2712 long mask = size - 1; 2713 // Load/store pair instruction only supports element size aligned offset. 2714 return (cur_offset & mask) == 0 && (prev_offset & mask) == 0; 2715 } 2716 } 2717 2718 // Checks whether current and previous loads/stores can be merged. 2719 // Returns true if it can be merged, else false. 2720 bool MacroAssembler::ldst_can_merge(Register rt, 2721 const Address &adr, 2722 size_t cur_size_in_bytes, 2723 bool is_store) const { 2724 address prev = pc() - NativeInstruction::instruction_size; 2725 address last = code()->last_insn(); 2726 2727 if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) { 2728 return false; 2729 } 2730 2731 if (adr.getMode() != Address::base_plus_offset || prev != last) { 2732 return false; 2733 } 2734 2735 NativeLdSt* prev_ldst = NativeLdSt_at(prev); 2736 size_t prev_size_in_bytes = prev_ldst->size_in_bytes(); 2737 2738 assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging."); 2739 assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging."); 2740 2741 if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) { 2742 return false; 2743 } 2744 2745 long max_offset = 63 * prev_size_in_bytes; 2746 long min_offset = -64 * prev_size_in_bytes; 2747 2748 assert(prev_ldst->is_not_pre_post_index(), "pre-index or post-index is not supported to be merged."); 2749 2750 // Only same base can be merged.
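// e.g. (an illustrative pair, not from generated code):
//   ldr x1, [sp, #8]
//   ldr x2, [x5, #16]
// must stay two separate loads, whatever the offsets are.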
2751 if (adr.base() != prev_ldst->base()) { 2752 return false; 2753 } 2754 2755 long cur_offset = adr.offset(); 2756 long prev_offset = prev_ldst->offset(); 2757 size_t diff = abs(cur_offset - prev_offset); 2758 if (diff != prev_size_in_bytes) { 2759 return false; 2760 } 2761 2762 // The following cases cannot be merged: 2763 // ldr x2, [x2, #8] 2764 // ldr x3, [x2, #16] 2765 // or: 2766 // ldr x2, [x3, #8] 2767 // ldr x2, [x3, #16] 2768 // If t1 and t2 are the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL. 2769 if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) { 2770 return false; 2771 } 2772 2773 long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset; 2774 // Offset range must be in ldp/stp instruction's range. 2775 if (low_offset > max_offset || low_offset < min_offset) { 2776 return false; 2777 } 2778 2779 if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) { 2780 return true; 2781 } 2782 2783 return false; 2784 } 2785 2786 // Merge current load/store with previous load/store into ldp/stp. 2787 void MacroAssembler::merge_ldst(Register rt, 2788 const Address &adr, 2789 size_t cur_size_in_bytes, 2790 bool is_store) { 2791 2792 assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store) == true, "cur and prev must be able to be merged."); 2793 2794 Register rt_low, rt_high; 2795 address prev = pc() - NativeInstruction::instruction_size; 2796 NativeLdSt* prev_ldst = NativeLdSt_at(prev); 2797 2798 long offset; 2799 2800 if (adr.offset() < prev_ldst->offset()) { 2801 offset = adr.offset(); 2802 rt_low = rt; 2803 rt_high = prev_ldst->target(); 2804 } else { 2805 offset = prev_ldst->offset(); 2806 rt_low = prev_ldst->target(); 2807 rt_high = rt; 2808 } 2809 2810 Address adr_p = Address(prev_ldst->base(), offset); 2811 // Overwrite previously generated binary. 2812 code_section()->set_end(prev); 2813 2814 const int sz = prev_ldst->size_in_bytes(); 2815 assert(sz == 8 || sz == 4, "only supports 64/32bit merging."); 2816 if (!is_store) { 2817 BLOCK_COMMENT("merged ldr pair"); 2818 if (sz == 8) { 2819 ldp(rt_low, rt_high, adr_p); 2820 } else { 2821 ldpw(rt_low, rt_high, adr_p); 2822 } 2823 } else { 2824 BLOCK_COMMENT("merged str pair"); 2825 if (sz == 8) { 2826 stp(rt_low, rt_high, adr_p); 2827 } else { 2828 stpw(rt_low, rt_high, adr_p); 2829 } 2830 } 2831 } 2832 2833 /** 2834 * Multiply 64 bit by 64 bit first loop.
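 *
 * A note on word order (this just summarizes the ror usage below):
 * the int arrays are in BigInteger's big-endian word order, so each
 * 64-bit load of an int pair is rotated by 32 bits to form the
 * little-endian jlong the multiply expects, and rotated back before
 * the result is stored.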
2835 */ 2836 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart, 2837 Register y, Register y_idx, Register z, 2838 Register carry, Register product, 2839 Register idx, Register kdx) { 2840 // 2841 // jlong carry, x[], y[], z[]; 2842 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) { 2843 // huge_128 product = y[idx] * x[xstart] + carry; 2844 // z[kdx] = (jlong)product; 2845 // carry = (jlong)(product >>> 64); 2846 // } 2847 // z[xstart] = carry; 2848 // 2849 2850 Label L_first_loop, L_first_loop_exit; 2851 Label L_one_x, L_one_y, L_multiply; 2852 2853 subsw(xstart, xstart, 1); 2854 br(Assembler::MI, L_one_x); 2855 2856 lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt))); 2857 ldr(x_xstart, Address(rscratch1)); 2858 ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian 2859 2860 bind(L_first_loop); 2861 subsw(idx, idx, 1); 2862 br(Assembler::MI, L_first_loop_exit); 2863 subsw(idx, idx, 1); 2864 br(Assembler::MI, L_one_y); 2865 lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt))); 2866 ldr(y_idx, Address(rscratch1)); 2867 ror(y_idx, y_idx, 32); // convert big-endian to little-endian 2868 bind(L_multiply); 2869 2870 // AArch64 has a multiply-accumulate instruction that we can't use 2871 // here because it has no way to process carries, so we have to use 2872 // separate add and adc instructions. Bah. 2873 umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product 2874 mul(product, x_xstart, y_idx); 2875 adds(product, product, carry); 2876 adc(carry, rscratch1, zr); // x_xstart * y_idx + carry -> carry:product 2877 2878 subw(kdx, kdx, 2); 2879 ror(product, product, 32); // back to big-endian 2880 str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong)); 2881 2882 b(L_first_loop); 2883 2884 bind(L_one_y); 2885 ldrw(y_idx, Address(y, 0)); 2886 b(L_multiply); 2887 2888 bind(L_one_x); 2889 ldrw(x_xstart, Address(x, 0)); 2890 b(L_first_loop); 2891 2892 bind(L_first_loop_exit); 2893 } 2894 2895 /** 2896 * Multiply 128 bit by 128 bit. Unrolled inner loop. 2897
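 * Each iteration consumes four 32-bit words of y (see the
 * 'subw(idx, idx, 4)' below), pairing them into two 64-bit
 * multiplications by product_hi.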
2897 * 2898 */ 2899 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z, 2900 Register carry, Register carry2, 2901 Register idx, Register jdx, 2902 Register yz_idx1, Register yz_idx2, 2903 Register tmp, Register tmp3, Register tmp4, 2904 Register tmp6, Register product_hi) { 2905 2906 // jlong carry, x[], y[], z[]; 2907 // int kdx = ystart+1; 2908 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 2909 // huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry; 2910 // jlong carry2 = (jlong)(tmp3 >>> 64); 2911 // huge_128 tmp4 = (y[idx] * product_hi) + z[kdx+idx] + carry2; 2912 // carry = (jlong)(tmp4 >>> 64); 2913 // z[kdx+idx+1] = (jlong)tmp3; 2914 // z[kdx+idx] = (jlong)tmp4; 2915 // } 2916 // idx += 2; 2917 // if (idx > 0) { 2918 // yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry; 2919 // z[kdx+idx] = (jlong)yz_idx1; 2920 // carry = (jlong)(yz_idx1 >>> 64); 2921 // } 2922 // 2923 2924 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 2925 2926 lsrw(jdx, idx, 2); 2927 2928 bind(L_third_loop); 2929 2930 subsw(jdx, jdx, 1); 2931 br(Assembler::MI, L_third_loop_exit); 2932 subw(idx, idx, 4); 2933 2934 lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt))); 2935 2936 ldp(yz_idx2, yz_idx1, Address(rscratch1, 0)); 2937 2938 lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt))); 2939 2940 ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian 2941 ror(yz_idx2, yz_idx2, 32); 2942 2943 ldp(rscratch2, rscratch1, Address(tmp6, 0)); 2944 2945 mul(tmp3, product_hi, yz_idx1); // yz_idx1 * product_hi -> tmp4:tmp3 2946 umulh(tmp4, product_hi, yz_idx1); 2947 2948 ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian 2949 ror(rscratch2, rscratch2, 32); 2950 2951 mul(tmp, product_hi, yz_idx2); // yz_idx2 * product_hi -> carry2:tmp 2952 umulh(carry2, product_hi, yz_idx2); 2953 2954 // propagate sum of both multiplications into carry:tmp4:tmp3 2955 adds(tmp3, tmp3, carry); 2956 adc(tmp4, tmp4, zr); 2957 adds(tmp3, tmp3, rscratch1); 2958 adcs(tmp4, tmp4, tmp); 2959 adc(carry, carry2, zr); 2960 adds(tmp4, tmp4, rscratch2); 2961 adc(carry, carry, zr); 2962 2963 ror(tmp3, tmp3, 32); // convert little-endian to big-endian 2964 ror(tmp4, tmp4, 32); 2965 stp(tmp4, tmp3, Address(tmp6, 0)); 2966 2967 b(L_third_loop); 2968 bind (L_third_loop_exit); 2969 2970 andw (idx, idx, 0x3); 2971 cbz(idx, L_post_third_loop_done); 2972 2973 Label L_check_1; 2974 subsw(idx, idx, 2); 2975 br(Assembler::MI, L_check_1); 2976 2977 lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt))); 2978 ldr(yz_idx1, Address(rscratch1, 0)); 2979 ror(yz_idx1, yz_idx1, 32); 2980 mul(tmp3, product_hi, yz_idx1); // yz_idx1 * product_hi -> tmp4:tmp3 2981 umulh(tmp4, product_hi, yz_idx1); 2982 lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt))); 2983 ldr(yz_idx2, Address(rscratch1, 0)); 2984 ror(yz_idx2, yz_idx2, 32); 2985 2986 add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2); 2987 2988 ror(tmp3, tmp3, 32); 2989 str(tmp3, Address(rscratch1, 0)); 2990 2991 bind (L_check_1); 2992 2993 andw (idx, idx, 0x1); 2994 subsw(idx, idx, 1); 2995 br(Assembler::MI, L_post_third_loop_done); 2996 ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt))); 2997 mul(tmp3, tmp4, product_hi); // tmp4 * product_hi -> carry2:tmp3 2998 umulh(carry2, tmp4, product_hi); 2999 ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt))); 3000 3001 add2_with_carry(carry2, tmp3, tmp4, carry); 3002 3003 strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt))); 3004 
extr(carry, carry2, tmp3, 32); 3005 3006 bind(L_post_third_loop_done); 3007 } 3008 3009 /** 3010 * Code for BigInteger::multiplyToLen() intrinsic. 3011 * 3012 * r0: x 3013 * r1: xlen 3014 * r2: y 3015 * r3: ylen 3016 * r4: z 3017 * r5: zlen 3018 * r10: tmp1 3019 * r11: tmp2 3020 * r12: tmp3 3021 * r13: tmp4 3022 * r14: tmp5 3023 * r15: tmp6 3024 * r16: tmp7 3025 * 3026 */ 3027 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, 3028 Register z, Register zlen, 3029 Register tmp1, Register tmp2, Register tmp3, Register tmp4, 3030 Register tmp5, Register tmp6, Register product_hi) { 3031 3032 assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6); 3033 3034 const Register idx = tmp1; 3035 const Register kdx = tmp2; 3036 const Register xstart = tmp3; 3037 3038 const Register y_idx = tmp4; 3039 const Register carry = tmp5; 3040 const Register product = xlen; 3041 const Register x_xstart = zlen; // reuse register 3042 3043 // First Loop. 3044 // 3045 // final static long LONG_MASK = 0xffffffffL; 3046 // int xstart = xlen - 1; 3047 // int ystart = ylen - 1; 3048 // long carry = 0; 3049 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) { 3050 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry; 3051 // z[kdx] = (int)product; 3052 // carry = product >>> 32; 3053 // } 3054 // z[xstart] = (int)carry; 3055 // 3056 3057 movw(idx, ylen); // idx = ylen; 3058 movw(kdx, zlen); // kdx = xlen+ylen; 3059 mov(carry, zr); // carry = 0; 3060 3061 Label L_done; 3062 3063 movw(xstart, xlen); 3064 subsw(xstart, xstart, 1); 3065 br(Assembler::MI, L_done); 3066 3067 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx); 3068 3069 Label L_second_loop; 3070 cbzw(kdx, L_second_loop); 3071 3072 Label L_carry; 3073 subw(kdx, kdx, 1); 3074 cbzw(kdx, L_carry); 3075 3076 strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt))); 3077 lsr(carry, carry, 32); 3078 subw(kdx, kdx, 1); 3079 3080 bind(L_carry); 3081 strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt))); 3082 3083 // Second and third (nested) loops.
3084 // 3085 // for (int i = xstart-1; i >= 0; i--) { // Second loop 3086 // carry = 0; 3087 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop 3088 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) + 3089 // (z[k] & LONG_MASK) + carry; 3090 // z[k] = (int)product; 3091 // carry = product >>> 32; 3092 // } 3093 // z[i] = (int)carry; 3094 // } 3095 // 3096 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi 3097 3098 const Register jdx = tmp1; 3099 3100 bind(L_second_loop); 3101 mov(carry, zr); // carry = 0; 3102 movw(jdx, ylen); // j = ystart+1 3103 3104 subsw(xstart, xstart, 1); // i = xstart-1; 3105 br(Assembler::MI, L_done); 3106 3107 str(z, Address(pre(sp, -4 * wordSize))); 3108 3109 Label L_last_x; 3110 lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j 3111 subsw(xstart, xstart, 1); // i = xstart-1; 3112 br(Assembler::MI, L_last_x); 3113 3114 lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt))); 3115 ldr(product_hi, Address(rscratch1)); 3116 ror(product_hi, product_hi, 32); // convert big-endian to little-endian 3117 3118 Label L_third_loop_prologue; 3119 bind(L_third_loop_prologue); 3120 3121 str(ylen, Address(sp, wordSize)); 3122 stp(x, xstart, Address(sp, 2 * wordSize)); 3123 multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product, 3124 tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi); 3125 ldp(z, ylen, Address(post(sp, 2 * wordSize))); 3126 ldp(x, xlen, Address(post(sp, 2 * wordSize))); // copy old xstart -> xlen 3127 3128 addw(tmp3, xlen, 1); 3129 strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt))); 3130 subsw(tmp3, tmp3, 1); 3131 br(Assembler::MI, L_done); 3132 3133 lsr(carry, carry, 32); 3134 strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt))); 3135 b(L_second_loop); 3136 3137 // Infrequently executed code is moved outside the loops. 3138 bind(L_last_x); 3139 ldrw(product_hi, Address(x, 0)); 3140 b(L_third_loop_prologue); 3141 3142 bind(L_done); 3143 } 3144 3145 // Code for BigInteger::mulAdd intrinsic 3146 // out = r0 3147 // in = r1 3148 // offset = r2 (already out.length-offset) 3149 // len = r3 3150 // k = r4 3151 // 3152 // pseudo code from java implementation: 3153 // carry = 0; 3154 // offset = out.length-offset - 1; 3155 // for (int j=len-1; j >= 0; j--) { 3156 // product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry; 3157 // out[offset--] = (int)product; 3158 // carry = product >>> 32; 3159 // } 3160 // return (int)carry; 3161 void MacroAssembler::mul_add(Register out, Register in, Register offset, 3162 Register len, Register k) { 3163 Label LOOP, END; 3164 // pre-loop 3165 cmp(len, zr); // cmp, not cbz/cbnz: to use condition twice => fewer branches 3166 csel(out, zr, out, Assembler::EQ); 3167 br(Assembler::EQ, END); 3168 add(in, in, len, LSL, 2); // in[j+1] address 3169 add(offset, out, offset, LSL, 2); // out[offset + 1] address 3170 mov(out, zr); // used to keep carry now 3171 BIND(LOOP); 3172 ldrw(rscratch1, Address(pre(in, -4))); 3173 madd(rscratch1, rscratch1, k, out); 3174 ldrw(rscratch2, Address(pre(offset, -4))); 3175 add(rscratch1, rscratch1, rscratch2); 3176 strw(rscratch1, Address(offset)); 3177 lsr(out, rscratch1, 32); 3178 subs(len, len, 1); 3179 br(Assembler::NE, LOOP); 3180 BIND(END); 3181 } 3182 3183 /** 3184 * Emits code to update CRC-32 with a byte value according to constants in table 3185 * 3186 * @param [in,out]crc Register containing the crc.
3187 * @param [in]val Register containing the byte to fold into the CRC. 3188 * @param [in]table Register containing the table of crc constants. 3189 * 3190 * uint32_t crc; 3191 * val = crc_table[(val ^ crc) & 0xFF]; 3192 * crc = val ^ (crc >> 8); 3193 * 3194 */ 3195 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 3196 eor(val, val, crc); 3197 andr(val, val, 0xff); 3198 ldrw(val, Address(table, val, Address::lsl(2))); 3199 eor(crc, val, crc, Assembler::LSR, 8); 3200 } 3201 3202 /** 3203 * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3 3204 * 3205 * @param [in,out]crc Register containing the crc. 3206 * @param [in]v Register containing the 32-bit to fold into the CRC. 3207 * @param [in]table0 Register containing table 0 of crc constants. 3208 * @param [in]table1 Register containing table 1 of crc constants. 3209 * @param [in]table2 Register containing table 2 of crc constants. 3210 * @param [in]table3 Register containing table 3 of crc constants. 3211 * 3212 * uint32_t crc; 3213 * v = crc ^ v 3214 * crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24] 3215 * 3216 */ 3217 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp, 3218 Register table0, Register table1, Register table2, Register table3, 3219 bool upper) { 3220 eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0); 3221 uxtb(tmp, v); 3222 ldrw(crc, Address(table3, tmp, Address::lsl(2))); 3223 ubfx(tmp, v, 8, 8); 3224 ldrw(tmp, Address(table2, tmp, Address::lsl(2))); 3225 eor(crc, crc, tmp); 3226 ubfx(tmp, v, 16, 8); 3227 ldrw(tmp, Address(table1, tmp, Address::lsl(2))); 3228 eor(crc, crc, tmp); 3229 ubfx(tmp, v, 24, 8); 3230 ldrw(tmp, Address(table0, tmp, Address::lsl(2))); 3231 eor(crc, crc, tmp); 3232 } 3233 3234 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf, 3235 Register len, Register tmp0, Register tmp1, Register tmp2, 3236 Register tmp3) { 3237 Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit; 3238 assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3); 3239 3240 mvnw(crc, crc); 3241 3242 subs(len, len, 128); 3243 br(Assembler::GE, CRC_by64_pre); 3244 BIND(CRC_less64); 3245 adds(len, len, 128-32); 3246 br(Assembler::GE, CRC_by32_loop); 3247 BIND(CRC_less32); 3248 adds(len, len, 32-4); 3249 br(Assembler::GE, CRC_by4_loop); 3250 adds(len, len, 4); 3251 br(Assembler::GT, CRC_by1_loop); 3252 b(L_exit); 3253 3254 BIND(CRC_by32_loop); 3255 ldp(tmp0, tmp1, Address(post(buf, 16))); 3256 subs(len, len, 32); 3257 crc32x(crc, crc, tmp0); 3258 ldr(tmp2, Address(post(buf, 8))); 3259 crc32x(crc, crc, tmp1); 3260 ldr(tmp3, Address(post(buf, 8))); 3261 crc32x(crc, crc, tmp2); 3262 crc32x(crc, crc, tmp3); 3263 br(Assembler::GE, CRC_by32_loop); 3264 cmn(len, 32); 3265 br(Assembler::NE, CRC_less32); 3266 b(L_exit); 3267 3268 BIND(CRC_by4_loop); 3269 ldrw(tmp0, Address(post(buf, 4))); 3270 subs(len, len, 4); 3271 crc32w(crc, crc, tmp0); 3272 br(Assembler::GE, CRC_by4_loop); 3273 adds(len, len, 4); 3274 br(Assembler::LE, L_exit); 3275 BIND(CRC_by1_loop); 3276 ldrb(tmp0, Address(post(buf, 1))); 3277 subs(len, len, 1); 3278 crc32b(crc, crc, tmp0); 3279 br(Assembler::GT, CRC_by1_loop); 3280 b(L_exit); 3281 3282 BIND(CRC_by64_pre); 3283 sub(buf, buf, 8); 3284 ldp(tmp0, tmp1, Address(buf, 8)); 3285 crc32x(crc, crc, tmp0); 3286 ldr(tmp2, Address(buf, 24)); 3287 crc32x(crc, crc, tmp1); 3288 ldr(tmp3, Address(buf, 32)); 3289 crc32x(crc, crc, tmp2); 3290 
ldr(tmp0, Address(buf, 40)); 3291 crc32x(crc, crc, tmp3); 3292 ldr(tmp1, Address(buf, 48)); 3293 crc32x(crc, crc, tmp0); 3294 ldr(tmp2, Address(buf, 56)); 3295 crc32x(crc, crc, tmp1); 3296 ldr(tmp3, Address(pre(buf, 64))); 3297 3298 b(CRC_by64_loop); 3299 3300 align(CodeEntryAlignment); 3301 BIND(CRC_by64_loop); 3302 subs(len, len, 64); 3303 crc32x(crc, crc, tmp2); 3304 ldr(tmp0, Address(buf, 8)); 3305 crc32x(crc, crc, tmp3); 3306 ldr(tmp1, Address(buf, 16)); 3307 crc32x(crc, crc, tmp0); 3308 ldr(tmp2, Address(buf, 24)); 3309 crc32x(crc, crc, tmp1); 3310 ldr(tmp3, Address(buf, 32)); 3311 crc32x(crc, crc, tmp2); 3312 ldr(tmp0, Address(buf, 40)); 3313 crc32x(crc, crc, tmp3); 3314 ldr(tmp1, Address(buf, 48)); 3315 crc32x(crc, crc, tmp0); 3316 ldr(tmp2, Address(buf, 56)); 3317 crc32x(crc, crc, tmp1); 3318 ldr(tmp3, Address(pre(buf, 64))); 3319 br(Assembler::GE, CRC_by64_loop); 3320 3321 // post-loop 3322 crc32x(crc, crc, tmp2); 3323 crc32x(crc, crc, tmp3); 3324 3325 sub(len, len, 64); 3326 add(buf, buf, 8); 3327 cmn(len, 128); 3328 br(Assembler::NE, CRC_less64); 3329 BIND(L_exit); 3330 mvnw(crc, crc); 3331 } 3332 3333 /** 3334 * @param crc register containing existing CRC (32-bit) 3335 * @param buf register pointing to input byte buffer (byte*) 3336 * @param len register containing number of bytes 3337 * @param table register that will contain address of CRC table 3338 * @param tmp scratch register 3339 */ 3340 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, 3341 Register table0, Register table1, Register table2, Register table3, 3342 Register tmp, Register tmp2, Register tmp3) { 3343 Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit; 3344 unsigned long offset; 3345 3346 if (UseCRC32) { 3347 kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3); 3348 return; 3349 } 3350 3351 mvnw(crc, crc); 3352 3353 adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset); 3354 if (offset) add(table0, table0, offset); 3355 add(table1, table0, 1*256*sizeof(juint)); 3356 add(table2, table0, 2*256*sizeof(juint)); 3357 add(table3, table0, 3*256*sizeof(juint)); 3358 3359 if (UseNeon) { 3360 cmp(len, (u1)64); 3361 br(Assembler::LT, L_by16); 3362 eor(v16, T16B, v16, v16); 3363 3364 Label L_fold; 3365 3366 add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants 3367 3368 ld1(v0, v1, T2D, post(buf, 32)); 3369 ld1r(v4, T2D, post(tmp, 8)); 3370 ld1r(v5, T2D, post(tmp, 8)); 3371 ld1r(v6, T2D, post(tmp, 8)); 3372 ld1r(v7, T2D, post(tmp, 8)); 3373 mov(v16, T4S, 0, crc); 3374 3375 eor(v0, T16B, v0, v16); 3376 sub(len, len, 64); 3377 3378 BIND(L_fold); 3379 pmull(v22, T8H, v0, v5, T8B); 3380 pmull(v20, T8H, v0, v7, T8B); 3381 pmull(v23, T8H, v0, v4, T8B); 3382 pmull(v21, T8H, v0, v6, T8B); 3383 3384 pmull2(v18, T8H, v0, v5, T16B); 3385 pmull2(v16, T8H, v0, v7, T16B); 3386 pmull2(v19, T8H, v0, v4, T16B); 3387 pmull2(v17, T8H, v0, v6, T16B); 3388 3389 uzp1(v24, T8H, v20, v22); 3390 uzp2(v25, T8H, v20, v22); 3391 eor(v20, T16B, v24, v25); 3392 3393 uzp1(v26, T8H, v16, v18); 3394 uzp2(v27, T8H, v16, v18); 3395 eor(v16, T16B, v26, v27); 3396 3397 ushll2(v22, T4S, v20, T8H, 8); 3398 ushll(v20, T4S, v20, T4H, 8); 3399 3400 ushll2(v18, T4S, v16, T8H, 8); 3401 ushll(v16, T4S, v16, T4H, 8); 3402 3403 eor(v22, T16B, v23, v22); 3404 eor(v18, T16B, v19, v18); 3405 eor(v20, T16B, v21, v20); 3406 eor(v16, T16B, v17, v16); 3407 3408 uzp1(v17, T2D, v16, v20); 3409 uzp2(v21, T2D, v16, v20); 3410 eor(v17, T16B, v17, v21); 3411 3412 ushll2(v20, T2D, 
v17, T4S, 16); 3413 ushll(v16, T2D, v17, T2S, 16); 3414 3415 eor(v20, T16B, v20, v22); 3416 eor(v16, T16B, v16, v18); 3417 3418 uzp1(v17, T2D, v20, v16); 3419 uzp2(v21, T2D, v20, v16); 3420 eor(v28, T16B, v17, v21); 3421 3422 pmull(v22, T8H, v1, v5, T8B); 3423 pmull(v20, T8H, v1, v7, T8B); 3424 pmull(v23, T8H, v1, v4, T8B); 3425 pmull(v21, T8H, v1, v6, T8B); 3426 3427 pmull2(v18, T8H, v1, v5, T16B); 3428 pmull2(v16, T8H, v1, v7, T16B); 3429 pmull2(v19, T8H, v1, v4, T16B); 3430 pmull2(v17, T8H, v1, v6, T16B); 3431 3432 ld1(v0, v1, T2D, post(buf, 32)); 3433 3434 uzp1(v24, T8H, v20, v22); 3435 uzp2(v25, T8H, v20, v22); 3436 eor(v20, T16B, v24, v25); 3437 3438 uzp1(v26, T8H, v16, v18); 3439 uzp2(v27, T8H, v16, v18); 3440 eor(v16, T16B, v26, v27); 3441 3442 ushll2(v22, T4S, v20, T8H, 8); 3443 ushll(v20, T4S, v20, T4H, 8); 3444 3445 ushll2(v18, T4S, v16, T8H, 8); 3446 ushll(v16, T4S, v16, T4H, 8); 3447 3448 eor(v22, T16B, v23, v22); 3449 eor(v18, T16B, v19, v18); 3450 eor(v20, T16B, v21, v20); 3451 eor(v16, T16B, v17, v16); 3452 3453 uzp1(v17, T2D, v16, v20); 3454 uzp2(v21, T2D, v16, v20); 3455 eor(v16, T16B, v17, v21); 3456 3457 ushll2(v20, T2D, v16, T4S, 16); 3458 ushll(v16, T2D, v16, T2S, 16); 3459 3460 eor(v20, T16B, v22, v20); 3461 eor(v16, T16B, v16, v18); 3462 3463 uzp1(v17, T2D, v20, v16); 3464 uzp2(v21, T2D, v20, v16); 3465 eor(v20, T16B, v17, v21); 3466 3467 shl(v16, T2D, v28, 1); 3468 shl(v17, T2D, v20, 1); 3469 3470 eor(v0, T16B, v0, v16); 3471 eor(v1, T16B, v1, v17); 3472 3473 subs(len, len, 32); 3474 br(Assembler::GE, L_fold); 3475 3476 mov(crc, 0); 3477 mov(tmp, v0, T1D, 0); 3478 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3479 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3480 mov(tmp, v0, T1D, 1); 3481 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3482 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3483 mov(tmp, v1, T1D, 0); 3484 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3485 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3486 mov(tmp, v1, T1D, 1); 3487 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3488 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3489 3490 add(len, len, 32); 3491 } 3492 3493 BIND(L_by16); 3494 subs(len, len, 16); 3495 br(Assembler::GE, L_by16_loop); 3496 adds(len, len, 16-4); 3497 br(Assembler::GE, L_by4_loop); 3498 adds(len, len, 4); 3499 br(Assembler::GT, L_by1_loop); 3500 b(L_exit); 3501 3502 BIND(L_by4_loop); 3503 ldrw(tmp, Address(post(buf, 4))); 3504 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3); 3505 subs(len, len, 4); 3506 br(Assembler::GE, L_by4_loop); 3507 adds(len, len, 4); 3508 br(Assembler::LE, L_exit); 3509 BIND(L_by1_loop); 3510 subs(len, len, 1); 3511 ldrb(tmp, Address(post(buf, 1))); 3512 update_byte_crc32(crc, tmp, table0); 3513 br(Assembler::GT, L_by1_loop); 3514 b(L_exit); 3515 3516 align(CodeEntryAlignment); 3517 BIND(L_by16_loop); 3518 subs(len, len, 16); 3519 ldp(tmp, tmp3, Address(post(buf, 16))); 3520 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3521 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3522 update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false); 3523 update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true); 3524 br(Assembler::GE, L_by16_loop); 3525 adds(len, len, 16-4); 3526 br(Assembler::GE, 
L_by4_loop); 3527 adds(len, len, 4); 3528 br(Assembler::GT, L_by1_loop); 3529 BIND(L_exit); 3530 mvnw(crc, crc); 3531 } 3532 3533 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf, 3534 Register len, Register tmp0, Register tmp1, Register tmp2, 3535 Register tmp3) { 3536 Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit; 3537 assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3); 3538 3539 subs(len, len, 128); 3540 br(Assembler::GE, CRC_by64_pre); 3541 BIND(CRC_less64); 3542 adds(len, len, 128-32); 3543 br(Assembler::GE, CRC_by32_loop); 3544 BIND(CRC_less32); 3545 adds(len, len, 32-4); 3546 br(Assembler::GE, CRC_by4_loop); 3547 adds(len, len, 4); 3548 br(Assembler::GT, CRC_by1_loop); 3549 b(L_exit); 3550 3551 BIND(CRC_by32_loop); 3552 ldp(tmp0, tmp1, Address(post(buf, 16))); 3553 subs(len, len, 32); 3554 crc32cx(crc, crc, tmp0); 3555 ldr(tmp2, Address(post(buf, 8))); 3556 crc32cx(crc, crc, tmp1); 3557 ldr(tmp3, Address(post(buf, 8))); 3558 crc32cx(crc, crc, tmp2); 3559 crc32cx(crc, crc, tmp3); 3560 br(Assembler::GE, CRC_by32_loop); 3561 cmn(len, 32); 3562 br(Assembler::NE, CRC_less32); 3563 b(L_exit); 3564 3565 BIND(CRC_by4_loop); 3566 ldrw(tmp0, Address(post(buf, 4))); 3567 subs(len, len, 4); 3568 crc32cw(crc, crc, tmp0); 3569 br(Assembler::GE, CRC_by4_loop); 3570 adds(len, len, 4); 3571 br(Assembler::LE, L_exit); 3572 BIND(CRC_by1_loop); 3573 ldrb(tmp0, Address(post(buf, 1))); 3574 subs(len, len, 1); 3575 crc32cb(crc, crc, tmp0); 3576 br(Assembler::GT, CRC_by1_loop); 3577 b(L_exit); 3578 3579 BIND(CRC_by64_pre); 3580 sub(buf, buf, 8); 3581 ldp(tmp0, tmp1, Address(buf, 8)); 3582 crc32cx(crc, crc, tmp0); 3583 ldr(tmp2, Address(buf, 24)); 3584 crc32cx(crc, crc, tmp1); 3585 ldr(tmp3, Address(buf, 32)); 3586 crc32cx(crc, crc, tmp2); 3587 ldr(tmp0, Address(buf, 40)); 3588 crc32cx(crc, crc, tmp3); 3589 ldr(tmp1, Address(buf, 48)); 3590 crc32cx(crc, crc, tmp0); 3591 ldr(tmp2, Address(buf, 56)); 3592 crc32cx(crc, crc, tmp1); 3593 ldr(tmp3, Address(pre(buf, 64))); 3594 3595 b(CRC_by64_loop); 3596 3597 align(CodeEntryAlignment); 3598 BIND(CRC_by64_loop); 3599 subs(len, len, 64); 3600 crc32cx(crc, crc, tmp2); 3601 ldr(tmp0, Address(buf, 8)); 3602 crc32cx(crc, crc, tmp3); 3603 ldr(tmp1, Address(buf, 16)); 3604 crc32cx(crc, crc, tmp0); 3605 ldr(tmp2, Address(buf, 24)); 3606 crc32cx(crc, crc, tmp1); 3607 ldr(tmp3, Address(buf, 32)); 3608 crc32cx(crc, crc, tmp2); 3609 ldr(tmp0, Address(buf, 40)); 3610 crc32cx(crc, crc, tmp3); 3611 ldr(tmp1, Address(buf, 48)); 3612 crc32cx(crc, crc, tmp0); 3613 ldr(tmp2, Address(buf, 56)); 3614 crc32cx(crc, crc, tmp1); 3615 ldr(tmp3, Address(pre(buf, 64))); 3616 br(Assembler::GE, CRC_by64_loop); 3617 3618 // post-loop 3619 crc32cx(crc, crc, tmp2); 3620 crc32cx(crc, crc, tmp3); 3621 3622 sub(len, len, 64); 3623 add(buf, buf, 8); 3624 cmn(len, 128); 3625 br(Assembler::NE, CRC_less64); 3626 BIND(L_exit); 3627 } 3628 3629 /** 3630 * @param crc register containing existing CRC (32-bit) 3631 * @param buf register pointing to input byte buffer (byte*) 3632 * @param len register containing number of bytes 3633 * @param table register that will contain address of CRC table 3634 * @param tmp scratch register 3635 */ 3636 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len, 3637 Register table0, Register table1, Register table2, Register table3, 3638 Register tmp, Register tmp2, Register tmp3) { 3639 kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, 
table2, table3); 3640 } 3641 3642 3643 SkipIfEqual::SkipIfEqual( 3644 MacroAssembler* masm, const bool* flag_addr, bool value) { 3645 _masm = masm; 3646 unsigned long offset; 3647 _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset); 3648 _masm->ldrb(rscratch1, Address(rscratch1, offset)); 3649 _masm->cbzw(rscratch1, _label); 3650 } 3651 3652 SkipIfEqual::~SkipIfEqual() { 3653 _masm->bind(_label); 3654 } 3655 3656 void MacroAssembler::addptr(const Address &dst, int32_t src) { 3657 Address adr; 3658 switch(dst.getMode()) { 3659 case Address::base_plus_offset: 3660 // This is the expected mode, although we allow all the other 3661 // forms below. 3662 adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord); 3663 break; 3664 default: 3665 lea(rscratch2, dst); 3666 adr = Address(rscratch2); 3667 break; 3668 } 3669 ldr(rscratch1, adr); 3670 add(rscratch1, rscratch1, src); 3671 str(rscratch1, adr); 3672 } 3673 3674 void MacroAssembler::cmpptr(Register src1, Address src2) { 3675 unsigned long offset; 3676 adrp(rscratch1, src2, offset); 3677 ldr(rscratch1, Address(rscratch1, offset)); 3678 cmp(src1, rscratch1); 3679 } 3680 3681 void MacroAssembler::cmpoop(Register obj1, Register obj2) { 3682 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3683 bs->obj_equals(this, obj1, obj2); 3684 } 3685 3686 void MacroAssembler::load_klass(Register dst, Register src) { 3687 if (UseCompressedClassPointers) { 3688 ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes())); 3689 decode_klass_not_null(dst); 3690 } else { 3691 ldr(dst, Address(src, oopDesc::klass_offset_in_bytes())); 3692 } 3693 } 3694 3695 // ((OopHandle)result).resolve(); 3696 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) { 3697 // OopHandle::resolve is an indirection. 3698 access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg); 3699 } 3700 3701 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) { 3702 const int mirror_offset = in_bytes(Klass::java_mirror_offset()); 3703 ldr(dst, Address(rmethod, Method::const_offset())); 3704 ldr(dst, Address(dst, ConstMethod::constants_offset())); 3705 ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes())); 3706 ldr(dst, Address(dst, mirror_offset)); 3707 resolve_oop_handle(dst, tmp); 3708 } 3709 3710 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) { 3711 if (UseCompressedClassPointers) { 3712 ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes())); 3713 if (CompressedKlassPointers::base() == NULL) { 3714 cmp(trial_klass, tmp, LSL, CompressedKlassPointers::shift()); 3715 return; 3716 } else if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0 3717 && CompressedKlassPointers::shift() == 0) { 3718 // Only the bottom 32 bits matter 3719 cmpw(trial_klass, tmp); 3720 return; 3721 } 3722 decode_klass_not_null(tmp); 3723 } else { 3724 ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes())); 3725 } 3726 cmp(trial_klass, tmp); 3727 } 3728 3729 void MacroAssembler::load_prototype_header(Register dst, Register src) { 3730 load_klass(dst, src); 3731 ldr(dst, Address(dst, Klass::prototype_header_offset())); 3732 } 3733 3734 void MacroAssembler::store_klass(Register dst, Register src) { 3735 // FIXME: Should this be a store release? Concurrent GCs assume the 3736 // klass length is valid if the klass field is not null.
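  // A hedged C-style sketch of what the two paths below emit (klass_offset
  // stands for oopDesc::klass_offset_in_bytes(); the narrowing itself is
  // done by encode_klass_not_null, defined further down in this file):
  //
  //   if (UseCompressedClassPointers)
  //     *(narrowKlass*)(dst + klass_offset) = encode(src);  // 32-bit strw
  //   else
  //     *(Klass**)(dst + klass_offset) = src;               // 64-bit str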
3737 if (UseCompressedClassPointers) { 3738 encode_klass_not_null(src); 3739 strw(src, Address(dst, oopDesc::klass_offset_in_bytes())); 3740 } else { 3741 str(src, Address(dst, oopDesc::klass_offset_in_bytes())); 3742 } 3743 } 3744 3745 void MacroAssembler::store_klass_gap(Register dst, Register src) { 3746 if (UseCompressedClassPointers) { 3747 // Store to klass gap in destination 3748 strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes())); 3749 } 3750 } 3751 3752 // Algorithm must match CompressedOops::encode. 3753 void MacroAssembler::encode_heap_oop(Register d, Register s) { 3754 #ifdef ASSERT 3755 verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?"); 3756 #endif 3757 verify_oop(s, "broken oop in encode_heap_oop"); 3758 if (CompressedOops::base() == NULL) { 3759 if (CompressedOops::shift() != 0) { 3760 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 3761 lsr(d, s, LogMinObjAlignmentInBytes); 3762 } else { 3763 mov(d, s); 3764 } 3765 } else { 3766 subs(d, s, rheapbase); 3767 csel(d, d, zr, Assembler::HS); 3768 lsr(d, d, LogMinObjAlignmentInBytes); 3769 3770 /* Old algorithm: is this any worse? 3771 Label nonnull; 3772 cbnz(r, nonnull); 3773 sub(r, r, rheapbase); 3774 bind(nonnull); 3775 lsr(r, r, LogMinObjAlignmentInBytes); 3776 */ 3777 } 3778 } 3779 3780 void MacroAssembler::encode_heap_oop_not_null(Register r) { 3781 #ifdef ASSERT 3782 verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?"); 3783 if (CheckCompressedOops) { 3784 Label ok; 3785 cbnz(r, ok); 3786 stop("null oop passed to encode_heap_oop_not_null"); 3787 bind(ok); 3788 } 3789 #endif 3790 verify_oop(r, "broken oop in encode_heap_oop_not_null"); 3791 if (CompressedOops::base() != NULL) { 3792 sub(r, r, rheapbase); 3793 } 3794 if (CompressedOops::shift() != 0) { 3795 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 3796 lsr(r, r, LogMinObjAlignmentInBytes); 3797 } 3798 } 3799 3800 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) { 3801 #ifdef ASSERT 3802 verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?"); 3803 if (CheckCompressedOops) { 3804 Label ok; 3805 cbnz(src, ok); 3806 stop("null oop passed to encode_heap_oop_not_null2"); 3807 bind(ok); 3808 } 3809 #endif 3810 verify_oop(src, "broken oop in encode_heap_oop_not_null2"); 3811 3812 Register data = src; 3813 if (CompressedOops::base() != NULL) { 3814 sub(dst, src, rheapbase); 3815 data = dst; 3816 } 3817 if (CompressedOops::shift() != 0) { 3818 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 3819 lsr(dst, data, LogMinObjAlignmentInBytes); 3820 data = dst; 3821 } 3822 if (data == src) 3823 mov(dst, src); 3824 } 3825 3826 void MacroAssembler::decode_heap_oop(Register d, Register s) { 3827 #ifdef ASSERT 3828 verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?"); 3829 #endif 3830 if (CompressedOops::base() == NULL) { 3831 if (CompressedOops::shift() != 0 || d != s) { 3832 lsl(d, s, CompressedOops::shift()); 3833 } 3834 } else { 3835 Label done; 3836 if (d != s) 3837 mov(d, s); 3838 cbz(s, done); 3839 add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes); 3840 bind(done); 3841 } 3842 verify_oop(d, "broken oop in decode_heap_oop"); 3843 } 3844 3845 void MacroAssembler::decode_heap_oop_not_null(Register r) { 3846 assert (UseCompressedOops, "should only be used for compressed headers"); 3847 assert (Universe::heap() != NULL, "java heap 
should be initialized"); 3848 // Cannot assert, unverified entry point counts instructions (see .ad file) 3849 // vtableStubs also counts instructions in pd_code_size_limit. 3850 // Also do not verify_oop as this is called by verify_oop. 3851 if (CompressedOops::shift() != 0) { 3852 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 3853 if (CompressedOops::base() != NULL) { 3854 add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes); 3855 } else { 3856 add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes); 3857 } 3858 } else { 3859 assert (CompressedOops::base() == NULL, "sanity"); 3860 } 3861 } 3862 3863 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) { 3864 assert (UseCompressedOops, "should only be used for compressed headers"); 3865 assert (Universe::heap() != NULL, "java heap should be initialized"); 3866 // Cannot assert, unverified entry point counts instructions (see .ad file) 3867 // vtableStubs also counts instructions in pd_code_size_limit. 3868 // Also do not verify_oop as this is called by verify_oop. 3869 if (CompressedOops::shift() != 0) { 3870 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 3871 if (CompressedOops::base() != NULL) { 3872 add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes); 3873 } else { 3874 add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes); 3875 } 3876 } else { 3877 assert (CompressedOops::base() == NULL, "sanity"); 3878 if (dst != src) { 3879 mov(dst, src); 3880 } 3881 } 3882 } 3883 3884 void MacroAssembler::encode_klass_not_null(Register dst, Register src) { 3885 if (CompressedKlassPointers::base() == NULL) { 3886 if (CompressedKlassPointers::shift() != 0) { 3887 assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong"); 3888 lsr(dst, src, LogKlassAlignmentInBytes); 3889 } else { 3890 if (dst != src) mov(dst, src); 3891 } 3892 return; 3893 } 3894 3895 if (use_XOR_for_compressed_class_base) { 3896 if (CompressedKlassPointers::shift() != 0) { 3897 eor(dst, src, (uint64_t)CompressedKlassPointers::base()); 3898 lsr(dst, dst, LogKlassAlignmentInBytes); 3899 } else { 3900 eor(dst, src, (uint64_t)CompressedKlassPointers::base()); 3901 } 3902 return; 3903 } 3904 3905 if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0 3906 && CompressedKlassPointers::shift() == 0) { 3907 movw(dst, src); 3908 return; 3909 } 3910 3911 #ifdef ASSERT 3912 verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?"); 3913 #endif 3914 3915 Register rbase = dst; 3916 if (dst == src) rbase = rheapbase; 3917 mov(rbase, (uint64_t)CompressedKlassPointers::base()); 3918 sub(dst, src, rbase); 3919 if (CompressedKlassPointers::shift() != 0) { 3920 assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong"); 3921 lsr(dst, dst, LogKlassAlignmentInBytes); 3922 } 3923 if (dst == src) reinit_heapbase(); 3924 } 3925 3926 void MacroAssembler::encode_klass_not_null(Register r) { 3927 encode_klass_not_null(r, r); 3928 } 3929 3930 void MacroAssembler::decode_klass_not_null(Register dst, Register src) { 3931 Register rbase = dst; 3932 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 3933 3934 if (CompressedKlassPointers::base() == NULL) { 3935 if (CompressedKlassPointers::shift() != 0) { 3936 assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong"); 3937 lsl(dst, src, LogKlassAlignmentInBytes); 3938 } else { 3939 if 
(dst != src) mov(dst, src); 3940 } 3941 return; 3942 } 3943 3944 if (use_XOR_for_compressed_class_base) { 3945 if (CompressedKlassPointers::shift() != 0) { 3946 lsl(dst, src, LogKlassAlignmentInBytes); 3947 eor(dst, dst, (uint64_t)CompressedKlassPointers::base()); 3948 } else { 3949 eor(dst, src, (uint64_t)CompressedKlassPointers::base()); 3950 } 3951 return; 3952 } 3953 3954 if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0 3955 && CompressedKlassPointers::shift() == 0) { 3956 if (dst != src) 3957 movw(dst, src); 3958 movk(dst, (uint64_t)CompressedKlassPointers::base() >> 32, 32); 3959 return; 3960 } 3961 3962 // Cannot assert, unverified entry point counts instructions (see .ad file) 3963 // vtableStubs also counts instructions in pd_code_size_limit. 3964 // Also do not verify_oop as this is called by verify_oop. 3965 if (dst == src) rbase = rheapbase; 3966 mov(rbase, (uint64_t)CompressedKlassPointers::base()); 3967 if (CompressedKlassPointers::shift() != 0) { 3968 assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong"); 3969 add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes); 3970 } else { 3971 add(dst, rbase, src); 3972 } 3973 if (dst == src) reinit_heapbase(); 3974 } 3975 3976 void MacroAssembler::decode_klass_not_null(Register r) { 3977 decode_klass_not_null(r, r); 3978 } 3979 3980 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) { 3981 #ifdef ASSERT 3982 { 3983 ThreadInVMfromUnknown tiv; 3984 assert (UseCompressedOops, "should only be used for compressed oops"); 3985 assert (Universe::heap() != NULL, "java heap should be initialized"); 3986 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 3987 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop"); 3988 } 3989 #endif 3990 int oop_index = oop_recorder()->find_index(obj); 3991 InstructionMark im(this); 3992 RelocationHolder rspec = oop_Relocation::spec(oop_index); 3993 code_section()->relocate(inst_mark(), rspec); 3994 movz(dst, 0xDEAD, 16); 3995 movk(dst, 0xBEEF); 3996 } 3997 3998 void MacroAssembler::set_narrow_klass(Register dst, Klass* k) { 3999 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 4000 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 4001 int index = oop_recorder()->find_index(k); 4002 assert(! 
Universe::heap()->is_in_reserved(k), "should not be an oop"); 4003 4004 InstructionMark im(this); 4005 RelocationHolder rspec = metadata_Relocation::spec(index); 4006 code_section()->relocate(inst_mark(), rspec); 4007 narrowKlass nk = CompressedKlassPointers::encode(k); 4008 movz(dst, (nk >> 16), 16); 4009 movk(dst, nk & 0xffff); 4010 } 4011 4012 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, 4013 Register dst, Address src, 4014 Register tmp1, Register thread_tmp) { 4015 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 4016 decorators = AccessInternal::decorator_fixup(decorators); 4017 bool as_raw = (decorators & AS_RAW) != 0; 4018 if (as_raw) { 4019 bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp); 4020 } else { 4021 bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp); 4022 } 4023 } 4024 4025 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, 4026 Address dst, Register src, 4027 Register tmp1, Register thread_tmp) { 4028 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 4029 decorators = AccessInternal::decorator_fixup(decorators); 4030 bool as_raw = (decorators & AS_RAW) != 0; 4031 if (as_raw) { 4032 bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp); 4033 } else { 4034 bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp); 4035 } 4036 } 4037 4038 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) { 4039 // Use stronger ACCESS_WRITE|ACCESS_READ by default. 4040 if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) { 4041 decorators |= ACCESS_READ | ACCESS_WRITE; 4042 } 4043 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 4044 return bs->resolve(this, decorators, obj); 4045 } 4046 4047 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1, 4048 Register thread_tmp, DecoratorSet decorators) { 4049 access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp); 4050 } 4051 4052 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1, 4053 Register thread_tmp, DecoratorSet decorators) { 4054 access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp); 4055 } 4056 4057 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1, 4058 Register thread_tmp, DecoratorSet decorators) { 4059 access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp); 4060 } 4061 4062 // Used for storing NULLs. 4063 void MacroAssembler::store_heap_oop_null(Address dst) { 4064 access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg); 4065 } 4066 4067 Address MacroAssembler::allocate_metadata_address(Metadata* obj) { 4068 assert(oop_recorder() != NULL, "this assembler needs a Recorder"); 4069 int index = oop_recorder()->allocate_metadata_index(obj); 4070 RelocationHolder rspec = metadata_Relocation::spec(index); 4071 return Address((address)obj, rspec); 4072 } 4073 4074 // Move an oop into a register. immediate is true if we want 4075 // immediate instructions, i.e. we are not going to patch this 4076 // instruction while the code is being executed by another thread. In 4077 // that case we can use move immediates rather than the constant pool.
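// (Roughly, and not literal output: the immediate path materializes the oop
// bits with a movz/movk sequence, while the non-immediate path emits
// "ldr dst, <pc-relative constant pool slot>", whose slot can be patched
// atomically while other threads are executing the code.)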
4078 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) { 4079 int oop_index; 4080 if (obj == NULL) { 4081 oop_index = oop_recorder()->allocate_oop_index(obj); 4082 } else { 4083 #ifdef ASSERT 4084 { 4085 ThreadInVMfromUnknown tiv; 4086 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop"); 4087 } 4088 #endif 4089 oop_index = oop_recorder()->find_index(obj); 4090 } 4091 RelocationHolder rspec = oop_Relocation::spec(oop_index); 4092 if (! immediate) { 4093 address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address 4094 ldr_constant(dst, Address(dummy, rspec)); 4095 } else 4096 mov(dst, Address((address)obj, rspec)); 4097 } 4098 4099 // Move a metadata address into a register. 4100 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) { 4101 int oop_index; 4102 if (obj == NULL) { 4103 oop_index = oop_recorder()->allocate_metadata_index(obj); 4104 } else { 4105 oop_index = oop_recorder()->find_index(obj); 4106 } 4107 RelocationHolder rspec = metadata_Relocation::spec(oop_index); 4108 mov(dst, Address((address)obj, rspec)); 4109 } 4110 4111 Address MacroAssembler::constant_oop_address(jobject obj) { 4112 #ifdef ASSERT 4113 { 4114 ThreadInVMfromUnknown tiv; 4115 assert(oop_recorder() != NULL, "this assembler needs an OopRecorder"); 4116 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop"); 4117 } 4118 #endif 4119 int oop_index = oop_recorder()->find_index(obj); 4120 return Address((address)obj, oop_Relocation::spec(oop_index)); 4121 } 4122 4123 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes. 4124 void MacroAssembler::tlab_allocate(Register obj, 4125 Register var_size_in_bytes, 4126 int con_size_in_bytes, 4127 Register t1, 4128 Register t2, 4129 Label& slow_case) { 4130 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 4131 bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case); 4132 } 4133 4134 // Defines obj, preserves var_size_in_bytes 4135 void MacroAssembler::eden_allocate(Register obj, 4136 Register var_size_in_bytes, 4137 int con_size_in_bytes, 4138 Register t1, 4139 Label& slow_case) { 4140 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 4141 bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case); 4142 } 4143 4144 // Zero words; len is in bytes 4145 // Destroys all registers except addr 4146 // len must be a nonzero multiple of wordSize 4147 void MacroAssembler::zero_memory(Register addr, Register len, Register t1) { 4148 assert_different_registers(addr, len, t1, rscratch1, rscratch2); 4149 4150 #ifdef ASSERT 4151 { Label L; 4152 tst(len, BytesPerWord - 1); 4153 br(Assembler::EQ, L); 4154 stop("len is not a multiple of BytesPerWord"); 4155 bind(L); 4156 } 4157 #endif 4158 4159 #ifndef PRODUCT 4160 block_comment("zero memory"); 4161 #endif 4162 4163 Label loop; 4164 Label entry; 4165 4166 // Algorithm: 4167 // 4168 // scratch1 = cnt & 7; 4169 // cnt -= scratch1; 4170 // p += scratch1; 4171 // switch (scratch1) { 4172 // do { 4173 // cnt -= 8; 4174 // p[-8] = 0; 4175 // case 7: 4176 // p[-7] = 0; 4177 // case 6: 4178 // p[-6] = 0; 4179 // // ... 
4180 // case 1: 4181 // p[-1] = 0; 4182 // case 0: 4183 // p += 8; 4184 // } while (cnt); 4185 // } 4186 4187 const int unroll = 8; // Number of str(zr) instructions we'll unroll 4188 4189 lsr(len, len, LogBytesPerWord); 4190 andr(rscratch1, len, unroll - 1); // tmp1 = cnt % unroll 4191 sub(len, len, rscratch1); // cnt -= cnt % unroll 4192 // t1 always points to the end of the region we're about to zero 4193 add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord); 4194 adr(rscratch2, entry); 4195 sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2); 4196 br(rscratch2); 4197 bind(loop); 4198 sub(len, len, unroll); 4199 for (int i = -unroll; i < 0; i++) 4200 Assembler::str(zr, Address(t1, i * wordSize)); 4201 bind(entry); 4202 add(t1, t1, unroll * wordSize); 4203 cbnz(len, loop); 4204 } 4205 4206 void MacroAssembler::verify_tlab() { 4207 #ifdef ASSERT 4208 if (UseTLAB && VerifyOops) { 4209 Label next, ok; 4210 4211 stp(rscratch2, rscratch1, Address(pre(sp, -16))); 4212 4213 ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset()))); 4214 ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset()))); 4215 cmp(rscratch2, rscratch1); 4216 br(Assembler::HS, next); 4217 STOP("assert(top >= start)"); 4218 should_not_reach_here(); 4219 4220 bind(next); 4221 ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset()))); 4222 ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset()))); 4223 cmp(rscratch2, rscratch1); 4224 br(Assembler::HS, ok); 4225 STOP("assert(top <= end)"); 4226 should_not_reach_here(); 4227 4228 bind(ok); 4229 ldp(rscratch2, rscratch1, Address(post(sp, 16))); 4230 } 4231 #endif 4232 } 4233 4234 // Writes to successive stack pages until the given offset is reached, to 4235 // check for stack overflow plus shadow pages. This clobbers tmp. 4236 void MacroAssembler::bang_stack_size(Register size, Register tmp) { 4237 assert_different_registers(tmp, size, rscratch1); 4238 mov(tmp, sp); 4239 // Bang stack for total size given plus shadow page size. 4240 // Bang one page at a time because large size can bang beyond yellow and 4241 // red zones. 4242 Label loop; 4243 mov(rscratch1, os::vm_page_size()); 4244 bind(loop); 4245 lea(tmp, Address(tmp, -os::vm_page_size())); 4246 subsw(size, size, rscratch1); 4247 str(size, Address(tmp)); 4248 br(Assembler::GT, loop); 4249 4250 // Bang down shadow pages too. 4251 // At this point, (tmp-0) is the last address touched, so don't 4252 // touch it again. (It was touched as (tmp-pagesize) but then tmp 4253 // was post-decremented.) Skip this address by starting at i=1, and 4254 // touch a few more pages below. N.B. It is important to touch all 4255 // the way down to and including i=StackShadowPages. 4256 for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) { 4257 // This could be any sized move, but it can serve as a debugging crumb, 4258 // so the bigger the better. 4259 lea(tmp, Address(tmp, -os::vm_page_size())); 4260 str(size, Address(tmp)); 4261 } 4262 } 4263 4264 4265 // Move the address of the polling page into dest.
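// With thread-local polling this is a single load from the current thread;
// otherwise it is an adrp of the global page, which must therefore be page
// aligned. In effect (sketch, not literal output):
//   ldr  dest, [rthread, #polling_page_offset]   // thread-local poll
//   adrp dest, <polling page>                    // global poll, offset == 0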
4266 void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) { 4267 if (SafepointMechanism::uses_thread_local_poll()) { 4268 ldr(dest, Address(rthread, Thread::polling_page_offset())); 4269 } else { 4270 unsigned long off; 4271 adrp(dest, Address(page, rtype), off); 4272 assert(off == 0, "polling page must be page aligned"); 4273 } 4274 } 4275 4276 // Move the address of the polling page into r, then read the polling 4277 // page. 4278 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) { 4279 get_polling_page(r, page, rtype); 4280 return read_polling_page(r, rtype); 4281 } 4282 4283 // Read the polling page. The address of the polling page must 4284 // already be in r. 4285 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) { 4286 InstructionMark im(this); 4287 code_section()->relocate(inst_mark(), rtype); 4288 ldrw(zr, Address(r, 0)); 4289 return inst_mark(); 4290 } 4291 4292 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) { 4293 relocInfo::relocType rtype = dest.rspec().reloc()->type(); 4294 unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12; 4295 unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12; 4296 unsigned long dest_page = (unsigned long)dest.target() >> 12; 4297 long offset_low = dest_page - low_page; 4298 long offset_high = dest_page - high_page; 4299 4300 assert(is_valid_AArch64_address(dest.target()), "bad address"); 4301 assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address"); 4302 4303 InstructionMark im(this); 4304 code_section()->relocate(inst_mark(), dest.rspec()); 4305 // 8143067: Ensure that the adrp can reach the dest from anywhere within 4306 // the code cache so that if it is relocated we know it will still reach 4307 if (offset_high >= -(1<<20) && offset_low < (1<<20)) { 4308 _adrp(reg1, dest.target()); 4309 } else { 4310 unsigned long target = (unsigned long)dest.target(); 4311 unsigned long adrp_target 4312 = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL); 4313 4314 _adrp(reg1, (address)adrp_target); 4315 movk(reg1, target >> 32, 32); 4316 } 4317 byte_offset = (unsigned long)dest.target() & 0xfff; 4318 } 4319 4320 void MacroAssembler::load_byte_map_base(Register reg) { 4321 CardTable::CardValue* byte_map_base = 4322 ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base(); 4323 4324 if (is_valid_AArch64_address((address)byte_map_base)) { 4325 // Strictly speaking the byte_map_base isn't an address at all, 4326 // and it might even be negative. 4327 unsigned long offset; 4328 adrp(reg, ExternalAddress((address)byte_map_base), offset); 4329 // We expect offset to be zero with most collectors. 
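    // (For reference: card marking effectively does
    //    byte_map_base[heap_address >> card_shift] = dirty;
    //  so byte_map_base only needs to make that indexed store land inside
    //  the real card table; it need not itself be a mapped address.)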
4330 if (offset != 0) { 4331 add(reg, reg, offset); 4332 } 4333 } else { 4334 mov(reg, (uint64_t)byte_map_base); 4335 } 4336 } 4337 4338 void MacroAssembler::build_frame(int framesize) { 4339 assert(framesize > 0, "framesize must be > 0"); 4340 if (framesize < ((1 << 9) + 2 * wordSize)) { 4341 sub(sp, sp, framesize); 4342 stp(rfp, lr, Address(sp, framesize - 2 * wordSize)); 4343 if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize); 4344 } else { 4345 stp(rfp, lr, Address(pre(sp, -2 * wordSize))); 4346 if (PreserveFramePointer) mov(rfp, sp); 4347 if (framesize < ((1 << 12) + 2 * wordSize)) 4348 sub(sp, sp, framesize - 2 * wordSize); 4349 else { 4350 mov(rscratch1, framesize - 2 * wordSize); 4351 sub(sp, sp, rscratch1); 4352 } 4353 } 4354 } 4355 4356 void MacroAssembler::remove_frame(int framesize) { 4357 assert(framesize > 0, "framesize must be > 0"); 4358 if (framesize < ((1 << 9) + 2 * wordSize)) { 4359 ldp(rfp, lr, Address(sp, framesize - 2 * wordSize)); 4360 add(sp, sp, framesize); 4361 } else { 4362 if (framesize < ((1 << 12) + 2 * wordSize)) 4363 add(sp, sp, framesize - 2 * wordSize); 4364 else { 4365 mov(rscratch1, framesize - 2 * wordSize); 4366 add(sp, sp, rscratch1); 4367 } 4368 ldp(rfp, lr, Address(post(sp, 2 * wordSize))); 4369 } 4370 } 4371 4372 #ifdef COMPILER2 4373 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 4374 4375 // Search for str1 in str2 and return the index, or -1 4376 void MacroAssembler::string_indexof(Register str2, Register str1, 4377 Register cnt2, Register cnt1, 4378 Register tmp1, Register tmp2, 4379 Register tmp3, Register tmp4, 4380 Register tmp5, Register tmp6, 4381 int icnt1, Register result, int ae) { 4382 // NOTE: tmp5, tmp6 can be zr depending on specific method version 4383 Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH; 4384 4385 Register ch1 = rscratch1; 4386 Register ch2 = rscratch2; 4387 Register cnt1tmp = tmp1; 4388 Register cnt2tmp = tmp2; 4389 Register cnt1_neg = cnt1; 4390 Register cnt2_neg = cnt2; 4391 Register result_tmp = tmp4; 4392 4393 bool isL = ae == StrIntrinsicNode::LL; 4394 4395 bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL; 4396 bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU; 4397 int str1_chr_shift = str1_isL ? 0:1; 4398 int str2_chr_shift = str2_isL ? 0:1; 4399 int str1_chr_size = str1_isL ? 1:2; 4400 int str2_chr_size = str2_isL ? 1:2; 4401 chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb : 4402 (chr_insn)&MacroAssembler::ldrh; 4403 chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb : 4404 (chr_insn)&MacroAssembler::ldrh; 4405 chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw; 4406 chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr; 4407 4408 // Note, inline_string_indexOf() generates checks: 4409 // if (substr.count > string.count) return -1; 4410 // if (substr.count == 0) return 0; 4411 4412 // We have two strings, a source string in str2, cnt2 and a pattern string 4413 // in str1, cnt1. Find the first occurrence of the pattern in the source, or return -1. 4414 4415 // For a larger pattern and source we use a simplified Boyer-Moore algorithm. 4416 // With a small pattern and source we use a linear scan.
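// icnt1 below is the pattern length when it is known at compile time, or -1
// when it is only known at runtime; the constant-length cases (1, 2, 3 and
// 4 characters) are emitted without any of the Boyer-Moore machinery.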
4417 4418 if (icnt1 == -1) { 4419 sub(result_tmp, cnt2, cnt1); 4420 cmp(cnt1, (u1)8); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256 4421 br(LT, LINEARSEARCH); 4422 dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty 4423 subs(zr, cnt1, 256); 4424 lsr(tmp1, cnt2, 2); 4425 ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM 4426 br(GE, LINEARSTUB); 4427 } 4428 4429 // The Boyer-Moore algorithm is based on the description here: 4430 // 4431 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm 4432 // 4433 // This describes an algorithm with two shift rules: the 'Bad Character' rule 4434 // and the 'Good Suffix' rule. 4435 // 4436 // These rules are essentially heuristics for how far we can shift the 4437 // pattern along the search string. 4438 // 4439 // The implementation here uses the 'Bad Character' rule only because of the 4440 // complexity of initialisation for the 'Good Suffix' rule. 4441 // 4442 // This is also known as the Boyer-Moore-Horspool algorithm: 4443 // 4444 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm 4445 // 4446 // This particular implementation has a few Java-specific optimizations. 4447 // 4448 // #define ASIZE 256 4449 // 4450 // int bm(unsigned char *x, int m, unsigned char *y, int n) { 4451 // int i, j; 4452 // unsigned c; 4453 // unsigned char bc[ASIZE]; 4454 // 4455 // /* Preprocessing */ 4456 // for (i = 0; i < ASIZE; ++i) 4457 // bc[i] = m; 4458 // for (i = 0; i < m - 1; ) { 4459 // c = x[i]; 4460 // ++i; 4461 // // c < 256 for a Latin1 string, so no need for a branch 4462 // #ifdef PATTERN_STRING_IS_LATIN1 4463 // bc[c] = m - i; 4464 // #else 4465 // if (c < ASIZE) bc[c] = m - i; 4466 // #endif 4467 // } 4468 // 4469 // /* Searching */ 4470 // j = 0; 4471 // while (j <= n - m) { 4472 // c = y[j+m-1]; 4473 // if (x[m-1] == c) 4474 // for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i); 4475 // if (i < 0) return j; 4476 // // c < 256 for a Latin1 string, so no need for a branch 4477 // #ifdef SOURCE_STRING_IS_LATIN1 4478 // // LL case: (c < 256) always true. Remove branch 4479 // j += bc[y[j+m-1]]; 4480 // #endif 4481 // #ifndef PATTERN_STRING_IS_UTF 4482 // // UU case: need if (c<ASIZE) check. Skip 1 character if not. 4483 // if (c < ASIZE) 4484 // j += bc[y[j+m-1]]; 4485 // else 4486 // j += 1 4487 // #endif 4488 // #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF 4489 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not. 4490 // if (c < ASIZE) 4491 // j += bc[y[j+m-1]]; 4492 // else 4493 // j += m 4494 // #endif 4495 // } 4496 // } 4497 4498 if (icnt1 == -1) { 4499 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH, 4500 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP; 4501 Register cnt1end = tmp2; 4502 Register str2end = cnt2; 4503 Register skipch = tmp2; 4504 4505 // str1 length is >= 8, so we can read at least 1 register for the cases when 4506 // UTF->Latin1 conversion is not needed (8 LL or 4 UU symbols) and half a register 4507 // for the UL case. We'll re-read the last character in the inner pre-loop code to 4508 // have a single outer pre-loop load 4509 const int firstStep = isL ?
7 : 3; 4510 4511 const int ASIZE = 256; 4512 const int STORED_BYTES = 32; // amount of bytes stored per instruction 4513 sub(sp, sp, ASIZE); 4514 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations 4515 mov(ch1, sp); 4516 BIND(BM_INIT_LOOP); 4517 stpq(v0, v0, Address(post(ch1, STORED_BYTES))); 4518 subs(tmp5, tmp5, 1); 4519 br(GT, BM_INIT_LOOP); 4520 4521 sub(cnt1tmp, cnt1, 1); 4522 mov(tmp5, str2); 4523 add(str2end, str2, result_tmp, LSL, str2_chr_shift); 4524 sub(ch2, cnt1, 1); 4525 mov(tmp3, str1); 4526 BIND(BCLOOP); 4527 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size))); 4528 if (!str1_isL) { 4529 subs(zr, ch1, ASIZE); 4530 br(HS, BCSKIP); 4531 } 4532 strb(ch2, Address(sp, ch1)); 4533 BIND(BCSKIP); 4534 subs(ch2, ch2, 1); 4535 br(GT, BCLOOP); 4536 4537 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1 4538 if (str1_isL == str2_isL) { 4539 // load last 8 bytes (8LL/4UU symbols) 4540 ldr(tmp6, Address(tmp6, -wordSize)); 4541 } else { 4542 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols) 4543 // convert Latin1 to UTF. We'll have to wait until load completed, but 4544 // it's still faster than per-character loads+checks 4545 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1] 4546 ubfx(ch1, tmp6, 8, 8); // str1[N-2] 4547 ubfx(ch2, tmp6, 16, 8); // str1[N-3] 4548 andr(tmp6, tmp6, 0xFF); // str1[N-4] 4549 orr(ch2, ch1, ch2, LSL, 16); 4550 orr(tmp6, tmp6, tmp3, LSL, 48); 4551 orr(tmp6, tmp6, ch2, LSL, 16); 4552 } 4553 BIND(BMLOOPSTR2); 4554 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 4555 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8 4556 if (str1_isL == str2_isL) { 4557 // re-init tmp3. It's for free because it's executed in parallel with 4558 // load above. Alternative is to initialize it before loop, but it'll 4559 // affect performance on in-order systems with 2 or more ld/st pipelines 4560 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size)); 4561 } 4562 if (!isL) { // UU/UL case 4563 lsl(ch2, cnt1tmp, 1); // offset in bytes 4564 } 4565 cmp(tmp3, skipch); 4566 br(NE, BMSKIP); 4567 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2)); 4568 mov(ch1, tmp6); 4569 if (isL) { 4570 b(BMLOOPSTR1_AFTER_LOAD); 4571 } else { 4572 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. 
cnt1 >= 8 4573 b(BMLOOPSTR1_CMP); 4574 } 4575 BIND(BMLOOPSTR1); 4576 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 4577 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 4578 BIND(BMLOOPSTR1_AFTER_LOAD); 4579 subs(cnt1tmp, cnt1tmp, 1); 4580 br(LT, BMLOOPSTR1_LASTCMP); 4581 BIND(BMLOOPSTR1_CMP); 4582 cmp(ch1, ch2); 4583 br(EQ, BMLOOPSTR1); 4584 BIND(BMSKIP); 4585 if (!isL) { 4586 // if we've met UTF symbol while searching Latin1 pattern, then we can 4587 // skip cnt1 symbols 4588 if (str1_isL != str2_isL) { 4589 mov(result_tmp, cnt1); 4590 } else { 4591 mov(result_tmp, 1); 4592 } 4593 subs(zr, skipch, ASIZE); 4594 br(HS, BMADV); 4595 } 4596 ldrb(result_tmp, Address(sp, skipch)); // load skip distance 4597 BIND(BMADV); 4598 sub(cnt1tmp, cnt1, 1); 4599 add(str2, str2, result_tmp, LSL, str2_chr_shift); 4600 cmp(str2, str2end); 4601 br(LE, BMLOOPSTR2); 4602 add(sp, sp, ASIZE); 4603 b(NOMATCH); 4604 BIND(BMLOOPSTR1_LASTCMP); 4605 cmp(ch1, ch2); 4606 br(NE, BMSKIP); 4607 BIND(BMMATCH); 4608 sub(result, str2, tmp5); 4609 if (!str2_isL) lsr(result, result, 1); 4610 add(sp, sp, ASIZE); 4611 b(DONE); 4612 4613 BIND(LINEARSTUB); 4614 cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm 4615 br(LT, LINEAR_MEDIUM); 4616 mov(result, zr); 4617 RuntimeAddress stub = NULL; 4618 if (isL) { 4619 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll()); 4620 assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated"); 4621 } else if (str1_isL) { 4622 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul()); 4623 assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated"); 4624 } else { 4625 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu()); 4626 assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated"); 4627 } 4628 trampoline_call(stub); 4629 b(DONE); 4630 } 4631 4632 BIND(LINEARSEARCH); 4633 { 4634 Label DO1, DO2, DO3; 4635 4636 Register str2tmp = tmp2; 4637 Register first = tmp3; 4638 4639 if (icnt1 == -1) 4640 { 4641 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT; 4642 4643 cmp(cnt1, u1(str1_isL == str2_isL ? 
4 : 2)); 4644 br(LT, DOSHORT); 4645 BIND(LINEAR_MEDIUM); 4646 (this->*str1_load_1chr)(first, Address(str1)); 4647 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift))); 4648 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift); 4649 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4650 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4651 4652 BIND(FIRST_LOOP); 4653 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 4654 cmp(first, ch2); 4655 br(EQ, STR1_LOOP); 4656 BIND(STR2_NEXT); 4657 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4658 br(LE, FIRST_LOOP); 4659 b(NOMATCH); 4660 4661 BIND(STR1_LOOP); 4662 adds(cnt1tmp, cnt1_neg, str1_chr_size); 4663 add(cnt2tmp, cnt2_neg, str2_chr_size); 4664 br(GE, MATCH); 4665 4666 BIND(STR1_NEXT); 4667 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp)); 4668 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 4669 cmp(ch1, ch2); 4670 br(NE, STR2_NEXT); 4671 adds(cnt1tmp, cnt1tmp, str1_chr_size); 4672 add(cnt2tmp, cnt2tmp, str2_chr_size); 4673 br(LT, STR1_NEXT); 4674 b(MATCH); 4675 4676 BIND(DOSHORT); 4677 if (str1_isL == str2_isL) { 4678 cmp(cnt1, (u1)2); 4679 br(LT, DO1); 4680 br(GT, DO3); 4681 } 4682 } 4683 4684 if (icnt1 == 4) { 4685 Label CH1_LOOP; 4686 4687 (this->*load_4chr)(ch1, str1); 4688 sub(result_tmp, cnt2, 4); 4689 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4690 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4691 4692 BIND(CH1_LOOP); 4693 (this->*load_4chr)(ch2, Address(str2, cnt2_neg)); 4694 cmp(ch1, ch2); 4695 br(EQ, MATCH); 4696 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4697 br(LE, CH1_LOOP); 4698 b(NOMATCH); 4699 } 4700 4701 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) { 4702 Label CH1_LOOP; 4703 4704 BIND(DO2); 4705 (this->*load_2chr)(ch1, str1); 4706 if (icnt1 == 2) { 4707 sub(result_tmp, cnt2, 2); 4708 } 4709 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4710 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4711 BIND(CH1_LOOP); 4712 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 4713 cmp(ch1, ch2); 4714 br(EQ, MATCH); 4715 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4716 br(LE, CH1_LOOP); 4717 b(NOMATCH); 4718 } 4719 4720 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) { 4721 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP; 4722 4723 BIND(DO3); 4724 (this->*load_2chr)(first, str1); 4725 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size)); 4726 if (icnt1 == 3) { 4727 sub(result_tmp, cnt2, 3); 4728 } 4729 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4730 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4731 BIND(FIRST_LOOP); 4732 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 4733 cmpw(first, ch2); 4734 br(EQ, STR1_LOOP); 4735 BIND(STR2_NEXT); 4736 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4737 br(LE, FIRST_LOOP); 4738 b(NOMATCH); 4739 4740 BIND(STR1_LOOP); 4741 add(cnt2tmp, cnt2_neg, 2*str2_chr_size); 4742 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 4743 cmp(ch1, ch2); 4744 br(NE, STR2_NEXT); 4745 b(MATCH); 4746 } 4747 4748 if (icnt1 == -1 || icnt1 == 1) { 4749 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP; 4750 4751 BIND(DO1); 4752 (this->*str1_load_1chr)(ch1, str1); 4753 cmp(cnt2, (u1)8); 4754 br(LT, DO1_SHORT); 4755 4756 sub(result_tmp, cnt2, 8/str2_chr_size); 4757 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4758 mov(tmp3, str2_isL ? 
0x0101010101010101 : 0x0001000100010001); 4759 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4760 4761 if (str2_isL) { 4762 orr(ch1, ch1, ch1, LSL, 8); 4763 } 4764 orr(ch1, ch1, ch1, LSL, 16); 4765 orr(ch1, ch1, ch1, LSL, 32); 4766 BIND(CH1_LOOP); 4767 ldr(ch2, Address(str2, cnt2_neg)); 4768 eor(ch2, ch1, ch2); 4769 sub(tmp1, ch2, tmp3); 4770 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4771 bics(tmp1, tmp1, tmp2); 4772 br(NE, HAS_ZERO); 4773 adds(cnt2_neg, cnt2_neg, 8); 4774 br(LT, CH1_LOOP); 4775 4776 cmp(cnt2_neg, (u1)8); 4777 mov(cnt2_neg, 0); 4778 br(LT, CH1_LOOP); 4779 b(NOMATCH); 4780 4781 BIND(HAS_ZERO); 4782 rev(tmp1, tmp1); 4783 clz(tmp1, tmp1); 4784 add(cnt2_neg, cnt2_neg, tmp1, LSR, 3); 4785 b(MATCH); 4786 4787 BIND(DO1_SHORT); 4788 mov(result_tmp, cnt2); 4789 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 4790 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 4791 BIND(DO1_LOOP); 4792 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 4793 cmpw(ch1, ch2); 4794 br(EQ, MATCH); 4795 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4796 br(LT, DO1_LOOP); 4797 } 4798 } 4799 BIND(NOMATCH); 4800 mov(result, -1); 4801 b(DONE); 4802 BIND(MATCH); 4803 add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift); 4804 BIND(DONE); 4805 } 4806 4807 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 4808 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn); 4809 4810 void MacroAssembler::string_indexof_char(Register str1, Register cnt1, 4811 Register ch, Register result, 4812 Register tmp1, Register tmp2, Register tmp3) 4813 { 4814 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 4815 Register cnt1_neg = cnt1; 4816 Register ch1 = rscratch1; 4817 Register result_tmp = rscratch2; 4818 4819 cmp(cnt1, (u1)4); 4820 br(LT, DO1_SHORT); 4821 4822 orr(ch, ch, ch, LSL, 16); 4823 orr(ch, ch, ch, LSL, 32); 4824 4825 sub(cnt1, cnt1, 4); 4826 mov(result_tmp, cnt1); 4827 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 4828 sub(cnt1_neg, zr, cnt1, LSL, 1); 4829 4830 mov(tmp3, 0x0001000100010001); 4831 4832 BIND(CH1_LOOP); 4833 ldr(ch1, Address(str1, cnt1_neg)); 4834 eor(ch1, ch, ch1); 4835 sub(tmp1, ch1, tmp3); 4836 orr(tmp2, ch1, 0x7fff7fff7fff7fff); 4837 bics(tmp1, tmp1, tmp2); 4838 br(NE, HAS_ZERO); 4839 adds(cnt1_neg, cnt1_neg, 8); 4840 br(LT, CH1_LOOP); 4841 4842 cmp(cnt1_neg, (u1)8); 4843 mov(cnt1_neg, 0); 4844 br(LT, CH1_LOOP); 4845 b(NOMATCH); 4846 4847 BIND(HAS_ZERO); 4848 rev(tmp1, tmp1); 4849 clz(tmp1, tmp1); 4850 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 4851 b(MATCH); 4852 4853 BIND(DO1_SHORT); 4854 mov(result_tmp, cnt1); 4855 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 4856 sub(cnt1_neg, zr, cnt1, LSL, 1); 4857 BIND(DO1_LOOP); 4858 ldrh(ch1, Address(str1, cnt1_neg)); 4859 cmpw(ch, ch1); 4860 br(EQ, MATCH); 4861 adds(cnt1_neg, cnt1_neg, 2); 4862 br(LT, DO1_LOOP); 4863 BIND(NOMATCH); 4864 mov(result, -1); 4865 b(DONE); 4866 BIND(MATCH); 4867 add(result, result_tmp, cnt1_neg, ASR, 1); 4868 BIND(DONE); 4869 } 4870 4871 // Compare strings. 
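// ae encodes the argument encodings (StrIntrinsicNode::LL, LU, UL or UU,
// where L = Latin1 and U = UTF-16). The result follows the usual compareTo
// convention: negative, zero or positive, taken from the first differing
// characters, or from the length difference when one string is a prefix of
// the other.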
4872 void MacroAssembler::string_compare(Register str1, Register str2, 4873 Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2, 4874 FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) { 4875 Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB, 4876 DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, 4877 SHORT_LOOP_START, TAIL_CHECK; 4878 4879 const u1 STUB_THRESHOLD = 64 + 8; 4880 bool isLL = ae == StrIntrinsicNode::LL; 4881 bool isLU = ae == StrIntrinsicNode::LU; 4882 bool isUL = ae == StrIntrinsicNode::UL; 4883 4884 bool str1_isL = isLL || isLU; 4885 bool str2_isL = isLL || isUL; 4886 4887 int str1_chr_shift = str1_isL ? 0 : 1; 4888 int str2_chr_shift = str2_isL ? 0 : 1; 4889 int str1_chr_size = str1_isL ? 1 : 2; 4890 int str2_chr_size = str2_isL ? 1 : 2; 4891 int minCharsInWord = isLL ? wordSize : wordSize/2; 4892 4893 FloatRegister vtmpZ = vtmp1, vtmp = vtmp2; 4894 chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb : 4895 (chr_insn)&MacroAssembler::ldrh; 4896 chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb : 4897 (chr_insn)&MacroAssembler::ldrh; 4898 uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw : 4899 (uxt_insn)&MacroAssembler::uxthw; 4900 4901 BLOCK_COMMENT("string_compare {"); 4902 4903 // Bizarrely, the counts are passed in bytes, regardless of whether they 4904 // are L or U strings; the result, however, is always in characters. 4905 if (!str1_isL) asrw(cnt1, cnt1, 1); 4906 if (!str2_isL) asrw(cnt2, cnt2, 1); 4907 4908 // Compute the minimum of the string lengths and save the difference. 4909 subsw(result, cnt1, cnt2); 4910 cselw(cnt2, cnt1, cnt2, Assembler::LE); // min 4911 4912 // A very short string 4913 cmpw(cnt2, minCharsInWord); 4914 br(Assembler::LE, SHORT_STRING); 4915 4916 // Compare longwords 4917 // load first parts of strings and finish initialization while loading 4918 { 4919 if (str1_isL == str2_isL) { // LL or UU 4920 ldr(tmp1, Address(str1)); 4921 cmp(str1, str2); 4922 br(Assembler::EQ, DONE); 4923 ldr(tmp2, Address(str2)); 4924 cmp(cnt2, STUB_THRESHOLD); 4925 br(GE, STUB); 4926 subsw(cnt2, cnt2, minCharsInWord); 4927 br(EQ, TAIL_CHECK); 4928 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 4929 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 4930 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 4931 } else if (isLU) { 4932 ldrs(vtmp, Address(str1)); 4933 cmp(str1, str2); 4934 br(Assembler::EQ, DONE); 4935 ldr(tmp2, Address(str2)); 4936 cmp(cnt2, STUB_THRESHOLD); 4937 br(GE, STUB); 4938 subw(cnt2, cnt2, 4); 4939 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 4940 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 4941 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 4942 zip1(vtmp, T8B, vtmp, vtmpZ); 4943 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 4944 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 4945 add(cnt1, cnt1, 4); 4946 fmovd(tmp1, vtmp); 4947 } else { // UL case 4948 ldr(tmp1, Address(str1)); 4949 cmp(str1, str2); 4950 br(Assembler::EQ, DONE); 4951 ldrs(vtmp, Address(str2)); 4952 cmp(cnt2, STUB_THRESHOLD); 4953 br(GE, STUB); 4954 subw(cnt2, cnt2, 4); 4955 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 4956 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 4957 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 4958 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 4959 zip1(vtmp, T8B, vtmp, vtmpZ); 4960 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 4961 add(cnt1, cnt1, 8); 4962 fmovd(tmp2, vtmp); 4963 } 4964
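    // cnt1/cnt2 now hold negative byte offsets from the ends of the strings;
    // the main loop below walks them up towards zero, comparing one longword
    // per iteration and, in the mixed-encoding cases, inflating the Latin1
    // side with zip1 as it goes.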
adds(cnt2, cnt2, isUL ? 4 : 8); 4965 br(GE, TAIL); 4966 eor(rscratch2, tmp1, tmp2); 4967 cbnz(rscratch2, DIFFERENCE); 4968 // main loop 4969 bind(NEXT_WORD); 4970 if (str1_isL == str2_isL) { 4971 ldr(tmp1, Address(str1, cnt2)); 4972 ldr(tmp2, Address(str2, cnt2)); 4973 adds(cnt2, cnt2, 8); 4974 } else if (isLU) { 4975 ldrs(vtmp, Address(str1, cnt1)); 4976 ldr(tmp2, Address(str2, cnt2)); 4977 add(cnt1, cnt1, 4); 4978 zip1(vtmp, T8B, vtmp, vtmpZ); 4979 fmovd(tmp1, vtmp); 4980 adds(cnt2, cnt2, 8); 4981 } else { // UL 4982 ldrs(vtmp, Address(str2, cnt2)); 4983 ldr(tmp1, Address(str1, cnt1)); 4984 zip1(vtmp, T8B, vtmp, vtmpZ); 4985 add(cnt1, cnt1, 8); 4986 fmovd(tmp2, vtmp); 4987 adds(cnt2, cnt2, 4); 4988 } 4989 br(GE, TAIL); 4990 4991 eor(rscratch2, tmp1, tmp2); 4992 cbz(rscratch2, NEXT_WORD); 4993 b(DIFFERENCE); 4994 bind(TAIL); 4995 eor(rscratch2, tmp1, tmp2); 4996 cbnz(rscratch2, DIFFERENCE); 4997 // Last longword. In the case where length == 4 we compare the 4998 // same longword twice, but that's still faster than another 4999 // conditional branch. 5000 if (str1_isL == str2_isL) { 5001 ldr(tmp1, Address(str1)); 5002 ldr(tmp2, Address(str2)); 5003 } else if (isLU) { 5004 ldrs(vtmp, Address(str1)); 5005 ldr(tmp2, Address(str2)); 5006 zip1(vtmp, T8B, vtmp, vtmpZ); 5007 fmovd(tmp1, vtmp); 5008 } else { // UL 5009 ldrs(vtmp, Address(str2)); 5010 ldr(tmp1, Address(str1)); 5011 zip1(vtmp, T8B, vtmp, vtmpZ); 5012 fmovd(tmp2, vtmp); 5013 } 5014 bind(TAIL_CHECK); 5015 eor(rscratch2, tmp1, tmp2); 5016 cbz(rscratch2, DONE); 5017 5018 // Find the first different characters in the longwords and 5019 // compute their difference. 5020 bind(DIFFERENCE); 5021 rev(rscratch2, rscratch2); 5022 clz(rscratch2, rscratch2); 5023 andr(rscratch2, rscratch2, isLL ? -8 : -16); 5024 lsrv(tmp1, tmp1, rscratch2); 5025 (this->*ext_chr)(tmp1, tmp1); 5026 lsrv(tmp2, tmp2, rscratch2); 5027 (this->*ext_chr)(tmp2, tmp2); 5028 subw(result, tmp1, tmp2); 5029 b(DONE); 5030 } 5031 5032 bind(STUB); 5033 RuntimeAddress stub = NULL; 5034 switch(ae) { 5035 case StrIntrinsicNode::LL: 5036 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL()); 5037 break; 5038 case StrIntrinsicNode::UU: 5039 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU()); 5040 break; 5041 case StrIntrinsicNode::LU: 5042 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU()); 5043 break; 5044 case StrIntrinsicNode::UL: 5045 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL()); 5046 break; 5047 default: 5048 ShouldNotReachHere(); 5049 } 5050 assert(stub.target() != NULL, "compare_long_string stub has not been generated"); 5051 trampoline_call(stub); 5052 b(DONE); 5053 5054 bind(SHORT_STRING); 5055 // Is the minimum length zero? 
5056 cbz(cnt2, DONE); 5057 // Arrange the code to do most branches while loading, and to load the 5058 // next characters while comparing the previous ones 5059 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 5060 subs(cnt2, cnt2, 1); 5061 br(EQ, SHORT_LAST_INIT); 5062 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 5063 b(SHORT_LOOP_START); 5064 bind(SHORT_LOOP); 5065 subs(cnt2, cnt2, 1); 5066 br(EQ, SHORT_LAST); 5067 bind(SHORT_LOOP_START); 5068 (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size))); 5069 (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size))); 5070 cmp(tmp1, cnt1); 5071 br(NE, SHORT_LOOP_TAIL); 5072 subs(cnt2, cnt2, 1); 5073 br(EQ, SHORT_LAST2); 5074 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 5075 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 5076 cmp(tmp2, rscratch1); 5077 br(EQ, SHORT_LOOP); 5078 sub(result, tmp2, rscratch1); 5079 b(DONE); 5080 bind(SHORT_LOOP_TAIL); 5081 sub(result, tmp1, cnt1); 5082 b(DONE); 5083 bind(SHORT_LAST2); 5084 cmp(tmp2, rscratch1); 5085 br(EQ, DONE); 5086 sub(result, tmp2, rscratch1); 5087 5088 b(DONE); 5089 bind(SHORT_LAST_INIT); 5090 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 5091 bind(SHORT_LAST); 5092 cmp(tmp1, cnt1); 5093 br(EQ, DONE); 5094 sub(result, tmp1, cnt1); 5095 5096 bind(DONE); 5097 5098 BLOCK_COMMENT("} string_compare"); 5099 } 5100 #endif // COMPILER2 5101 5102 // This method checks whether the provided byte array contains a byte with the highest bit set. 5103 void MacroAssembler::has_negatives(Register ary1, Register len, Register result) { 5104 // The simple and most common case, a small aligned array that is not at 5105 // the end of a memory page, is handled here. All other cases are in the stub. 5106 Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE; 5107 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 5108 assert_different_registers(ary1, len, result); 5109 5110 cmpw(len, 0); 5111 br(LE, SET_RESULT); 5112 cmpw(len, 4 * wordSize); 5113 br(GE, STUB_LONG); // size >= 32: go to the long stub 5114 5115 int shift = 64 - exact_log2(os::vm_page_size()); 5116 lsl(rscratch1, ary1, shift); 5117 mov(rscratch2, (size_t)(4 * wordSize) << shift); 5118 adds(rscratch2, rscratch1, rscratch2); // At end of page?
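  // The lsl above moved ary1's offset-within-page into the top bits, so the
  // adds carries out (CS) exactly when that offset plus 32 bytes would cross
  // the page boundary, i.e. when the reads below might touch the next page.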
  int shift = 64 - exact_log2(os::vm_page_size());
  lsl(rscratch1, ary1, shift);
  mov(rscratch2, (size_t)(4 * wordSize) << shift);
  adds(rscratch2, rscratch1, rscratch2); // At end of page?
  br(CS, STUB);                          // If so, go to stub.
  subs(len, len, wordSize);
  br(LT, END);

  BIND(LOOP);
  ldr(rscratch1, Address(post(ary1, wordSize)));
  tst(rscratch1, UPPER_BIT_MASK);
  br(NE, SET_RESULT);
  subs(len, len, wordSize);
  br(GE, LOOP);
  cmpw(len, -wordSize);
  br(EQ, SET_RESULT);

  BIND(END);
  ldr(result, Address(ary1));
  sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
  lslv(result, result, len);
  tst(result, UPPER_BIT_MASK);
  b(SET_RESULT);

  BIND(STUB);
  RuntimeAddress has_neg = RuntimeAddress(StubRoutines::aarch64::has_negatives());
  assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
  trampoline_call(has_neg);
  b(DONE);

  BIND(STUB_LONG);
  RuntimeAddress has_neg_long = RuntimeAddress(
          StubRoutines::aarch64::has_negatives_long());
  assert(has_neg_long.target() != NULL, "has_negatives_long stub has not been generated");
  trampoline_call(has_neg_long);
  b(DONE);

  BIND(SET_RESULT);
  cset(result, NE); // set true or false

  BIND(DONE);
}

void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
                                   Register tmp4, Register tmp5, Register result,
                                   Register cnt1, int elem_size) {
  Label DONE, SAME;
  Register tmp1 = rscratch1;
  Register tmp2 = rscratch2;
  Register cnt2 = tmp2; // cnt2 only used in array length compare
  int elem_per_word = wordSize/elem_size;
  int log_elem_size = exact_log2(elem_size);
  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset
    = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
  int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);

  assert(elem_size == 1 || elem_size == 2, "must be char or byte");
  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);

#ifndef PRODUCT
  {
    const char kind = (elem_size == 2) ? 'U' : 'L';
    char comment[64];
    snprintf(comment, sizeof comment, "array_equals%c{", kind);
    BLOCK_COMMENT(comment);
  }
#endif

  // if (a1 == a2)
  //     return true;
  cmpoop(a1, a2); // May have read barriers for a1 and a2.
  br(EQ, SAME);

  if (UseSimpleArrayEquals) {
    Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
    // if (a1 == null || a2 == null)
    //     return false;
    // (a1 & a2) == 0 means that either some pointer is null or we have
    // very-rare-or-even-probably-impossible pointer values, so we can
    // save one branch in most cases.
    tst(a1, a2);
    mov(result, false);
    br(EQ, A_MIGHT_BE_NULL);
    // if (a1.length != a2.length)
    //     return false;
    bind(A_IS_NOT_NULL);
    ldrw(cnt1, Address(a1, length_offset));
    ldrw(cnt2, Address(a2, length_offset));
    eorw(tmp5, cnt1, cnt2);
    cbnzw(tmp5, DONE);
    lea(a1, Address(a1, base_offset));
    lea(a2, Address(a2, base_offset));
    // Check for short strings, i.e. smaller than wordSize.
    subs(cnt1, cnt1, elem_per_word);
    br(Assembler::LT, SHORT);
    // Main 8 byte comparison loop.
    bind(NEXT_WORD); {
      ldr(tmp1, Address(post(a1, wordSize)));
      ldr(tmp2, Address(post(a2, wordSize)));
      subs(cnt1, cnt1, elem_per_word);
      eor(tmp5, tmp1, tmp2);
      cbnz(tmp5, DONE);
    } br(GT, NEXT_WORD);
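    // Tail handling sketch (illustrative only): after the loop, cnt1
    // is <= 0 elements, so Address(a1, cnt1) (after scaling to bytes)
    // points exactly wordSize bytes before the end of the data.
    // Re-reading that last word may overlap elements already compared,
    // which is harmless:
    //
    //   tail1 = load64(a1_end - 8);
    //   tail2 = load64(a2_end - 8);
    //   if (tail1 != tail2) return false;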
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
    // length == 4.
    if (log_elem_size > 0)
      lsl(cnt1, cnt1, log_elem_size);
    ldr(tmp3, Address(a1, cnt1));
    ldr(tmp4, Address(a2, cnt1));
    eor(tmp5, tmp3, tmp4);
    cbnz(tmp5, DONE);
    b(SAME);
    bind(A_MIGHT_BE_NULL);
    // In case both a1 and a2 are non-null, proceed with loads.
    cbz(a1, DONE);
    cbz(a2, DONE);
    b(A_IS_NOT_NULL);
    bind(SHORT);

    tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
    {
      ldrw(tmp1, Address(post(a1, 4)));
      ldrw(tmp2, Address(post(a2, 4)));
      eorw(tmp5, tmp1, tmp2);
      cbnzw(tmp5, DONE);
    }
    bind(TAIL03);
    tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
    {
      ldrh(tmp3, Address(post(a1, 2)));
      ldrh(tmp4, Address(post(a2, 2)));
      eorw(tmp5, tmp3, tmp4);
      cbnzw(tmp5, DONE);
    }
    bind(TAIL01);
    if (elem_size == 1) { // Only needed when comparing byte arrays.
      tbz(cnt1, 0, SAME); // 0-1 bytes left.
      {
        ldrb(tmp1, a1);
        ldrb(tmp2, a2);
        eorw(tmp5, tmp1, tmp2);
        cbnzw(tmp5, DONE);
      }
    }
  } else {
    Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
        CSET_EQ, LAST_CHECK;
    mov(result, false);
    cbz(a1, DONE);
    ldrw(cnt1, Address(a1, length_offset));
    cbz(a2, DONE);
    ldrw(cnt2, Address(a2, length_offset));
    // On most CPUs a2 is still "locked" (surprisingly) by the ldrw, so
    // it is faster to perform another branch before comparing a1 and a2.
    cmp(cnt1, (u1)elem_per_word);
    br(LE, SHORT); // short or same
    ldr(tmp3, Address(pre(a1, base_offset)));
    subs(zr, cnt1, stubBytesThreshold);
    br(GE, STUB);
    ldr(tmp4, Address(pre(a2, base_offset)));
    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
    cmp(cnt2, cnt1);
    br(NE, DONE);

    // Main 16 byte comparison loop with 2 exits
    bind(NEXT_DWORD); {
      ldr(tmp1, Address(pre(a1, wordSize)));
      ldr(tmp2, Address(pre(a2, wordSize)));
      subs(cnt1, cnt1, 2 * elem_per_word);
      br(LE, TAIL);
      eor(tmp4, tmp3, tmp4);
      cbnz(tmp4, DONE);
      ldr(tmp3, Address(pre(a1, wordSize)));
      ldr(tmp4, Address(pre(a2, wordSize)));
      cmp(cnt1, (u1)elem_per_word);
      br(LE, TAIL2);
      cmp(tmp1, tmp2);
    } br(EQ, NEXT_DWORD);
    b(DONE);

    bind(TAIL);
    eor(tmp4, tmp3, tmp4);
    eor(tmp2, tmp1, tmp2);
    lslv(tmp2, tmp2, tmp5);
    orr(tmp5, tmp4, tmp2);
    cmp(tmp5, zr);
    b(CSET_EQ);

    bind(TAIL2);
    eor(tmp2, tmp1, tmp2);
    cbnz(tmp2, DONE);
    b(LAST_CHECK);

    bind(STUB);
    ldr(tmp4, Address(pre(a2, base_offset)));
    cmp(cnt2, cnt1);
    br(NE, DONE);
    if (elem_size == 2) { // convert to byte counter
      lsl(cnt1, cnt1, 1);
    }
    eor(tmp5, tmp3, tmp4);
    cbnz(tmp5, DONE);
    RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
    assert(stub.target() != NULL, "array_equals_long stub has not been generated");
    trampoline_call(stub);
    b(DONE);

    bind(EARLY_OUT);
    // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
    // If a2 == null we must return false (i.e. 0); otherwise we return
    // true, so we can simply return a2 itself.
    mov(result, a2);
    b(DONE);
    bind(SHORT);
    cmp(cnt2, cnt1);
    br(NE, DONE);
    cbz(cnt1, SAME);
    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
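    // The negated bit count in tmp5 exploits AArch64 variable-shift
    // semantics: LSLV uses only the low 6 bits of the shift register,
    // so shifting left by -n is shifting by (64 - n), which pushes any
    // bytes beyond the arrays' logical length out of the register.
    // Sketch (illustrative only):
    //
    //   valid_bits = cnt1 << (3 + log_elem_size);     // elements -> bits
    //   diff = (last1 ^ last2) << (64 - valid_bits);  // drop stale bytes
    //   if (diff != 0) return false;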
    ldr(tmp3, Address(a1, base_offset));
    ldr(tmp4, Address(a2, base_offset));
    bind(LAST_CHECK);
    eor(tmp4, tmp3, tmp4);
    lslv(tmp5, tmp4, tmp5);
    cmp(tmp5, zr);
    bind(CSET_EQ);
    cset(result, EQ);
    b(DONE);
  }

  bind(SAME);
  mov(result, true);
  // That's it.
  bind(DONE);

  BLOCK_COMMENT("} array_equals");
}

// Compare Strings

// For Strings we're passed the address of the first characters in a1
// and a2 and the length in cnt1.
// elem_size is the element size in bytes: either 1 or 2.
// There are two implementations.  For arrays >= 8 bytes, all
// comparisons (including the final one, which may overlap) are
// performed 8 bytes at a time.  For strings < 8 bytes, we compare a
// halfword, then a short, and then a byte.

void MacroAssembler::string_equals(Register a1, Register a2,
                                   Register result, Register cnt1, int elem_size)
{
  Label SAME, DONE, SHORT, NEXT_WORD;
  Register tmp1 = rscratch1;
  Register tmp2 = rscratch2;
  Register cnt2 = tmp2; // cnt2 only used in array length compare

  assert(elem_size == 1 || elem_size == 2, "must be 1 or 2 bytes");
  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);

#ifndef PRODUCT
  {
    const char kind = (elem_size == 2) ? 'U' : 'L';
    char comment[64];
    snprintf(comment, sizeof comment, "{string_equals%c", kind);
    BLOCK_COMMENT(comment);
  }
#endif

  mov(result, false);

  // Check for short strings, i.e. smaller than wordSize.
  subs(cnt1, cnt1, wordSize);
  br(Assembler::LT, SHORT);
  // Main 8 byte comparison loop.
  bind(NEXT_WORD); {
    ldr(tmp1, Address(post(a1, wordSize)));
    ldr(tmp2, Address(post(a2, wordSize)));
    subs(cnt1, cnt1, wordSize);
    eor(tmp1, tmp1, tmp2);
    cbnz(tmp1, DONE);
  } br(GT, NEXT_WORD);
  // Last longword.  In the case where length == 4 we compare the
  // same longword twice, but that's still faster than another
  // conditional branch.
  // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
  // length == 4.
  ldr(tmp1, Address(a1, cnt1));
  ldr(tmp2, Address(a2, cnt1));
  eor(tmp2, tmp1, tmp2);
  cbnz(tmp2, DONE);
  b(SAME);

  bind(SHORT);
  Label TAIL03, TAIL01;

  tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
  {
    ldrw(tmp1, Address(post(a1, 4)));
    ldrw(tmp2, Address(post(a2, 4)));
    eorw(tmp1, tmp1, tmp2);
    cbnzw(tmp1, DONE);
  }
  bind(TAIL03);
  tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
  {
    ldrh(tmp1, Address(post(a1, 2)));
    ldrh(tmp2, Address(post(a2, 2)));
    eorw(tmp1, tmp1, tmp2);
    cbnzw(tmp1, DONE);
  }
  bind(TAIL01);
  if (elem_size == 1) { // Only needed when comparing 1-byte elements
    tbz(cnt1, 0, SAME); // 0-1 bytes left.
    {
      ldrb(tmp1, a1);
      ldrb(tmp2, a2);
      eorw(tmp1, tmp1, tmp2);
      cbnzw(tmp1, DONE);
    }
  }
  // Arrays are equal.
  bind(SAME);
  mov(result, true);

  // That's it.
  bind(DONE);
  BLOCK_COMMENT("} string_equals");
}


// The size of the blocks erased by the zero_blocks stub.  We must
// handle anything smaller than this ourselves in zero_words().
const int MacroAssembler::zero_words_block_size = 8;
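// A sketch of the residue handling in zero_words() below (illustrative
// only): the tbz ladder stores cnt % zero_words_block_size words
// without a loop, testing one bit of cnt at a time:
//
//   if (cnt & 4) { stp(zr, zr); stp(zr, zr); } // 4 words
//   if (cnt & 2) { stp(zr, zr); }              // 2 words
//   if (cnt & 1) { str(zr); }                  // 1 word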
// zero_words() is used by C2 ClearArray patterns.  It is as small as
// possible, handling small word counts locally and delegating
// anything larger to the zero_blocks stub.  It is expanded many times
// in compiled code, so it is important to keep it short.

// ptr:   Address of a buffer to be zeroed.
// cnt:   Count in HeapWords.
//
// ptr, cnt, rscratch1, and rscratch2 are clobbered.
void MacroAssembler::zero_words(Register ptr, Register cnt)
{
  assert(is_power_of_2(zero_words_block_size), "adjust this");
  assert(ptr == r10 && cnt == r11, "mismatch in register usage");

  BLOCK_COMMENT("zero_words {");
  cmp(cnt, (u1)zero_words_block_size);
  Label around;
  br(LO, around);
  {
    RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks());
    assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
    if (StubRoutines::aarch64::complete()) {
      trampoline_call(zero_blocks);
    } else {
      bl(zero_blocks);
    }
  }
  bind(around);
  for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
    Label l;
    tbz(cnt, exact_log2(i), l);
    for (int j = 0; j < i; j += 2) {
      stp(zr, zr, post(ptr, 16));
    }
    bind(l);
  }
  {
    Label l;
    tbz(cnt, 0, l);
    str(zr, Address(ptr));
    bind(l);
  }
  BLOCK_COMMENT("} zero_words");
}

// base:  Address of a buffer to be zeroed, 8 bytes aligned.
// cnt:   Immediate count in HeapWords.
#define SmallArraySize (18 * BytesPerLong)
void MacroAssembler::zero_words(Register base, u_int64_t cnt)
{
  BLOCK_COMMENT("zero_words {");
  int i = cnt & 1; // store any odd word to start
  if (i) str(zr, Address(base));

  if (cnt <= SmallArraySize / BytesPerLong) {
    for (; i < (int)cnt; i += 2)
      stp(zr, zr, Address(base, i * wordSize));
  } else {
    const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
    int remainder = cnt % (2 * unroll);
    for (; i < remainder; i += 2)
      stp(zr, zr, Address(base, i * wordSize));

    Label loop;
    Register cnt_reg = rscratch1;
    Register loop_base = rscratch2;
    cnt = cnt - remainder;
    mov(cnt_reg, cnt);
    // Adjust base and prebias by -2 * wordSize so we can pre-increment.
    add(loop_base, base, (remainder - 2) * wordSize);
    bind(loop);
    sub(cnt_reg, cnt_reg, 2 * unroll);
    for (i = 1; i < unroll; i++)
      stp(zr, zr, Address(loop_base, 2 * i * wordSize));
    stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
    cbnz(cnt_reg, loop);
  }
  BLOCK_COMMENT("} zero_words");
}

// Zero blocks of memory by using DC ZVA.
//
// First aligns the base address sufficiently for DC ZVA, then uses
// DC ZVA repeatedly for every full block.  cnt is the size to be
// zeroed in HeapWords.  Returns the count of words left to be zeroed
// in cnt.
//
// NOTE: This is intended to be used in the zero_blocks() stub.  If
// you want to use it elsewhere, note that cnt must be >= 2*zva_length.
void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
  Register tmp = rscratch1;
  Register tmp2 = rscratch2;
  int zva_length = VM_Version::zva_length();
  Label initial_table_end, loop_zva;
  Label fini;

  // Base must be 16 byte aligned.  If not, just return and let the
  // caller handle it.
  tst(base, 0x0f);
  br(Assembler::NE, fini);
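  // What follows is a computed jump into a table of stp instructions
  // (sketch, illustrative only): tmp holds the number of bytes needed
  // to reach ZVA alignment, and each stp in the table is one 4-byte
  // instruction that zeroes 16 bytes, so branching (tmp / 16)
  // instructions before the end of the table executes exactly the
  // stores required:
  //
  //   entry = initial_table_end - (tmp / 16) * 4;  // == tmp / 4 bytes back
  //   goto entry;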
  // Align base with ZVA length.
  neg(tmp, base);
  andr(tmp, tmp, zva_length - 1);

  // tmp: the number of bytes to be filled to align the base with ZVA length.
  add(base, base, tmp);
  sub(cnt, cnt, tmp, Assembler::ASR, 3);
  adr(tmp2, initial_table_end);
  sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
  br(tmp2);

  for (int i = -zva_length + 16; i < 0; i += 16)
    stp(zr, zr, Address(base, i));
  bind(initial_table_end);

  sub(cnt, cnt, zva_length >> 3);
  bind(loop_zva);
  dc(Assembler::ZVA, base);
  subs(cnt, cnt, zva_length >> 3);
  add(base, base, zva_length);
  br(Assembler::GE, loop_zva);
  add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
  bind(fini);
}

// base:   Address of a buffer to be filled, 8 bytes aligned.
// cnt:    Count in 8-byte units.
// value:  Value to be filled with.
// base will point to the end of the buffer after filling.
void MacroAssembler::fill_words(Register base, Register cnt, Register value)
{
//  Algorithm:
//
//    scratch1 = cnt & 7;
//    cnt -= scratch1;
//    p += scratch1;
//    switch (scratch1) {
//      do {
//        cnt -= 8;
//          p[-8] = v;
//        case 7:
//          p[-7] = v;
//        case 6:
//          p[-6] = v;
//          // ...
//        case 1:
//          p[-1] = v;
//        case 0:
//          p += 8;
//      } while (cnt);
//    }

  assert_different_registers(base, cnt, value, rscratch1, rscratch2);

  Label fini, skip, entry, loop;
  const int unroll = 8; // Number of stp instructions we'll unroll

  cbz(cnt, fini);
  tbz(base, 3, skip);
  str(value, Address(post(base, 8)));
  sub(cnt, cnt, 1);
  bind(skip);

  andr(rscratch1, cnt, (unroll-1) * 2);
  sub(cnt, cnt, rscratch1);
  add(base, base, rscratch1, Assembler::LSL, 3);
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
  br(rscratch2);

  bind(loop);
  add(base, base, unroll * 16);
  for (int i = -unroll; i < 0; i++)
    stp(value, value, Address(base, i * 16));
  bind(entry);
  subs(cnt, cnt, unroll * 2);
  br(Assembler::GE, loop);

  tbz(cnt, 0, fini);
  str(value, Address(post(base, 8)));
  bind(fini);
}
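// A sketch of the SIMD check used by encode_iso_array() below
// (illustrative only): uzp1 packs the low byte of each 16-bit char and
// uzp2 packs the high bytes; a chunk can be compressed to ISO-8859-1
// only if every high byte is zero:
//
//   lo = uzp1(v1, v2);              // low bytes of each char, packed
//   hi = uzp2(v1, v2);              // high bytes of each char, packed
//   if (hi != 0) goto scalar_loop;  // some char is > 0xFF
//   store(lo);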
// Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
// java/lang/StringUTF16.compress.
void MacroAssembler::encode_iso_array(Register src, Register dst,
                                      Register len, Register result,
                                      FloatRegister Vtmp1, FloatRegister Vtmp2,
                                      FloatRegister Vtmp3, FloatRegister Vtmp4)
{
  Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
      NEXT_32_START, NEXT_32_PRFM_START;
  Register tmp1 = rscratch1, tmp2 = rscratch2;

  mov(result, len); // Save initial len

#ifndef BUILTIN_SIM
  cmp(len, (u1)8); // handle shortest strings first
  br(LT, LOOP_1);
  cmp(len, (u1)32);
  br(LT, NEXT_8);
  // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
  // to convert chars to bytes.
  if (SoftwarePrefetchHintDistance >= 0) {
    ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
    subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
    br(LE, NEXT_32_START);
    b(NEXT_32_PRFM_START);
    BIND(NEXT_32_PRFM);
    ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
    BIND(NEXT_32_PRFM_START);
    prfm(Address(src, SoftwarePrefetchHintDistance));
    orr(v4, T16B, Vtmp1, Vtmp2);
    orr(v5, T16B, Vtmp3, Vtmp4);
    uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
    uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
    uzp2(v5, T16B, v4, v5); // high bytes
    umov(tmp2, v5, D, 1);
    fmovd(tmp1, v5);
    orr(tmp1, tmp1, tmp2);
    cbnz(tmp1, LOOP_8);
    stpq(Vtmp1, Vtmp3, dst);
    sub(len, len, 32);
    add(dst, dst, 32);
    add(src, src, 64);
    subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
    br(GE, NEXT_32_PRFM);
    cmp(len, (u1)32);
    br(LT, LOOP_8);
    BIND(NEXT_32);
    ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
    BIND(NEXT_32_START);
  } else {
    BIND(NEXT_32);
    ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
  }
  prfm(Address(src, SoftwarePrefetchHintDistance));
  uzp1(v4, T16B, Vtmp1, Vtmp2);
  uzp1(v5, T16B, Vtmp3, Vtmp4);
  orr(Vtmp1, T16B, Vtmp1, Vtmp2);
  orr(Vtmp3, T16B, Vtmp3, Vtmp4);
  uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
  umov(tmp2, Vtmp1, D, 1);
  fmovd(tmp1, Vtmp1);
  orr(tmp1, tmp1, tmp2);
  cbnz(tmp1, LOOP_8);
  stpq(v4, v5, dst);
  sub(len, len, 32);
  add(dst, dst, 32);
  add(src, src, 64);
  cmp(len, (u1)32);
  br(GE, NEXT_32);
  cbz(len, DONE);

  BIND(LOOP_8);
  cmp(len, (u1)8);
  br(LT, LOOP_1);
  BIND(NEXT_8);
  ld1(Vtmp1, T8H, src);
  uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
  uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
  fmovd(tmp1, Vtmp3);
  cbnz(tmp1, NEXT_1);
  strd(Vtmp2, dst);

  sub(len, len, 8);
  add(dst, dst, 8);
  add(src, src, 16);
  cmp(len, (u1)8);
  br(GE, NEXT_8);

  BIND(LOOP_1);
#endif
  cbz(len, DONE);
  BIND(NEXT_1);
  ldrh(tmp1, Address(post(src, 2)));
  tst(tmp1, 0xff00);
  br(NE, SET_RESULT);
  strb(tmp1, Address(post(dst, 1)));
  subs(len, len, 1);
  br(GT, NEXT_1);

  BIND(SET_RESULT);
  sub(result, result, len); // Return the index where we stopped;
                            // len == 0 means we processed all characters.
  BIND(DONE);
}
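// A sketch of the widening step in byte_array_inflate() below
// (illustrative only, hypothetical names): interleaving a vector of
// Latin-1 bytes with a zero vector yields little-endian 16-bit chars:
//
//   zeroes = {0, 0, ...};          // vtmp1, cleared once via fmovd
//   chars  = zip1(bytes, zeroes);  // b0,00,b1,00,... == UTF-16LE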
// Inflate byte[] array to char[].
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
                                        FloatRegister vtmp1, FloatRegister vtmp2,
                                        FloatRegister vtmp3, Register tmp4) {
  Label big, done, after_init, to_stub;

  assert_different_registers(src, dst, len, tmp4, rscratch1);

  fmovd(vtmp1, zr);
  lsrw(tmp4, len, 3);
  bind(after_init);
  cbnzw(tmp4, big);
  // Short string: less than 8 bytes.
  {
    Label loop, tiny;

    cmpw(len, 4);
    br(LT, tiny);
    // Use SIMD to do 4 bytes.
    ldrs(vtmp2, post(src, 4));
    zip1(vtmp3, T8B, vtmp2, vtmp1);
    subw(len, len, 4);
    strd(vtmp3, post(dst, 8));

    cbzw(len, done);

    // Do the remaining bytes by steam.
    bind(loop);
    ldrb(tmp4, post(src, 1));
    strh(tmp4, post(dst, 2));
    subw(len, len, 1);

    bind(tiny);
    cbnz(len, loop);

    b(done);
  }

  if (SoftwarePrefetchHintDistance >= 0) {
    bind(to_stub);
    RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
    assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
    trampoline_call(stub);
    b(after_init);
  }

  // Unpack the bytes 8 at a time.
  bind(big);
  {
    Label loop, around, loop_last, loop_start;

    if (SoftwarePrefetchHintDistance >= 0) {
      const int large_loop_threshold = (64 + 16)/8;
      ldrd(vtmp2, post(src, 8));
      andw(len, len, 7);
      cmp(tmp4, (u1)large_loop_threshold);
      br(GE, to_stub);
      b(loop_start);

      bind(loop);
      ldrd(vtmp2, post(src, 8));
      bind(loop_start);
      subs(tmp4, tmp4, 1);
      br(EQ, loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      ldrd(vtmp3, post(src, 8));
      st1(vtmp2, T8H, post(dst, 16));
      subs(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp3, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      br(NE, loop);
      b(around);
      bind(loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      st1(vtmp2, T8H, post(dst, 16));
      bind(around);
      cbz(len, done);
    } else {
      andw(len, len, 7);
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      sub(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp2, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      cbnz(tmp4, loop);
    }
  }

  // Do the tail of up to 8 bytes.
  add(src, src, len);
  ldrd(vtmp3, Address(src, -8));
  add(dst, dst, len, ext::uxtw, 1);
  zip1(vtmp3, T16B, vtmp3, vtmp1);
  strq(vtmp3, Address(dst, -16));

  bind(done);
}

// Compress char[] array to byte[].
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
                                         FloatRegister tmp1Reg, FloatRegister tmp2Reg,
                                         FloatRegister tmp3Reg, FloatRegister tmp4Reg,
                                         Register result) {
  encode_iso_array(src, dst, len, result,
                   tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
  cmp(len, zr);
  csel(result, result, zr, EQ);
}
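// The csel above makes char_array_compress() all-or-nothing (sketch,
// illustrative only): encode_iso_array() leaves the number of
// characters still unprocessed in len, so
//
//   result = (len == 0) ? original_length : 0;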
// get_thread() can be called anywhere inside generated code so we
// need to save whatever non-callee-saved context might get clobbered
// by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
// the call setup code.
//
// aarch64_get_thread_helper() clobbers only r0, r1, and flags.
//
void MacroAssembler::get_thread(Register dst) {
  RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
  push(saved_regs, sp);

  mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
  blrt(lr, 1, 0, 1);
  if (dst != c_rarg0) {
    mov(dst, c_rarg0);
  }

  pop(saved_regs, sp);
}