1 /* 2 * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 
23 * 24 */ 25 26 #include <sys/types.h> 27 28 #include "precompiled.hpp" 29 #include "jvm.h" 30 #include "asm/assembler.hpp" 31 #include "asm/assembler.inline.hpp" 32 #include "gc/shared/barrierSet.hpp" 33 #include "gc/shared/cardTable.hpp" 34 #include "gc/shared/barrierSetAssembler.hpp" 35 #include "gc/shared/cardTableBarrierSet.hpp" 36 #include "interpreter/interpreter.hpp" 37 #include "compiler/disassembler.hpp" 38 #include "memory/resourceArea.hpp" 39 #include "nativeInst_aarch64.hpp" 40 #include "oops/accessDecorators.hpp" 41 #include "oops/compressedOops.inline.hpp" 42 #include "oops/klass.inline.hpp" 43 #include "oops/oop.hpp" 44 #include "opto/compile.hpp" 45 #include "opto/intrinsicnode.hpp" 46 #include "opto/node.hpp" 47 #include "runtime/biasedLocking.hpp" 48 #include "runtime/icache.hpp" 49 #include "runtime/interfaceSupport.inline.hpp" 50 #include "runtime/jniHandles.inline.hpp" 51 #include "runtime/sharedRuntime.hpp" 52 #include "runtime/thread.hpp" 53 54 #ifdef PRODUCT 55 #define BLOCK_COMMENT(str) /* nothing */ 56 #define STOP(error) stop(error) 57 #else 58 #define BLOCK_COMMENT(str) block_comment(str) 59 #define STOP(error) block_comment(error); stop(error) 60 #endif 61 62 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 63 64 // Patch any kind of instruction; there may be several instructions. 65 // Return the total length (in bytes) of the instructions. 
66 int MacroAssembler::pd_patch_instruction_size(address branch, address target) { 67 int instructions = 1; 68 assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant"); 69 long offset = (target - branch) >> 2; 70 unsigned insn = *(unsigned*)branch; 71 if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) { 72 // Load register (literal) 73 Instruction_aarch64::spatch(branch, 23, 5, offset); 74 } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) { 75 // Unconditional branch (immediate) 76 Instruction_aarch64::spatch(branch, 25, 0, offset); 77 } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) { 78 // Conditional branch (immediate) 79 Instruction_aarch64::spatch(branch, 23, 5, offset); 80 } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) { 81 // Compare & branch (immediate) 82 Instruction_aarch64::spatch(branch, 23, 5, offset); 83 } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) { 84 // Test & branch (immediate) 85 Instruction_aarch64::spatch(branch, 18, 5, offset); 86 } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) { 87 // PC-rel. addressing 88 offset = target-branch; 89 int shift = Instruction_aarch64::extract(insn, 31, 31); 90 if (shift) { 91 u_int64_t dest = (u_int64_t)target; 92 uint64_t pc_page = (uint64_t)branch >> 12; 93 uint64_t adr_page = (uint64_t)target >> 12; 94 unsigned offset_lo = dest & 0xfff; 95 offset = adr_page - pc_page; 96 97 // We handle 4 types of PC relative addressing 98 // 1 - adrp Rx, target_page 99 // ldr/str Ry, [Rx, #offset_in_page] 100 // 2 - adrp Rx, target_page 101 // add Ry, Rx, #offset_in_page 102 // 3 - adrp Rx, target_page (page aligned reloc, offset == 0) 103 // movk Rx, #imm16<<32 104 // 4 - adrp Rx, target_page (page aligned reloc, offset == 0) 105 // In the first 3 cases we must check that Rx is the same in the adrp and the 106 // subsequent ldr/str, add or movk instruction. 
Otherwise we could accidentally end 107 // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened 108 // to be followed by a random unrelated ldr/str, add or movk instruction. 109 // 110 unsigned insn2 = ((unsigned*)branch)[1]; 111 if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 && 112 Instruction_aarch64::extract(insn, 4, 0) == 113 Instruction_aarch64::extract(insn2, 9, 5)) { 114 // Load/store register (unsigned immediate) 115 unsigned size = Instruction_aarch64::extract(insn2, 31, 30); 116 Instruction_aarch64::patch(branch + sizeof (unsigned), 117 21, 10, offset_lo >> size); 118 guarantee(((dest >> size) << size) == dest, "misaligned target"); 119 instructions = 2; 120 } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 && 121 Instruction_aarch64::extract(insn, 4, 0) == 122 Instruction_aarch64::extract(insn2, 4, 0)) { 123 // add (immediate) 124 Instruction_aarch64::patch(branch + sizeof (unsigned), 125 21, 10, offset_lo); 126 instructions = 2; 127 } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 && 128 Instruction_aarch64::extract(insn, 4, 0) == 129 Instruction_aarch64::extract(insn2, 4, 0)) { 130 // movk #imm16<<32 131 Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32); 132 long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L); 133 long pc_page = (long)branch >> 12; 134 long adr_page = (long)dest >> 12; 135 offset = adr_page - pc_page; 136 instructions = 2; 137 } 138 } 139 int offset_lo = offset & 3; 140 offset >>= 2; 141 Instruction_aarch64::spatch(branch, 23, 5, offset); 142 Instruction_aarch64::patch(branch, 30, 29, offset_lo); 143 } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) { 144 u_int64_t dest = (u_int64_t)target; 145 // Move wide constant 146 assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch"); 147 assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch"); 148 
Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff); 149 Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff); 150 Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff); 151 assert(target_addr_for_insn(branch) == target, "should be"); 152 instructions = 3; 153 } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 && 154 Instruction_aarch64::extract(insn, 4, 0) == 0b11111) { 155 // nothing to do 156 assert(target == 0, "did not expect to relocate target for polling page load"); 157 } else { 158 ShouldNotReachHere(); 159 } 160 return instructions * NativeInstruction::instruction_size; 161 } 162 163 int MacroAssembler::patch_oop(address insn_addr, address o) { 164 int instructions; 165 unsigned insn = *(unsigned*)insn_addr; 166 assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch"); 167 168 // OOPs are either narrow (32 bits) or wide (48 bits). We encode 169 // narrow OOPs by setting the upper 16 bits in the first 170 // instruction. 171 if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) { 172 // Move narrow OOP 173 narrowOop n = CompressedOops::encode((oop)o); 174 Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16); 175 Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff); 176 instructions = 2; 177 } else { 178 // Move wide OOP 179 assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch"); 180 uintptr_t dest = (uintptr_t)o; 181 Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff); 182 Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff); 183 Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff); 184 instructions = 3; 185 } 186 return instructions * NativeInstruction::instruction_size; 187 } 188 189 int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) { 190 // Metatdata pointers are either narrow (32 bits) or wide (48 bits). 
191 // We encode narrow ones by setting the upper 16 bits in the first 192 // instruction. 193 NativeInstruction *insn = nativeInstruction_at(insn_addr); 194 assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 && 195 nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch"); 196 197 Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16); 198 Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff); 199 return 2 * NativeInstruction::instruction_size; 200 } 201 202 address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) { 203 long offset = 0; 204 if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) { 205 // Load register (literal) 206 offset = Instruction_aarch64::sextract(insn, 23, 5); 207 return address(((uint64_t)insn_addr + (offset << 2))); 208 } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) { 209 // Unconditional branch (immediate) 210 offset = Instruction_aarch64::sextract(insn, 25, 0); 211 } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) { 212 // Conditional branch (immediate) 213 offset = Instruction_aarch64::sextract(insn, 23, 5); 214 } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) { 215 // Compare & branch (immediate) 216 offset = Instruction_aarch64::sextract(insn, 23, 5); 217 } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) { 218 // Test & branch (immediate) 219 offset = Instruction_aarch64::sextract(insn, 18, 5); 220 } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) { 221 // PC-rel. addressing 222 offset = Instruction_aarch64::extract(insn, 30, 29); 223 offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2; 224 int shift = Instruction_aarch64::extract(insn, 31, 31) ? 
12 : 0; 225 if (shift) { 226 offset <<= shift; 227 uint64_t target_page = ((uint64_t)insn_addr) + offset; 228 target_page &= ((uint64_t)-1) << shift; 229 // Return the target address for the following sequences 230 // 1 - adrp Rx, target_page 231 // ldr/str Ry, [Rx, #offset_in_page] 232 // 2 - adrp Rx, target_page 233 // add Ry, Rx, #offset_in_page 234 // 3 - adrp Rx, target_page (page aligned reloc, offset == 0) 235 // movk Rx, #imm12<<32 236 // 4 - adrp Rx, target_page (page aligned reloc, offset == 0) 237 // 238 // In the first two cases we check that the register is the same and 239 // return the target_page + the offset within the page. 240 // Otherwise we assume it is a page aligned relocation and return 241 // the target page only. 242 // 243 unsigned insn2 = ((unsigned*)insn_addr)[1]; 244 if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 && 245 Instruction_aarch64::extract(insn, 4, 0) == 246 Instruction_aarch64::extract(insn2, 9, 5)) { 247 // Load/store register (unsigned immediate) 248 unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10); 249 unsigned int size = Instruction_aarch64::extract(insn2, 31, 30); 250 return address(target_page + (byte_offset << size)); 251 } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 && 252 Instruction_aarch64::extract(insn, 4, 0) == 253 Instruction_aarch64::extract(insn2, 4, 0)) { 254 // add (immediate) 255 unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10); 256 return address(target_page + byte_offset); 257 } else { 258 if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 && 259 Instruction_aarch64::extract(insn, 4, 0) == 260 Instruction_aarch64::extract(insn2, 4, 0)) { 261 target_page = (target_page & 0xffffffff) | 262 ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32); 263 } 264 return (address)target_page; 265 } 266 } else { 267 ShouldNotReachHere(); 268 } 269 } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) { 
270 u_int32_t *insns = (u_int32_t *)insn_addr; 271 // Move wide constant: movz, movk, movk. See movptr(). 272 assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch"); 273 assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch"); 274 return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5)) 275 + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16) 276 + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32)); 277 } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 && 278 Instruction_aarch64::extract(insn, 4, 0) == 0b11111) { 279 return 0; 280 } else { 281 ShouldNotReachHere(); 282 } 283 return address(((uint64_t)insn_addr + (offset << 2))); 284 } 285 286 void MacroAssembler::serialize_memory(Register thread, Register tmp) { 287 dsb(Assembler::SY); 288 } 289 290 void MacroAssembler::safepoint_poll(Label& slow_path) { 291 if (SafepointMechanism::uses_thread_local_poll()) { 292 ldr(rscratch1, Address(rthread, Thread::polling_page_offset())); 293 tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path); 294 } else { 295 unsigned long offset; 296 adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset); 297 ldrw(rscratch1, Address(rscratch1, offset)); 298 assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code"); 299 cbnz(rscratch1, slow_path); 300 } 301 } 302 303 // Just like safepoint_poll, but use an acquiring load for thread- 304 // local polling. 305 // 306 // We need an acquire here to ensure that any subsequent load of the 307 // global SafepointSynchronize::_state flag is ordered after this load 308 // of the local Thread::_polling page. We don't want this poll to 309 // return false (i.e. not safepointing) and a later poll of the global 310 // SafepointSynchronize::_state spuriously to return true. 
311 // 312 // This is to avoid a race when we're in a native->Java transition 313 // racing the code which wakes up from a safepoint. 314 // 315 void MacroAssembler::safepoint_poll_acquire(Label& slow_path) { 316 if (SafepointMechanism::uses_thread_local_poll()) { 317 lea(rscratch1, Address(rthread, Thread::polling_page_offset())); 318 ldar(rscratch1, rscratch1); 319 tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path); 320 } else { 321 safepoint_poll(slow_path); 322 } 323 } 324 325 void MacroAssembler::reset_last_Java_frame(bool clear_fp) { 326 // we must set sp to zero to clear frame 327 str(zr, Address(rthread, JavaThread::last_Java_sp_offset())); 328 329 // must clear fp, so that compiled frames are not confused; it is 330 // possible that we need it only for debugging 331 if (clear_fp) { 332 str(zr, Address(rthread, JavaThread::last_Java_fp_offset())); 333 } 334 335 // Always clear the pc because it could have been set by make_walkable() 336 str(zr, Address(rthread, JavaThread::last_Java_pc_offset())); 337 } 338 339 // Calls to C land 340 // 341 // When entering C land, the rfp, & resp of the last Java frame have to be recorded 342 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp 343 // has to be reset to 0. This is required to allow proper stack traversal. 
344 void MacroAssembler::set_last_Java_frame(Register last_java_sp, 345 Register last_java_fp, 346 Register last_java_pc, 347 Register scratch) { 348 349 if (last_java_pc->is_valid()) { 350 str(last_java_pc, Address(rthread, 351 JavaThread::frame_anchor_offset() 352 + JavaFrameAnchor::last_Java_pc_offset())); 353 } 354 355 // determine last_java_sp register 356 if (last_java_sp == sp) { 357 mov(scratch, sp); 358 last_java_sp = scratch; 359 } else if (!last_java_sp->is_valid()) { 360 last_java_sp = esp; 361 } 362 363 str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset())); 364 365 // last_java_fp is optional 366 if (last_java_fp->is_valid()) { 367 str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset())); 368 } 369 } 370 371 void MacroAssembler::set_last_Java_frame(Register last_java_sp, 372 Register last_java_fp, 373 address last_java_pc, 374 Register scratch) { 375 if (last_java_pc != NULL) { 376 adr(scratch, last_java_pc); 377 } else { 378 // FIXME: This is almost never correct. We should delete all 379 // cases of set_last_Java_frame with last_java_pc=NULL and use the 380 // correct return address instead. 
381 adr(scratch, pc()); 382 } 383 384 str(scratch, Address(rthread, 385 JavaThread::frame_anchor_offset() 386 + JavaFrameAnchor::last_Java_pc_offset())); 387 388 set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch); 389 } 390 391 void MacroAssembler::set_last_Java_frame(Register last_java_sp, 392 Register last_java_fp, 393 Label &L, 394 Register scratch) { 395 if (L.is_bound()) { 396 set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch); 397 } else { 398 InstructionMark im(this); 399 L.add_patch_at(code(), locator()); 400 set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch); 401 } 402 } 403 404 void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) { 405 assert(ReservedCodeCacheSize < 4*G, "branch out of range"); 406 assert(CodeCache::find_blob(entry.target()) != NULL, 407 "destination of far call not found in code cache"); 408 if (far_branches()) { 409 unsigned long offset; 410 // We can use ADRP here because we know that the total size of 411 // the code cache cannot exceed 2Gb. 412 adrp(tmp, entry, offset); 413 add(tmp, tmp, offset); 414 if (cbuf) cbuf->set_insts_mark(); 415 blr(tmp); 416 } else { 417 if (cbuf) cbuf->set_insts_mark(); 418 bl(entry); 419 } 420 } 421 422 void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) { 423 assert(ReservedCodeCacheSize < 4*G, "branch out of range"); 424 assert(CodeCache::find_blob(entry.target()) != NULL, 425 "destination of far call not found in code cache"); 426 if (far_branches()) { 427 unsigned long offset; 428 // We can use ADRP here because we know that the total size of 429 // the code cache cannot exceed 2Gb. 
430 adrp(tmp, entry, offset); 431 add(tmp, tmp, offset); 432 if (cbuf) cbuf->set_insts_mark(); 433 br(tmp); 434 } else { 435 if (cbuf) cbuf->set_insts_mark(); 436 b(entry); 437 } 438 } 439 440 void MacroAssembler::reserved_stack_check() { 441 // testing if reserved zone needs to be enabled 442 Label no_reserved_zone_enabling; 443 444 ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset())); 445 cmp(sp, rscratch1); 446 br(Assembler::LO, no_reserved_zone_enabling); 447 448 enter(); // LR and FP are live. 449 lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone)); 450 mov(c_rarg0, rthread); 451 blr(rscratch1); 452 leave(); 453 454 // We have already removed our own frame. 455 // throw_delayed_StackOverflowError will think that it's been 456 // called by our caller. 457 lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry())); 458 br(rscratch1); 459 should_not_reach_here(); 460 461 bind(no_reserved_zone_enabling); 462 } 463 464 int MacroAssembler::biased_locking_enter(Register lock_reg, 465 Register obj_reg, 466 Register swap_reg, 467 Register tmp_reg, 468 bool swap_reg_contains_mark, 469 Label& done, 470 Label* slow_case, 471 BiasedLockingCounters* counters) { 472 assert(UseBiasedLocking, "why call this otherwise?"); 473 assert_different_registers(lock_reg, obj_reg, swap_reg); 474 475 if (PrintBiasedLockingStatistics && counters == NULL) 476 counters = BiasedLocking::counters(); 477 478 assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg); 479 assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout"); 480 Address mark_addr (obj_reg, oopDesc::mark_offset_in_bytes()); 481 Address klass_addr (obj_reg, oopDesc::klass_offset_in_bytes()); 482 Address saved_mark_addr(lock_reg, 0); 483 484 // Biased locking 485 // See whether the lock is currently biased toward our 
thread and 486 // whether the epoch is still valid 487 // Note that the runtime guarantees sufficient alignment of JavaThread 488 // pointers to allow age to be placed into low bits 489 // First check to see whether biasing is even enabled for this object 490 Label cas_label; 491 int null_check_offset = -1; 492 if (!swap_reg_contains_mark) { 493 null_check_offset = offset(); 494 ldr(swap_reg, mark_addr); 495 } 496 andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place); 497 cmp(tmp_reg, markOopDesc::biased_lock_pattern); 498 br(Assembler::NE, cas_label); 499 // The bias pattern is present in the object's header. Need to check 500 // whether the bias owner and the epoch are both still current. 501 load_prototype_header(tmp_reg, obj_reg); 502 orr(tmp_reg, tmp_reg, rthread); 503 eor(tmp_reg, swap_reg, tmp_reg); 504 andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place)); 505 if (counters != NULL) { 506 Label around; 507 cbnz(tmp_reg, around); 508 atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2); 509 b(done); 510 bind(around); 511 } else { 512 cbz(tmp_reg, done); 513 } 514 515 Label try_revoke_bias; 516 Label try_rebias; 517 518 // At this point we know that the header has the bias pattern and 519 // that we are not the bias owner in the current epoch. We need to 520 // figure out more details about the state of the header in order to 521 // know what operations can be legally performed on the object's 522 // header. 523 524 // If the low three bits in the xor result aren't clear, that means 525 // the prototype header is no longer biased and we have to revoke 526 // the bias on this object. 527 andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place); 528 cbnz(rscratch1, try_revoke_bias); 529 530 // Biasing is still enabled for this data type. 
See whether the 531 // epoch of the current bias is still valid, meaning that the epoch 532 // bits of the mark word are equal to the epoch bits of the 533 // prototype header. (Note that the prototype header's epoch bits 534 // only change at a safepoint.) If not, attempt to rebias the object 535 // toward the current thread. Note that we must be absolutely sure 536 // that the current epoch is invalid in order to do this because 537 // otherwise the manipulations it performs on the mark word are 538 // illegal. 539 andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place); 540 cbnz(rscratch1, try_rebias); 541 542 // The epoch of the current bias is still valid but we know nothing 543 // about the owner; it might be set or it might be clear. Try to 544 // acquire the bias of the object using an atomic operation. If this 545 // fails we will go in to the runtime to revoke the object's bias. 546 // Note that we first construct the presumed unbiased header so we 547 // don't accidentally blow away another thread's valid bias. 548 { 549 Label here; 550 mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place); 551 andr(swap_reg, swap_reg, rscratch1); 552 orr(tmp_reg, swap_reg, rthread); 553 cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case); 554 // If the biasing toward our thread failed, this means that 555 // another thread succeeded in biasing it toward itself and we 556 // need to revoke that bias. The revocation will occur in the 557 // interpreter runtime in the slow case. 558 bind(here); 559 if (counters != NULL) { 560 atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()), 561 tmp_reg, rscratch1, rscratch2); 562 } 563 } 564 b(done); 565 566 bind(try_rebias); 567 // At this point we know the epoch has expired, meaning that the 568 // current "bias owner", if any, is actually invalid. 
Under these 569 // circumstances _only_, we are allowed to use the current header's 570 // value as the comparison value when doing the cas to acquire the 571 // bias in the current epoch. In other words, we allow transfer of 572 // the bias from one thread to another directly in this situation. 573 // 574 // FIXME: due to a lack of registers we currently blow away the age 575 // bits in this situation. Should attempt to preserve them. 576 { 577 Label here; 578 load_prototype_header(tmp_reg, obj_reg); 579 orr(tmp_reg, rthread, tmp_reg); 580 cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case); 581 // If the biasing toward our thread failed, then another thread 582 // succeeded in biasing it toward itself and we need to revoke that 583 // bias. The revocation will occur in the runtime in the slow case. 584 bind(here); 585 if (counters != NULL) { 586 atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()), 587 tmp_reg, rscratch1, rscratch2); 588 } 589 } 590 b(done); 591 592 bind(try_revoke_bias); 593 // The prototype mark in the klass doesn't have the bias bit set any 594 // more, indicating that objects of this data type are not supposed 595 // to be biased any more. We are going to try to reset the mark of 596 // this object to the prototype value and fall through to the 597 // CAS-based locking scheme. Note that if our CAS fails, it means 598 // that another thread raced us for the privilege of revoking the 599 // bias of this particular object, so it's okay to continue in the 600 // normal locking code. 601 // 602 // FIXME: due to a lack of registers we currently blow away the age 603 // bits in this situation. Should attempt to preserve them. 
604 { 605 Label here, nope; 606 load_prototype_header(tmp_reg, obj_reg); 607 cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope); 608 bind(here); 609 610 // Fall through to the normal CAS-based lock, because no matter what 611 // the result of the above CAS, some thread must have succeeded in 612 // removing the bias bit from the object's header. 613 if (counters != NULL) { 614 atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg, 615 rscratch1, rscratch2); 616 } 617 bind(nope); 618 } 619 620 bind(cas_label); 621 622 return null_check_offset; 623 } 624 625 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) { 626 assert(UseBiasedLocking, "why call this otherwise?"); 627 628 // Check for biased locking unlock case, which is a no-op 629 // Note: we do not have to check the thread ID for two reasons. 630 // First, the interpreter checks for IllegalMonitorStateException at 631 // a higher level. Second, if the bias was revoked while we held the 632 // lock, the object could not be rebiased toward another thread, so 633 // the bias bit would be clear. 
634 ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 635 andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 636 cmp(temp_reg, markOopDesc::biased_lock_pattern); 637 br(Assembler::EQ, done); 638 } 639 640 static void pass_arg0(MacroAssembler* masm, Register arg) { 641 if (c_rarg0 != arg ) { 642 masm->mov(c_rarg0, arg); 643 } 644 } 645 646 static void pass_arg1(MacroAssembler* masm, Register arg) { 647 if (c_rarg1 != arg ) { 648 masm->mov(c_rarg1, arg); 649 } 650 } 651 652 static void pass_arg2(MacroAssembler* masm, Register arg) { 653 if (c_rarg2 != arg ) { 654 masm->mov(c_rarg2, arg); 655 } 656 } 657 658 static void pass_arg3(MacroAssembler* masm, Register arg) { 659 if (c_rarg3 != arg ) { 660 masm->mov(c_rarg3, arg); 661 } 662 } 663 664 void MacroAssembler::call_VM_base(Register oop_result, 665 Register java_thread, 666 Register last_java_sp, 667 address entry_point, 668 int number_of_arguments, 669 bool check_exceptions) { 670 // determine java_thread register 671 if (!java_thread->is_valid()) { 672 java_thread = rthread; 673 } 674 675 // determine last_java_sp register 676 if (!last_java_sp->is_valid()) { 677 last_java_sp = esp; 678 } 679 680 // debugging support 681 assert(number_of_arguments >= 0 , "cannot have negative number of arguments"); 682 assert(java_thread == rthread, "unexpected register"); 683 #ifdef ASSERT 684 // TraceBytecodes does not use r12 but saves it over the call, so don't verify 685 // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?"); 686 #endif // ASSERT 687 688 assert(java_thread != oop_result , "cannot use the same register for java_thread & oop_result"); 689 assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp"); 690 691 // push java thread (becomes first argument of C function) 692 693 mov(c_rarg0, java_thread); 694 695 // set last Java frame before call 696 assert(last_java_sp != 
rfp, "can't use rfp"); 697 698 Label l; 699 set_last_Java_frame(last_java_sp, rfp, l, rscratch1); 700 701 // do the call, remove parameters 702 MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l); 703 704 // reset last Java frame 705 // Only interpreter should have to clear fp 706 reset_last_Java_frame(true); 707 708 // C++ interp handles this in the interpreter 709 check_and_handle_popframe(java_thread); 710 check_and_handle_earlyret(java_thread); 711 712 if (check_exceptions) { 713 // check for pending exceptions (java_thread is set upon return) 714 ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset()))); 715 Label ok; 716 cbz(rscratch1, ok); 717 lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry())); 718 br(rscratch1); 719 bind(ok); 720 } 721 722 // get oop result if there is one and reset the value in the thread 723 if (oop_result->is_valid()) { 724 get_vm_result(oop_result, java_thread); 725 } 726 } 727 728 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) { 729 call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions); 730 } 731 732 // Maybe emit a call via a trampoline. If the code cache is small 733 // trampolines won't be emitted. 

// Emit a call to entry.target() that carries entry's relocation.
//
// When far_branches() is true a trampoline stub is emitted first (unless we
// are only sizing code in a scratch buffer) and the call site itself is
// emitted as a bl to its own pc; otherwise a direct bl to the target is
// emitted. If cbuf is non-NULL its instruction mark is set immediately
// before the call is emitted.
//
// Returns NULL when the trampoline stub could not be emitted, otherwise a
// non-NULL address (the pc following the emitted bl).
address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  // Remember where the call instruction will be so the stub can refer to it.
  unsigned int start_offset = offset();
  if (far_branches() && !Compile::current()->in_scratch_emit_size()) {
    address stub = emit_trampoline_stub(start_offset, entry.target());
    if (stub == NULL) {
      return NULL; // CodeCache is full
    }
  }

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
  if (!far_branches()) {
    bl(entry.target());
  } else {
    // Placeholder destination: branch-to-self.
    // NOTE(review): presumably redirected to the target or the trampoline
    // when the call is patched -- confirm against the relocation code.
    bl(pc());
  }
  // just need to return a non-null address
  return pc();
}


// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)
//
// insts_call_instruction_offset is the offset of the call instruction from
// the start of the instructions section; dest is the 64-bit call target
// stored in the stub. Returns the address of the first stub instruction, or
// NULL if start_a_stub() failed.

address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  address stub = start_a_stub(Compile::MAX_stubs_size/2);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call
  // - call
  Label target;
  ldr(rscratch1, target);   // pc-relative load of the 64-bit word emitted below
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}

// Emit an inline-cache call to entry.
//
// A virtual_call relocation is created for the current pc (i.e. the pc
// before the movptr below is emitted), rscratch2 is loaded with
// Universe::non_oop_word(), and the call itself is emitted through
// trampoline_call(). Returns whatever trampoline_call() returns.
address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions
//
// The overloads without last_java_sp forward to call_VM_helper(); the ones
// with last_java_sp forward to call_VM_base() using rthread. Register
// arguments are handed to pass_argN() highest-numbered first, and the
// "smashed arg" asserts reject a lower-numbered source that lives in a
// c_rarg register that is passed earlier in that sequence.

// No register arguments.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

// One register argument.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

// Two register arguments.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

// Three register arguments.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

// Explicit last_java_sp; the caller has already placed number_of_arguments
// arguments.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

// Explicit last_java_sp, one register argument.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

// Explicit last_java_sp, two register arguments.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

// Explicit last_java_sp, three register arguments.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


// Load the thread's vm_result field into oop_result, clear the field in the
// thread, and verify the loaded oop (verify_oop is a no-op unless VerifyOops).
void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

// Load the thread's vm_result_2 field into metadata_result and clear the
// field in the thread.
void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

// Emit nops until the current code offset is a multiple of modulus.
void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


// Produce (*delayed_value_addr + offset) either as a constant or in tmp.
//
// If the word at delayed_value_addr is already non-zero while code is being
// generated, the sum is returned as a constant and nothing is emitted.
// Otherwise code is emitted that loads the word at run time into tmp and
// adds offset, and tmp is returned.
RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}


// Forward a notification to Assembler::notify(). Both branches currently do
// the same thing: the frame set-up/tear-down that used to surround the
// bytecode_start case is commented out.
void MacroAssembler:: notify(int type) {
  if (type == bytecode_start) {
    // set_last_Java_frame(esp, rfp, (address)NULL);
    Assembler:: notify(type);
    // reset_last_Java_frame(true);
  }
  else
    Assembler:: notify(type);
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
961 void MacroAssembler::lookup_interface_method(Register recv_klass, 962 Register intf_klass, 963 RegisterOrConstant itable_index, 964 Register method_result, 965 Register scan_temp, 966 Label& L_no_such_interface, 967 bool return_method) { 968 assert_different_registers(recv_klass, intf_klass, scan_temp); 969 assert_different_registers(method_result, intf_klass, scan_temp); 970 assert(recv_klass != method_result || !return_method, 971 "recv_klass can be destroyed when method isn't needed"); 972 assert(itable_index.is_constant() || itable_index.as_register() == method_result, 973 "caller must use same register for non-constant itable index as for method"); 974 975 // Compute start of first itableOffsetEntry (which is at the end of the vtable) 976 int vtable_base = in_bytes(Klass::vtable_start_offset()); 977 int itentry_off = itableMethodEntry::method_offset_in_bytes(); 978 int scan_step = itableOffsetEntry::size() * wordSize; 979 int vte_size = vtableEntry::size_in_bytes(); 980 assert(vte_size == wordSize, "else adjust times_vte_scale"); 981 982 ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset())); 983 984 // %%% Could store the aligned, prescaled offset in the klassoop. 985 // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base)); 986 lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3))); 987 add(scan_temp, scan_temp, vtable_base); 988 989 if (return_method) { 990 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 
991 assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 992 // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off)); 993 lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3))); 994 if (itentry_off) 995 add(recv_klass, recv_klass, itentry_off); 996 } 997 998 // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) { 999 // if (scan->interface() == intf) { 1000 // result = (klass + scan->offset() + itable_index); 1001 // } 1002 // } 1003 Label search, found_method; 1004 1005 for (int peel = 1; peel >= 0; peel--) { 1006 ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes())); 1007 cmp(intf_klass, method_result); 1008 1009 if (peel) { 1010 br(Assembler::EQ, found_method); 1011 } else { 1012 br(Assembler::NE, search); 1013 // (invert the test to fall through to found_method...) 1014 } 1015 1016 if (!peel) break; 1017 1018 bind(search); 1019 1020 // Check that the previous entry is non-null. A null entry means that 1021 // the receiver class doesn't implement the interface, and wasn't the 1022 // same as when the caller was compiled. 1023 cbz(method_result, L_no_such_interface); 1024 add(scan_temp, scan_temp, scan_step); 1025 } 1026 1027 bind(found_method); 1028 1029 // Got a hit. 
1030 if (return_method) { 1031 ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes())); 1032 ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0))); 1033 } 1034 } 1035 1036 // virtual method calling 1037 void MacroAssembler::lookup_virtual_method(Register recv_klass, 1038 RegisterOrConstant vtable_index, 1039 Register method_result) { 1040 const int base = in_bytes(Klass::vtable_start_offset()); 1041 assert(vtableEntry::size() * wordSize == 8, 1042 "adjust the scaling in the code below"); 1043 int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes(); 1044 1045 if (vtable_index.is_register()) { 1046 lea(method_result, Address(recv_klass, 1047 vtable_index.as_register(), 1048 Address::lsl(LogBytesPerWord))); 1049 ldr(method_result, Address(method_result, vtable_offset_in_bytes)); 1050 } else { 1051 vtable_offset_in_bytes += vtable_index.as_constant() * wordSize; 1052 ldr(method_result, 1053 form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0)); 1054 } 1055 } 1056 1057 void MacroAssembler::check_klass_subtype(Register sub_klass, 1058 Register super_klass, 1059 Register temp_reg, 1060 Label& L_success) { 1061 Label L_failure; 1062 check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL); 1063 check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL); 1064 bind(L_failure); 1065 } 1066 1067 1068 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 1069 Register super_klass, 1070 Register temp_reg, 1071 Label* L_success, 1072 Label* L_failure, 1073 Label* L_slow_path, 1074 RegisterOrConstant super_check_offset) { 1075 assert_different_registers(sub_klass, super_klass, temp_reg); 1076 bool must_load_sco = (super_check_offset.constant_or_zero() == -1); 1077 if (super_check_offset.is_register()) { 1078 assert_different_registers(sub_klass, super_klass, 1079 super_check_offset.as_register()); 1080 } else if (must_load_sco) { 
1081 assert(temp_reg != noreg, "supply either a temp or a register offset"); 1082 } 1083 1084 Label L_fallthrough; 1085 int label_nulls = 0; 1086 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 1087 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 1088 if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; } 1089 assert(label_nulls <= 1, "at most one NULL in the batch"); 1090 1091 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1092 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1093 Address super_check_offset_addr(super_klass, sco_offset); 1094 1095 // Hacked jmp, which may only be used just before L_fallthrough. 1096 #define final_jmp(label) \ 1097 if (&(label) == &L_fallthrough) { /*do nothing*/ } \ 1098 else b(label) /*omit semi*/ 1099 1100 // If the pointers are equal, we are done (e.g., String[] elements). 1101 // This self-check enables sharing of secondary supertype arrays among 1102 // non-primary types such as array-of-interface. Otherwise, each such 1103 // type would need its own customized SSA. 1104 // We move this check to the front of the fast path because many 1105 // type checks are in fact trivially successful in this manner, 1106 // so we get a nicely predicted branch right at the start of the check. 1107 cmp(sub_klass, super_klass); 1108 br(Assembler::EQ, *L_success); 1109 1110 // Check the supertype display: 1111 if (must_load_sco) { 1112 ldrw(temp_reg, super_check_offset_addr); 1113 super_check_offset = RegisterOrConstant(temp_reg); 1114 } 1115 Address super_check_addr(sub_klass, super_check_offset); 1116 ldr(rscratch1, super_check_addr); 1117 cmp(super_klass, rscratch1); // load displayed supertype 1118 1119 // This check has worked decisively for primary supers. 1120 // Secondary supers are sought in the super_cache ('super_cache_addr'). 1121 // (Secondary supers are interfaces and very deeply nested subtypes.) 
1122 // This works in the same check above because of a tricky aliasing 1123 // between the super_cache and the primary super display elements. 1124 // (The 'super_check_addr' can address either, as the case requires.) 1125 // Note that the cache is updated below if it does not help us find 1126 // what we need immediately. 1127 // So if it was a primary super, we can just fail immediately. 1128 // Otherwise, it's the slow path for us (no success at this point). 1129 1130 if (super_check_offset.is_register()) { 1131 br(Assembler::EQ, *L_success); 1132 cmp(super_check_offset.as_register(), sc_offset); 1133 if (L_failure == &L_fallthrough) { 1134 br(Assembler::EQ, *L_slow_path); 1135 } else { 1136 br(Assembler::NE, *L_failure); 1137 final_jmp(*L_slow_path); 1138 } 1139 } else if (super_check_offset.as_constant() == sc_offset) { 1140 // Need a slow path; fast failure is impossible. 1141 if (L_slow_path == &L_fallthrough) { 1142 br(Assembler::EQ, *L_success); 1143 } else { 1144 br(Assembler::NE, *L_slow_path); 1145 final_jmp(*L_success); 1146 } 1147 } else { 1148 // No slow path; it's a fast decision. 
1149 if (L_failure == &L_fallthrough) { 1150 br(Assembler::EQ, *L_success); 1151 } else { 1152 br(Assembler::NE, *L_failure); 1153 final_jmp(*L_success); 1154 } 1155 } 1156 1157 bind(L_fallthrough); 1158 1159 #undef final_jmp 1160 } 1161 1162 // These two are taken from x86, but they look generally useful 1163 1164 // scans count pointer sized words at [addr] for occurence of value, 1165 // generic 1166 void MacroAssembler::repne_scan(Register addr, Register value, Register count, 1167 Register scratch) { 1168 Label Lloop, Lexit; 1169 cbz(count, Lexit); 1170 bind(Lloop); 1171 ldr(scratch, post(addr, wordSize)); 1172 cmp(value, scratch); 1173 br(EQ, Lexit); 1174 sub(count, count, 1); 1175 cbnz(count, Lloop); 1176 bind(Lexit); 1177 } 1178 1179 // scans count 4 byte words at [addr] for occurence of value, 1180 // generic 1181 void MacroAssembler::repne_scanw(Register addr, Register value, Register count, 1182 Register scratch) { 1183 Label Lloop, Lexit; 1184 cbz(count, Lexit); 1185 bind(Lloop); 1186 ldrw(scratch, post(addr, wordSize)); 1187 cmpw(value, scratch); 1188 br(EQ, Lexit); 1189 sub(count, count, 1); 1190 cbnz(count, Lloop); 1191 bind(Lexit); 1192 } 1193 1194 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 1195 Register super_klass, 1196 Register temp_reg, 1197 Register temp2_reg, 1198 Label* L_success, 1199 Label* L_failure, 1200 bool set_cond_codes) { 1201 assert_different_registers(sub_klass, super_klass, temp_reg); 1202 if (temp2_reg != noreg) 1203 assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1); 1204 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg) 1205 1206 Label L_fallthrough; 1207 int label_nulls = 0; 1208 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 1209 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 1210 assert(label_nulls <= 1, "at most one NULL in the batch"); 1211 1212 // a couple of useful fields in sub_klass: 1213 int 
ss_offset = in_bytes(Klass::secondary_supers_offset()); 1214 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1215 Address secondary_supers_addr(sub_klass, ss_offset); 1216 Address super_cache_addr( sub_klass, sc_offset); 1217 1218 BLOCK_COMMENT("check_klass_subtype_slow_path"); 1219 1220 // Do a linear scan of the secondary super-klass chain. 1221 // This code is rarely used, so simplicity is a virtue here. 1222 // The repne_scan instruction uses fixed registers, which we must spill. 1223 // Don't worry too much about pre-existing connections with the input regs. 1224 1225 assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super) 1226 assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter) 1227 1228 RegSet pushed_registers; 1229 if (!IS_A_TEMP(r2)) pushed_registers += r2; 1230 if (!IS_A_TEMP(r5)) pushed_registers += r5; 1231 1232 if (super_klass != r0 || UseCompressedOops) { 1233 if (!IS_A_TEMP(r0)) pushed_registers += r0; 1234 } 1235 1236 push(pushed_registers, sp); 1237 1238 // Get super_klass value into r0 (even if it was in r5 or r2). 1239 if (super_klass != r0) { 1240 mov(r0, super_klass); 1241 } 1242 1243 #ifndef PRODUCT 1244 mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr); 1245 Address pst_counter_addr(rscratch2); 1246 ldr(rscratch1, pst_counter_addr); 1247 add(rscratch1, rscratch1, 1); 1248 str(rscratch1, pst_counter_addr); 1249 #endif //PRODUCT 1250 1251 // We will consult the secondary-super array. 1252 ldr(r5, secondary_supers_addr); 1253 // Load the array length. 1254 ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes())); 1255 // Skip to start of data. 1256 add(r5, r5, Array<Klass*>::base_offset_in_bytes()); 1257 1258 cmp(sp, zr); // Clear Z flag; SP is never zero 1259 // Scan R2 words at [R5] for an occurrence of R0. 1260 // Set NZ/Z based on last compare. 1261 repne_scan(r5, r0, r2, rscratch1); 1262 1263 // Unspill the temp. 
registers: 1264 pop(pushed_registers, sp); 1265 1266 br(Assembler::NE, *L_failure); 1267 1268 // Success. Cache the super we found and proceed in triumph. 1269 str(super_klass, super_cache_addr); 1270 1271 if (L_success != &L_fallthrough) { 1272 b(*L_success); 1273 } 1274 1275 #undef IS_A_TEMP 1276 1277 bind(L_fallthrough); 1278 } 1279 1280 1281 void MacroAssembler::verify_oop(Register reg, const char* s) { 1282 if (!VerifyOops) return; 1283 1284 // Pass register number to verify_oop_subroutine 1285 const char* b = NULL; 1286 { 1287 ResourceMark rm; 1288 stringStream ss; 1289 ss.print("verify_oop: %s: %s", reg->name(), s); 1290 b = code_string(ss.as_string()); 1291 } 1292 BLOCK_COMMENT("verify_oop {"); 1293 1294 stp(r0, rscratch1, Address(pre(sp, -2 * wordSize))); 1295 stp(rscratch2, lr, Address(pre(sp, -2 * wordSize))); 1296 1297 mov(r0, reg); 1298 mov(rscratch1, (address)b); 1299 1300 // call indirectly to solve generation ordering problem 1301 lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address())); 1302 ldr(rscratch2, Address(rscratch2)); 1303 blr(rscratch2); 1304 1305 ldp(rscratch2, lr, Address(post(sp, 2 * wordSize))); 1306 ldp(r0, rscratch1, Address(post(sp, 2 * wordSize))); 1307 1308 BLOCK_COMMENT("} verify_oop"); 1309 } 1310 1311 void MacroAssembler::verify_oop_addr(Address addr, const char* s) { 1312 if (!VerifyOops) return; 1313 1314 const char* b = NULL; 1315 { 1316 ResourceMark rm; 1317 stringStream ss; 1318 ss.print("verify_oop_addr: %s", s); 1319 b = code_string(ss.as_string()); 1320 } 1321 BLOCK_COMMENT("verify_oop_addr {"); 1322 1323 stp(r0, rscratch1, Address(pre(sp, -2 * wordSize))); 1324 stp(rscratch2, lr, Address(pre(sp, -2 * wordSize))); 1325 1326 // addr may contain sp so we will have to adjust it based on the 1327 // pushes that we just did. 
1328 if (addr.uses(sp)) { 1329 lea(r0, addr); 1330 ldr(r0, Address(r0, 4 * wordSize)); 1331 } else { 1332 ldr(r0, addr); 1333 } 1334 mov(rscratch1, (address)b); 1335 1336 // call indirectly to solve generation ordering problem 1337 lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address())); 1338 ldr(rscratch2, Address(rscratch2)); 1339 blr(rscratch2); 1340 1341 ldp(rscratch2, lr, Address(post(sp, 2 * wordSize))); 1342 ldp(r0, rscratch1, Address(post(sp, 2 * wordSize))); 1343 1344 BLOCK_COMMENT("} verify_oop_addr"); 1345 } 1346 1347 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot, 1348 int extra_slot_offset) { 1349 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 1350 int stackElementSize = Interpreter::stackElementSize; 1351 int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0); 1352 #ifdef ASSERT 1353 int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1); 1354 assert(offset1 - offset == stackElementSize, "correct arithmetic"); 1355 #endif 1356 if (arg_slot.is_constant()) { 1357 return Address(esp, arg_slot.as_constant() * stackElementSize 1358 + offset); 1359 } else { 1360 add(rscratch1, esp, arg_slot.as_register(), 1361 ext::uxtx, exact_log2(stackElementSize)); 1362 return Address(rscratch1, offset); 1363 } 1364 } 1365 1366 void MacroAssembler::call_VM_leaf_base(address entry_point, 1367 int number_of_arguments, 1368 Label *retaddr) { 1369 call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr); 1370 } 1371 1372 void MacroAssembler::call_VM_leaf_base1(address entry_point, 1373 int number_of_gp_arguments, 1374 int number_of_fp_arguments, 1375 ret_type type, 1376 Label *retaddr) { 1377 Label E, L; 1378 1379 stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize))); 1380 1381 // We add 1 to number_of_arguments because the thread in arg0 is 1382 // not counted 1383 mov(rscratch1, entry_point); 1384 blrt(rscratch1, number_of_gp_arguments + 1, 
number_of_fp_arguments, type); 1385 if (retaddr) 1386 bind(*retaddr); 1387 1388 ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize))); 1389 maybe_isb(); 1390 } 1391 1392 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) { 1393 call_VM_leaf_base(entry_point, number_of_arguments); 1394 } 1395 1396 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) { 1397 pass_arg0(this, arg_0); 1398 call_VM_leaf_base(entry_point, 1); 1399 } 1400 1401 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) { 1402 pass_arg0(this, arg_0); 1403 pass_arg1(this, arg_1); 1404 call_VM_leaf_base(entry_point, 2); 1405 } 1406 1407 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, 1408 Register arg_1, Register arg_2) { 1409 pass_arg0(this, arg_0); 1410 pass_arg1(this, arg_1); 1411 pass_arg2(this, arg_2); 1412 call_VM_leaf_base(entry_point, 3); 1413 } 1414 1415 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) { 1416 pass_arg0(this, arg_0); 1417 MacroAssembler::call_VM_leaf_base(entry_point, 1); 1418 } 1419 1420 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) { 1421 1422 assert(arg_0 != c_rarg1, "smashed arg"); 1423 pass_arg1(this, arg_1); 1424 pass_arg0(this, arg_0); 1425 MacroAssembler::call_VM_leaf_base(entry_point, 2); 1426 } 1427 1428 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) { 1429 assert(arg_0 != c_rarg2, "smashed arg"); 1430 assert(arg_1 != c_rarg2, "smashed arg"); 1431 pass_arg2(this, arg_2); 1432 assert(arg_0 != c_rarg1, "smashed arg"); 1433 pass_arg1(this, arg_1); 1434 pass_arg0(this, arg_0); 1435 MacroAssembler::call_VM_leaf_base(entry_point, 3); 1436 } 1437 1438 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) { 1439 assert(arg_0 != c_rarg3, "smashed arg"); 1440 
assert(arg_1 != c_rarg3, "smashed arg"); 1441 assert(arg_2 != c_rarg3, "smashed arg"); 1442 pass_arg3(this, arg_3); 1443 assert(arg_0 != c_rarg2, "smashed arg"); 1444 assert(arg_1 != c_rarg2, "smashed arg"); 1445 pass_arg2(this, arg_2); 1446 assert(arg_0 != c_rarg1, "smashed arg"); 1447 pass_arg1(this, arg_1); 1448 pass_arg0(this, arg_0); 1449 MacroAssembler::call_VM_leaf_base(entry_point, 4); 1450 } 1451 1452 void MacroAssembler::null_check(Register reg, int offset) { 1453 if (needs_explicit_null_check(offset)) { 1454 // provoke OS NULL exception if reg = NULL by 1455 // accessing M[reg] w/o changing any registers 1456 // NOTE: this is plenty to provoke a segv 1457 ldr(zr, Address(reg)); 1458 } else { 1459 // nothing to do, (later) access of M[reg + offset] 1460 // will provoke OS NULL exception if reg = NULL 1461 } 1462 } 1463 1464 // MacroAssembler protected routines needed to implement 1465 // public methods 1466 1467 void MacroAssembler::mov(Register r, Address dest) { 1468 code_section()->relocate(pc(), dest.rspec()); 1469 u_int64_t imm64 = (u_int64_t)dest.target(); 1470 movptr(r, imm64); 1471 } 1472 1473 // Move a constant pointer into r. In AArch64 mode the virtual 1474 // address space is 48 bits in size, so we only need three 1475 // instructions to create a patchable instruction sequence that can 1476 // reach anywhere. 1477 void MacroAssembler::movptr(Register r, uintptr_t imm64) { 1478 #ifndef PRODUCT 1479 { 1480 char buffer[64]; 1481 snprintf(buffer, sizeof(buffer), "0x%"PRIX64, imm64); 1482 block_comment(buffer); 1483 } 1484 #endif 1485 assert(imm64 < (1ul << 48), "48-bit overflow in address constant"); 1486 movz(r, imm64 & 0xffff); 1487 imm64 >>= 16; 1488 movk(r, imm64 & 0xffff, 16); 1489 imm64 >>= 16; 1490 movk(r, imm64 & 0xffff, 32); 1491 } 1492 1493 // Macro to mov replicated immediate to vector register. 
1494 // Vd will get the following values for different arrangements in T 1495 // imm32 == hex 000000gh T8B: Vd = ghghghghghghghgh 1496 // imm32 == hex 000000gh T16B: Vd = ghghghghghghghghghghghghghghghgh 1497 // imm32 == hex 0000efgh T4H: Vd = efghefghefghefgh 1498 // imm32 == hex 0000efgh T8H: Vd = efghefghefghefghefghefghefghefgh 1499 // imm32 == hex abcdefgh T2S: Vd = abcdefghabcdefgh 1500 // imm32 == hex abcdefgh T4S: Vd = abcdefghabcdefghabcdefghabcdefgh 1501 // T1D/T2D: invalid 1502 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) { 1503 assert(T != T1D && T != T2D, "invalid arrangement"); 1504 if (T == T8B || T == T16B) { 1505 assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)"); 1506 movi(Vd, T, imm32 & 0xff, 0); 1507 return; 1508 } 1509 u_int32_t nimm32 = ~imm32; 1510 if (T == T4H || T == T8H) { 1511 assert((imm32 & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)"); 1512 imm32 &= 0xffff; 1513 nimm32 &= 0xffff; 1514 } 1515 u_int32_t x = imm32; 1516 int movi_cnt = 0; 1517 int movn_cnt = 0; 1518 while (x) { if (x & 0xff) movi_cnt++; x >>= 8; } 1519 x = nimm32; 1520 while (x) { if (x & 0xff) movn_cnt++; x >>= 8; } 1521 if (movn_cnt < movi_cnt) imm32 = nimm32; 1522 unsigned lsl = 0; 1523 while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; } 1524 if (movn_cnt < movi_cnt) 1525 mvni(Vd, T, imm32 & 0xff, lsl); 1526 else 1527 movi(Vd, T, imm32 & 0xff, lsl); 1528 imm32 >>= 8; lsl += 8; 1529 while (imm32) { 1530 while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; } 1531 if (movn_cnt < movi_cnt) 1532 bici(Vd, T, imm32 & 0xff, lsl); 1533 else 1534 orri(Vd, T, imm32 & 0xff, lsl); 1535 lsl += 8; imm32 >>= 8; 1536 } 1537 } 1538 1539 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64) 1540 { 1541 #ifndef PRODUCT 1542 { 1543 char buffer[64]; 1544 snprintf(buffer, sizeof(buffer), "0x%"PRIX64, imm64); 1545 block_comment(buffer); 1546 } 1547 #endif 1548 if 
(operand_valid_for_logical_immediate(false, imm64)) { 1549 orr(dst, zr, imm64); 1550 } else { 1551 // we can use a combination of MOVZ or MOVN with 1552 // MOVK to build up the constant 1553 u_int64_t imm_h[4]; 1554 int zero_count = 0; 1555 int neg_count = 0; 1556 int i; 1557 for (i = 0; i < 4; i++) { 1558 imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL); 1559 if (imm_h[i] == 0) { 1560 zero_count++; 1561 } else if (imm_h[i] == 0xffffL) { 1562 neg_count++; 1563 } 1564 } 1565 if (zero_count == 4) { 1566 // one MOVZ will do 1567 movz(dst, 0); 1568 } else if (neg_count == 4) { 1569 // one MOVN will do 1570 movn(dst, 0); 1571 } else if (zero_count == 3) { 1572 for (i = 0; i < 4; i++) { 1573 if (imm_h[i] != 0L) { 1574 movz(dst, (u_int32_t)imm_h[i], (i << 4)); 1575 break; 1576 } 1577 } 1578 } else if (neg_count == 3) { 1579 // one MOVN will do 1580 for (int i = 0; i < 4; i++) { 1581 if (imm_h[i] != 0xffffL) { 1582 movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4)); 1583 break; 1584 } 1585 } 1586 } else if (zero_count == 2) { 1587 // one MOVZ and one MOVK will do 1588 for (i = 0; i < 3; i++) { 1589 if (imm_h[i] != 0L) { 1590 movz(dst, (u_int32_t)imm_h[i], (i << 4)); 1591 i++; 1592 break; 1593 } 1594 } 1595 for (;i < 4; i++) { 1596 if (imm_h[i] != 0L) { 1597 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1598 } 1599 } 1600 } else if (neg_count == 2) { 1601 // one MOVN and one MOVK will do 1602 for (i = 0; i < 4; i++) { 1603 if (imm_h[i] != 0xffffL) { 1604 movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4)); 1605 i++; 1606 break; 1607 } 1608 } 1609 for (;i < 4; i++) { 1610 if (imm_h[i] != 0xffffL) { 1611 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1612 } 1613 } 1614 } else if (zero_count == 1) { 1615 // one MOVZ and two MOVKs will do 1616 for (i = 0; i < 4; i++) { 1617 if (imm_h[i] != 0L) { 1618 movz(dst, (u_int32_t)imm_h[i], (i << 4)); 1619 i++; 1620 break; 1621 } 1622 } 1623 for (;i < 4; i++) { 1624 if (imm_h[i] != 0x0L) { 1625 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1626 } 1627 } 
1628 } else if (neg_count == 1) { 1629 // one MOVN and two MOVKs will do 1630 for (i = 0; i < 4; i++) { 1631 if (imm_h[i] != 0xffffL) { 1632 movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4)); 1633 i++; 1634 break; 1635 } 1636 } 1637 for (;i < 4; i++) { 1638 if (imm_h[i] != 0xffffL) { 1639 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1640 } 1641 } 1642 } else { 1643 // use a MOVZ and 3 MOVKs (makes it easier to debug) 1644 movz(dst, (u_int32_t)imm_h[0], 0); 1645 for (i = 1; i < 4; i++) { 1646 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1647 } 1648 } 1649 } 1650 } 1651 1652 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32) 1653 { 1654 #ifndef PRODUCT 1655 { 1656 char buffer[64]; 1657 snprintf(buffer, sizeof(buffer), "0x%"PRIX32, imm32); 1658 block_comment(buffer); 1659 } 1660 #endif 1661 if (operand_valid_for_logical_immediate(true, imm32)) { 1662 orrw(dst, zr, imm32); 1663 } else { 1664 // we can use MOVZ, MOVN or two calls to MOVK to build up the 1665 // constant 1666 u_int32_t imm_h[2]; 1667 imm_h[0] = imm32 & 0xffff; 1668 imm_h[1] = ((imm32 >> 16) & 0xffff); 1669 if (imm_h[0] == 0) { 1670 movzw(dst, imm_h[1], 16); 1671 } else if (imm_h[0] == 0xffff) { 1672 movnw(dst, imm_h[1] ^ 0xffff, 16); 1673 } else if (imm_h[1] == 0) { 1674 movzw(dst, imm_h[0], 0); 1675 } else if (imm_h[1] == 0xffff) { 1676 movnw(dst, imm_h[0] ^ 0xffff, 0); 1677 } else { 1678 // use a MOVZ and MOVK (makes it easier to debug) 1679 movzw(dst, imm_h[0], 0); 1680 movkw(dst, imm_h[1], 16); 1681 } 1682 } 1683 } 1684 1685 // Form an address from base + offset in Rd. Rd may or may 1686 // not actually be used: you must use the Address that is returned. 1687 // It is up to you to ensure that the shift provided matches the size 1688 // of your data. 
Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
  if (Address::offset_ok_for_immed(byte_offset, shift))
    // It fits; no need for any heroics
    return Address(base, byte_offset);

  // Don't do anything clever with negative or misaligned offsets
  unsigned mask = (1 << shift) - 1;
  if (byte_offset < 0 || byte_offset & mask) {
    mov(Rd, byte_offset);
    add(Rd, base, Rd);
    return Address(Rd);
  }

  // See if we can do this with two 12-bit offsets
  {
    unsigned long word_offset = byte_offset >> shift;
    // bits 12..23 of the scaled offset go into an add; the rest stays in
    // the returned Address
    unsigned long masked_offset = word_offset & 0xfff000;
    if (Address::offset_ok_for_immed(word_offset - masked_offset)
        && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
      add(Rd, base, masked_offset << shift);
      word_offset -= masked_offset;
      return Address(Rd, word_offset << shift);
    }
  }

  // Do it the hard way
  mov(Rd, byte_offset);
  add(Rd, base, Rd);
  return Address(Rd);
}

// Atomically increment the 32-bit word at [counter_addr].
//
// With UseLSE a single ldadd is emitted and only tmp is clobbered; otherwise
// a load-exclusive / store-exclusive retry loop is emitted that clobbers tmp
// and tmp2.
void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
  if (UseLSE) {
    mov(tmp, 1);
    ldadd(Assembler::word, tmp, zr, counter_addr);
    return;
  }
  Label retry_load;
  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
    prfm(Address(counter_addr), PSTL1STRM);
  bind(retry_load);
  // flush and load exclusive from the memory location
  ldxrw(tmp, counter_addr);
  addw(tmp, tmp, 1);
  // if we store+flush with no intervening write tmp2 will be zero
  stxrw(tmp2, tmp, counter_addr);
  cbnzw(tmp2, retry_load);
}


int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
                                    bool want_remainder, Register scratch)
{
  // Full implementation of Java idiv and irem.  The function
  // returns the (pc) offset of the div instruction - may be needed
  // for implicit exceptions.
  //
  // constraint : ra/rb =/= scratch
  //         normal case
  //
  // input : ra: dividend
  //         rb: divisor
  //
  // result: either
  //         quotient  (= ra idiv rb)
  //         remainder (= ra irem rb)

  assert(ra != scratch && rb != scratch, "reg cannot be scratch");

  int idivl_offset = offset();
  if (! want_remainder) {
    sdivw(result, ra, rb);
  } else {
    // remainder = ra - (ra / rb) * rb
    sdivw(scratch, ra, rb);
    Assembler::msubw(result, scratch, rb, ra);
  }

  return idivl_offset;
}

int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
                                    bool want_remainder, Register scratch)
{
  // Full implementation of Java ldiv and lrem.  The function
  // returns the (pc) offset of the div instruction - may be needed
  // for implicit exceptions.
  //
  // constraint : ra/rb =/= scratch
  //         normal case
  //
  // input : ra: dividend
  //         rb: divisor
  //
  // result: either
  //         quotient  (= ra idiv rb)
  //         remainder (= ra irem rb)

  assert(ra != scratch && rb != scratch, "reg cannot be scratch");

  int idivq_offset = offset();
  if (! want_remainder) {
    sdiv(result, ra, rb);
  } else {
    // remainder = ra - (ra / rb) * rb
    sdiv(scratch, ra, rb);
    Assembler::msub(result, scratch, rb, ra);
  }

  return idivq_offset;
}

// Emit a memory barrier for order_constraint. If the instruction emitted
// immediately before this point is itself a barrier recorded as the last
// instruction, its kind is widened in place instead of emitting another one.
void MacroAssembler::membar(Membar_mask_bits order_constraint) {
  address prev = pc() - NativeMembar::instruction_size;
  address last = code()->last_insn();
  if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
    NativeMembar *bar = NativeMembar_at(prev);
    // We are merging two memory barrier instructions.  On AArch64 we
    // can do this simply by ORing them together.
    bar->set_kind(bar->get_kind() | order_constraint);
    BLOCK_COMMENT("merged membar");
  } else {
    code()->set_last_insn(pc());
    dmb(Assembler::barrier(order_constraint));
  }
}

// Try to fuse the load/store about to be emitted with the previously emitted
// one.
//
// Returns true if ldst_can_merge() accepted the pair and merge_ldst() has
// emitted the merged form; the caller must then not emit its own
// instruction. Returns false otherwise; in that case the current pc is
// recorded as the last instruction when adr is a size-aligned
// base_plus_offset address, so that a following access can be considered for
// merging.
bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
  if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
    merge_ldst(rt, adr, size_in_bytes, is_store);
    code()->clear_last_insn();
    return true;
  } else {
    assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported.");
    const unsigned mask = size_in_bytes - 1;
    if (adr.getMode() == Address::base_plus_offset &&
        (adr.offset() & mask) == 0) { // only supports base_plus_offset.
      code()->set_last_insn(pc());
    }
    return false;
  }
}

// 64-bit load; emitted through Assembler::ldr unless it was merged.
void MacroAssembler::ldr(Register Rx, const Address &adr) {
  // We always try to merge two adjacent loads into one ldp.
  if (!try_merge_ldst(Rx, adr, 8, false)) {
    Assembler::ldr(Rx, adr);
  }
}

// 32-bit load; emitted through Assembler::ldrw unless it was merged.
void MacroAssembler::ldrw(Register Rw, const Address &adr) {
  // We always try to merge two adjacent loads into one ldp.
  if (!try_merge_ldst(Rw, adr, 4, false)) {
    Assembler::ldrw(Rw, adr);
  }
}

// 64-bit store; emitted through Assembler::str unless it was merged.
void MacroAssembler::str(Register Rx, const Address &adr) {
  // We always try to merge two adjacent stores into one stp.
  if (!try_merge_ldst(Rx, adr, 8, true)) {
    Assembler::str(Rx, adr);
  }
}

// 32-bit store; emitted through Assembler::strw unless it was merged.
void MacroAssembler::strw(Register Rw, const Address &adr) {
  // We always try to merge two adjacent stores into one stp.
  if (!try_merge_ldst(Rw, adr, 4, true)) {
    Assembler::strw(Rw, adr);
  }
}

// MacroAssembler routines found actually to be needed

// Push src on the stack addressed by esp (pre-decrement by one word).
void MacroAssembler::push(Register src)
{
  str(src, Address(pre(esp, -1 * wordSize)));
}

// Pop the top of the stack addressed by esp into dst (post-increment by one
// word).
void MacroAssembler::pop(Register dst)
{
  ldr(dst, Address(post(esp, 1 * wordSize)));
}

// The load_* helpers below each emit a single load of the named width and
// signedness and return the code offset at which that load was emitted.

// Note: load_unsigned_short used to be called load_unsigned_word.
int MacroAssembler::load_unsigned_short(Register dst, Address src) {
  int off = offset();
  ldrh(dst, src);
  return off;
}

int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
  int off = offset();
  ldrb(dst, src);
  return off;
}

int MacroAssembler::load_signed_short(Register dst, Address src) {
  int off = offset();
  ldrsh(dst, src);
  return off;
}

int MacroAssembler::load_signed_byte(Register dst, Address src) {
  int off = offset();
  ldrsb(dst, src);
  return off;
}

int MacroAssembler::load_signed_short32(Register dst, Address src) {
  int off = offset();
  ldrshw(dst, src);
  return off;
}

int MacroAssembler::load_signed_byte32(Register dst, Address src) {
  int off = offset();
  ldrsbw(dst, src);
  return off;
}

void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
  switch (size_in_bytes) {
  case  8:  ldr(dst, src); break;
  case  4:  ldrw(dst, src); break;
  case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
  case  1:  is_signed ?
load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break; 1913 default: ShouldNotReachHere(); 1914 } 1915 } 1916 1917 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) { 1918 switch (size_in_bytes) { 1919 case 8: str(src, dst); break; 1920 case 4: strw(src, dst); break; 1921 case 2: strh(src, dst); break; 1922 case 1: strb(src, dst); break; 1923 default: ShouldNotReachHere(); 1924 } 1925 } 1926 1927 void MacroAssembler::decrementw(Register reg, int value) 1928 { 1929 if (value < 0) { incrementw(reg, -value); return; } 1930 if (value == 0) { return; } 1931 if (value < (1 << 12)) { subw(reg, reg, value); return; } 1932 /* else */ { 1933 guarantee(reg != rscratch2, "invalid dst for register decrement"); 1934 movw(rscratch2, (unsigned)value); 1935 subw(reg, reg, rscratch2); 1936 } 1937 } 1938 1939 void MacroAssembler::decrement(Register reg, int value) 1940 { 1941 if (value < 0) { increment(reg, -value); return; } 1942 if (value == 0) { return; } 1943 if (value < (1 << 12)) { sub(reg, reg, value); return; } 1944 /* else */ { 1945 assert(reg != rscratch2, "invalid dst for register decrement"); 1946 mov(rscratch2, (unsigned long)value); 1947 sub(reg, reg, rscratch2); 1948 } 1949 } 1950 1951 void MacroAssembler::decrementw(Address dst, int value) 1952 { 1953 assert(!dst.uses(rscratch1), "invalid dst for address decrement"); 1954 if (dst.getMode() == Address::literal) { 1955 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 1956 lea(rscratch2, dst); 1957 dst = Address(rscratch2); 1958 } 1959 ldrw(rscratch1, dst); 1960 decrementw(rscratch1, value); 1961 strw(rscratch1, dst); 1962 } 1963 1964 void MacroAssembler::decrement(Address dst, int value) 1965 { 1966 assert(!dst.uses(rscratch1), "invalid address for decrement"); 1967 if (dst.getMode() == Address::literal) { 1968 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 1969 lea(rscratch2, dst); 1970 dst = 
Address(rscratch2); 1971 } 1972 ldr(rscratch1, dst); 1973 decrement(rscratch1, value); 1974 str(rscratch1, dst); 1975 } 1976 1977 void MacroAssembler::incrementw(Register reg, int value) 1978 { 1979 if (value < 0) { decrementw(reg, -value); return; } 1980 if (value == 0) { return; } 1981 if (value < (1 << 12)) { addw(reg, reg, value); return; } 1982 /* else */ { 1983 assert(reg != rscratch2, "invalid dst for register increment"); 1984 movw(rscratch2, (unsigned)value); 1985 addw(reg, reg, rscratch2); 1986 } 1987 } 1988 1989 void MacroAssembler::increment(Register reg, int value) 1990 { 1991 if (value < 0) { decrement(reg, -value); return; } 1992 if (value == 0) { return; } 1993 if (value < (1 << 12)) { add(reg, reg, value); return; } 1994 /* else */ { 1995 assert(reg != rscratch2, "invalid dst for register increment"); 1996 movw(rscratch2, (unsigned)value); 1997 add(reg, reg, rscratch2); 1998 } 1999 } 2000 2001 void MacroAssembler::incrementw(Address dst, int value) 2002 { 2003 assert(!dst.uses(rscratch1), "invalid dst for address increment"); 2004 if (dst.getMode() == Address::literal) { 2005 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 2006 lea(rscratch2, dst); 2007 dst = Address(rscratch2); 2008 } 2009 ldrw(rscratch1, dst); 2010 incrementw(rscratch1, value); 2011 strw(rscratch1, dst); 2012 } 2013 2014 void MacroAssembler::increment(Address dst, int value) 2015 { 2016 assert(!dst.uses(rscratch1), "invalid dst for address increment"); 2017 if (dst.getMode() == Address::literal) { 2018 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 2019 lea(rscratch2, dst); 2020 dst = Address(rscratch2); 2021 } 2022 ldr(rscratch1, dst); 2023 increment(rscratch1, value); 2024 str(rscratch1, dst); 2025 } 2026 2027 2028 void MacroAssembler::pusha() { 2029 push(0x7fffffff, sp); 2030 } 2031 2032 void MacroAssembler::popa() { 2033 pop(0x7fffffff, sp); 2034 } 2035 2036 // Push lots of registers in the bit set supplied. 
Don't push sp. 2037 // Return the number of words pushed 2038 int MacroAssembler::push(unsigned int bitset, Register stack) { 2039 int words_pushed = 0; 2040 2041 // Scan bitset to accumulate register pairs 2042 unsigned char regs[32]; 2043 int count = 0; 2044 for (int reg = 0; reg <= 30; reg++) { 2045 if (1 & bitset) 2046 regs[count++] = reg; 2047 bitset >>= 1; 2048 } 2049 regs[count++] = zr->encoding_nocheck(); 2050 count &= ~1; // Only push an even nuber of regs 2051 2052 if (count) { 2053 stp(as_Register(regs[0]), as_Register(regs[1]), 2054 Address(pre(stack, -count * wordSize))); 2055 words_pushed += 2; 2056 } 2057 for (int i = 2; i < count; i += 2) { 2058 stp(as_Register(regs[i]), as_Register(regs[i+1]), 2059 Address(stack, i * wordSize)); 2060 words_pushed += 2; 2061 } 2062 2063 assert(words_pushed == count, "oops, pushed != count"); 2064 2065 return count; 2066 } 2067 2068 int MacroAssembler::pop(unsigned int bitset, Register stack) { 2069 int words_pushed = 0; 2070 2071 // Scan bitset to accumulate register pairs 2072 unsigned char regs[32]; 2073 int count = 0; 2074 for (int reg = 0; reg <= 30; reg++) { 2075 if (1 & bitset) 2076 regs[count++] = reg; 2077 bitset >>= 1; 2078 } 2079 regs[count++] = zr->encoding_nocheck(); 2080 count &= ~1; 2081 2082 for (int i = 2; i < count; i += 2) { 2083 ldp(as_Register(regs[i]), as_Register(regs[i+1]), 2084 Address(stack, i * wordSize)); 2085 words_pushed += 2; 2086 } 2087 if (count) { 2088 ldp(as_Register(regs[0]), as_Register(regs[1]), 2089 Address(post(stack, count * wordSize))); 2090 words_pushed += 2; 2091 } 2092 2093 assert(words_pushed == count, "oops, pushed != count"); 2094 2095 return count; 2096 } 2097 #ifdef ASSERT 2098 void MacroAssembler::verify_heapbase(const char* msg) { 2099 #if 0 2100 assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed"); 2101 assert (Universe::heap() != NULL, "java heap should be initialized"); 2102 if (CheckCompressedOops) { 2103 Label ok; 2104 push(1 << 
rscratch1->encoding(), sp); // cmpptr trashes rscratch1 2105 cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr())); 2106 br(Assembler::EQ, ok); 2107 stop(msg); 2108 bind(ok); 2109 pop(1 << rscratch1->encoding(), sp); 2110 } 2111 #endif 2112 } 2113 #endif 2114 2115 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) { 2116 Label done, not_weak; 2117 cbz(value, done); // Use NULL as-is. 2118 2119 STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u); 2120 tbz(r0, 0, not_weak); // Test for jweak tag. 2121 2122 // Resolve jweak. 2123 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value, 2124 Address(value, -JNIHandles::weak_tag_value), tmp, thread); 2125 verify_oop(value); 2126 b(done); 2127 2128 bind(not_weak); 2129 // Resolve (untagged) jobject. 2130 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread); 2131 verify_oop(value); 2132 bind(done); 2133 } 2134 2135 void MacroAssembler::stop(const char* msg) { 2136 address ip = pc(); 2137 pusha(); 2138 mov(c_rarg0, (address)msg); 2139 mov(c_rarg1, (address)ip); 2140 mov(c_rarg2, sp); 2141 mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 2142 // call(c_rarg3); 2143 blrt(c_rarg3, 3, 0, 1); 2144 hlt(0); 2145 } 2146 2147 void MacroAssembler::unimplemented(const char* what) { 2148 const char* buf = NULL; 2149 { 2150 ResourceMark rm; 2151 stringStream ss; 2152 ss.print("unimplemented: %s", what); 2153 buf = code_string(ss.as_string()); 2154 } 2155 stop(buf); 2156 } 2157 2158 // If a constant does not fit in an immediate field, generate some 2159 // number of MOV instructions and then perform the operation. 
// Rd = Rn <op> imm, where insn1 is the immediate form of the operation
// and insn2 the shifted-register form.  Three cases:
//   - imm is directly encodable: one insn1;
//   - |imm| < 2^24: two insn1, high 12 bits then low 12 bits;
//   - otherwise: imm is moved into Rd and insn2 is used, which requires
//     Rd and Rn to be different registers.
void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
                                           add_sub_imm_insn insn1,
                                           add_sub_reg_insn insn2) {
  assert(Rd != zr, "Rd = zr and not setting flags?");
  if (operand_valid_for_add_sub_immediate((int)imm)) {
    (this->*insn1)(Rd, Rn, imm);
  } else {
    if (uabs(imm) < (1 << 24)) {
       (this->*insn1)(Rd, Rn, imm & -(1 << 12));
       (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
    } else {
       assert_different_registers(Rd, Rn);
       mov(Rd, (uint64_t)imm);
       (this->*insn2)(Rd, Rn, Rd, LSL, 0);
    }
  }
}

// Separate version which sets the flags. Optimisations are more restricted
// because we must set the flags correctly.
void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
                                             add_sub_imm_insn insn1,
                                             add_sub_reg_insn insn2) {
  if (operand_valid_for_add_sub_immediate((int)imm)) {
    (this->*insn1)(Rd, Rn, imm);
  } else {
    // The two-instruction split is not used here; go through a register.
    assert_different_registers(Rd, Rn);
    assert(Rd != zr, "overflow in immediate operand");
    mov(Rd, (uint64_t)imm);
    (this->*insn2)(Rd, Rn, Rd, LSL, 0);
  }
}


// The four RegisterOrConstant overloads below dispatch to the register
// or the constant form of the same operation.

void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
  if (increment.is_register()) {
    add(Rd, Rn, increment.as_register());
  } else {
    add(Rd, Rn, increment.as_constant());
  }
}

void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
  if (increment.is_register()) {
    addw(Rd, Rn, increment.as_register());
  } else {
    addw(Rd, Rn, increment.as_constant());
  }
}

void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
  if (decrement.is_register()) {
    sub(Rd, Rn, decrement.as_register());
  } else {
    sub(Rd, Rn, decrement.as_constant());
  }
}

void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
  if (decrement.is_register()) {
    subw(Rd, Rn, decrement.as_register());
  } else {
    subw(Rd, Rn, decrement.as_constant());
  }
}

// Reload rheapbase when compressed oops are in use: as an immediate once
// the Universe is fully initialized, otherwise by loading it from
// Universe::narrow_ptrs_base_addr() at run time.
void MacroAssembler::reinit_heapbase()
{
  if (UseCompressedOops) {
    if (Universe::is_fully_initialized()) {
      mov(rheapbase, Universe::narrow_ptrs_base());
    } else {
      lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
      ldr(rheapbase, Address(rheapbase));
    }
  }
}

// this simulates the behaviour of the x86 cmpxchg instruction using a
// load linked/store conditional pair. we use the acquire/release
// versions of these instructions so that we flush pending writes as
// per Java semantics.

// n.b the x86 version assumes the old value to be compared against is
// in rax and updates rax with the value located in memory if the
// cmpxchg fails. we supply a register for the old value explicitly

// the aarch64 load linked/store conditional instructions do not
// accept an offset. so, unlike x86, we must provide a plain register
// to identify the memory word to be compared/exchanged rather than a
// register+offset Address.

// 64-bit compare-and-exchange on the word at addr.  On success control
// branches to 'succeed'.  On failure oldv receives the value found in
// memory, a full barrier is emitted, and control branches to *fail if
// fail is non-NULL, otherwise falls through.  tmp is overwritten.
void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
                                Label &succeed, Label *fail) {
  // oldv holds comparison value
  // newv holds value to write in exchange
  // addr identifies memory word to compare against/update
  if (UseLSE) {
    // casal overwrites oldv with the value found in memory, so keep a
    // copy of the expected value in tmp to compare against afterwards.
    mov(tmp, oldv);
    casal(Assembler::xword, oldv, newv, addr);
    cmp(tmp, oldv);
    br(Assembler::EQ, succeed);
    membar(AnyAny);
  } else {
    Label retry_load, nope;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    // flush and load exclusive from the memory location
    // and fail if it is not what we expect
    ldaxr(tmp, addr);
    cmp(tmp, oldv);
    br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
    stlxr(tmp, newv, addr);
    cbzw(tmp, succeed);
    // retry so we only ever return after a load fails to compare
    // ensures we don't return a stale value after a failed write.
    b(retry_load);
    // if the memory word differs we return it in oldv and signal a fail
    bind(nope);
    membar(AnyAny);
    mov(oldv, tmp);
  }
  if (fail)
    b(*fail);
}

// cmpxchgptr on an object's mark word; relies on the mark word being at
// offset 0 so that obj can be used directly as the address.
void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
                                        Label &succeed, Label *fail) {
  assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
  cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
}

// 32-bit variant of cmpxchgptr, with the same success/failure protocol.
void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
                                Label &succeed, Label *fail) {
  // oldv holds comparison value
  // newv holds value to write in exchange
  // addr identifies memory word to compare against/update
  // tmp is scratch: it holds the copy of the expected value (LSE path),
  // or the loaded value and then the store-exclusive status (LL/SC path)
  if (UseLSE) {
    mov(tmp, oldv);
    casal(Assembler::word, oldv, newv, addr);
    cmp(tmp, oldv);
    br(Assembler::EQ, succeed);
    membar(AnyAny);
  } else {
    Label retry_load, nope;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    // flush and load exclusive from the memory location
    // and fail if it is not what we expect
    ldaxrw(tmp, addr);
    cmp(tmp, oldv);
    br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
    stlxrw(tmp, newv, addr);
    cbzw(tmp, succeed);
    // retry so we only ever return after a load fails to compare
    // ensures we don't return a stale value after a failed write.
    b(retry_load);
    // if the memory word differs we return it in oldv and signal a fail
    bind(nope);
    membar(AnyAny);
    mov(oldv, tmp);
  }
  if (fail)
    b(*fail);
}

// A generic CAS; success or failure is in the EQ flag.  A weak CAS
// doesn't retry and may fail spuriously.  If the oldval is wanted,
// Pass a register for the result, otherwise pass noreg.
2334 2335 // Clobbers rscratch1 2336 void MacroAssembler::cmpxchg(Register addr, Register expected, 2337 Register new_val, 2338 enum operand_size size, 2339 bool acquire, bool release, 2340 bool weak, 2341 Register result) { 2342 if (result == noreg) result = rscratch1; 2343 if (UseLSE) { 2344 mov(result, expected); 2345 lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true); 2346 cmp(result, expected); 2347 } else { 2348 BLOCK_COMMENT("cmpxchg {"); 2349 Label retry_load, done; 2350 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) 2351 prfm(Address(addr), PSTL1STRM); 2352 bind(retry_load); 2353 load_exclusive(result, addr, size, acquire); 2354 if (size == xword) 2355 cmp(result, expected); 2356 else 2357 cmpw(result, expected); 2358 br(Assembler::NE, done); 2359 store_exclusive(rscratch1, new_val, addr, size, release); 2360 if (weak) { 2361 cmpw(rscratch1, 0u); // If the store fails, return NE to our caller. 2362 } else { 2363 cbnzw(rscratch1, retry_load); 2364 } 2365 bind(done); 2366 BLOCK_COMMENT("} cmpxchg"); 2367 } 2368 } 2369 2370 static bool different(Register a, RegisterOrConstant b, Register c) { 2371 if (b.is_constant()) 2372 return a != c; 2373 else 2374 return a != b.as_register() && a != c && b.as_register() != c; 2375 } 2376 2377 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz) \ 2378 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \ 2379 if (UseLSE) { \ 2380 prev = prev->is_valid() ? prev : zr; \ 2381 if (incr.is_register()) { \ 2382 AOP(sz, incr.as_register(), prev, addr); \ 2383 } else { \ 2384 mov(rscratch2, incr.as_constant()); \ 2385 AOP(sz, rscratch2, prev, addr); \ 2386 } \ 2387 return; \ 2388 } \ 2389 Register result = rscratch2; \ 2390 if (prev->is_valid()) \ 2391 result = different(prev, incr, addr) ? 
prev : rscratch2; \ 2392 \ 2393 Label retry_load; \ 2394 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) \ 2395 prfm(Address(addr), PSTL1STRM); \ 2396 bind(retry_load); \ 2397 LDXR(result, addr); \ 2398 OP(rscratch1, result, incr); \ 2399 STXR(rscratch2, rscratch1, addr); \ 2400 cbnzw(rscratch2, retry_load); \ 2401 if (prev->is_valid() && prev != result) { \ 2402 IOP(prev, rscratch1, incr); \ 2403 } \ 2404 } 2405 2406 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword) 2407 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word) 2408 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword) 2409 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word) 2410 2411 #undef ATOMIC_OP 2412 2413 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz) \ 2414 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \ 2415 if (UseLSE) { \ 2416 prev = prev->is_valid() ? prev : zr; \ 2417 AOP(sz, newv, prev, addr); \ 2418 return; \ 2419 } \ 2420 Register result = rscratch2; \ 2421 if (prev->is_valid()) \ 2422 result = different(prev, newv, addr) ? 
prev : rscratch2; \ 2423 \ 2424 Label retry_load; \ 2425 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) \ 2426 prfm(Address(addr), PSTL1STRM); \ 2427 bind(retry_load); \ 2428 LDXR(result, addr); \ 2429 STXR(rscratch1, newv, addr); \ 2430 cbnzw(rscratch1, retry_load); \ 2431 if (prev->is_valid() && prev != result) \ 2432 mov(prev, result); \ 2433 } 2434 2435 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword) 2436 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word) 2437 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword) 2438 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word) 2439 2440 #undef ATOMIC_XCHG 2441 2442 #ifndef PRODUCT 2443 extern "C" void findpc(intptr_t x); 2444 #endif 2445 2446 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) 2447 { 2448 // In order to get locks to work, we need to fake a in_VM state 2449 if (ShowMessageBoxOnError ) { 2450 JavaThread* thread = JavaThread::current(); 2451 JavaThreadState saved_state = thread->thread_state(); 2452 thread->set_thread_state(_thread_in_vm); 2453 #ifndef PRODUCT 2454 if (CountBytecodes || TraceBytecodes || StopInterpreterAt) { 2455 ttyLocker ttyl; 2456 BytecodeCounter::print(); 2457 } 2458 #endif 2459 if (os::message_box(msg, "Execution stopped, print registers?")) { 2460 ttyLocker ttyl; 2461 tty->print_cr(" pc = 0x%016lx", pc); 2462 #ifndef PRODUCT 2463 tty->cr(); 2464 findpc(pc); 2465 tty->cr(); 2466 #endif 2467 tty->print_cr(" r0 = 0x%016lx", regs[0]); 2468 tty->print_cr(" r1 = 0x%016lx", regs[1]); 2469 tty->print_cr(" r2 = 0x%016lx", regs[2]); 2470 tty->print_cr(" r3 = 0x%016lx", regs[3]); 2471 tty->print_cr(" r4 = 0x%016lx", regs[4]); 2472 tty->print_cr(" r5 = 0x%016lx", regs[5]); 2473 tty->print_cr(" r6 = 0x%016lx", regs[6]); 2474 tty->print_cr(" r7 = 0x%016lx", regs[7]); 2475 tty->print_cr(" r8 = 0x%016lx", regs[8]); 2476 tty->print_cr(" r9 = 0x%016lx", regs[9]); 2477 tty->print_cr("r10 = 0x%016lx", regs[10]); 2478 tty->print_cr("r11 = 0x%016lx", 
regs[11]); 2479 tty->print_cr("r12 = 0x%016lx", regs[12]); 2480 tty->print_cr("r13 = 0x%016lx", regs[13]); 2481 tty->print_cr("r14 = 0x%016lx", regs[14]); 2482 tty->print_cr("r15 = 0x%016lx", regs[15]); 2483 tty->print_cr("r16 = 0x%016lx", regs[16]); 2484 tty->print_cr("r17 = 0x%016lx", regs[17]); 2485 tty->print_cr("r18 = 0x%016lx", regs[18]); 2486 tty->print_cr("r19 = 0x%016lx", regs[19]); 2487 tty->print_cr("r20 = 0x%016lx", regs[20]); 2488 tty->print_cr("r21 = 0x%016lx", regs[21]); 2489 tty->print_cr("r22 = 0x%016lx", regs[22]); 2490 tty->print_cr("r23 = 0x%016lx", regs[23]); 2491 tty->print_cr("r24 = 0x%016lx", regs[24]); 2492 tty->print_cr("r25 = 0x%016lx", regs[25]); 2493 tty->print_cr("r26 = 0x%016lx", regs[26]); 2494 tty->print_cr("r27 = 0x%016lx", regs[27]); 2495 tty->print_cr("r28 = 0x%016lx", regs[28]); 2496 tty->print_cr("r30 = 0x%016lx", regs[30]); 2497 tty->print_cr("r31 = 0x%016lx", regs[31]); 2498 BREAKPOINT; 2499 } 2500 ThreadStateTransition::transition(thread, _thread_in_vm, saved_state); 2501 } else { 2502 ttyLocker ttyl; 2503 ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", 2504 msg); 2505 assert(false, "DEBUG MESSAGE: %s", msg); 2506 } 2507 } 2508 2509 #ifdef BUILTIN_SIM 2510 // routine to generate an x86 prolog for a stub function which 2511 // bootstraps into the generated ARM code which directly follows the 2512 // stub 2513 // 2514 // the argument encodes the number of general and fp registers 2515 // passed by the caller and the callng convention (currently just 2516 // the number of general registers and assumes C argument passing) 2517 2518 extern "C" { 2519 int aarch64_stub_prolog_size(); 2520 void aarch64_stub_prolog(); 2521 void aarch64_prolog(); 2522 } 2523 2524 void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type, 2525 address *prolog_ptr) 2526 { 2527 int calltype = (((ret_type & 0x3) << 8) | 2528 ((fp_arg_count & 0xf) << 4) | 2529 (gp_arg_count & 0xf)); 2530 2531 // the 
addresses for the x86 to ARM entry code we need to use 2532 address start = pc(); 2533 // printf("start = %lx\n", start); 2534 int byteCount = aarch64_stub_prolog_size(); 2535 // printf("byteCount = %x\n", byteCount); 2536 int instructionCount = (byteCount + 3)/ 4; 2537 // printf("instructionCount = %x\n", instructionCount); 2538 for (int i = 0; i < instructionCount; i++) { 2539 nop(); 2540 } 2541 2542 memcpy(start, (void*)aarch64_stub_prolog, byteCount); 2543 2544 // write the address of the setup routine and the call format at the 2545 // end of into the copied code 2546 u_int64_t *patch_end = (u_int64_t *)(start + byteCount); 2547 if (prolog_ptr) 2548 patch_end[-2] = (u_int64_t)prolog_ptr; 2549 patch_end[-1] = calltype; 2550 } 2551 #endif 2552 2553 void MacroAssembler::push_call_clobbered_registers() { 2554 int step = 4 * wordSize; 2555 push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp); 2556 sub(sp, sp, step); 2557 mov(rscratch1, -step); 2558 // Push v0-v7, v16-v31. 2559 for (int i = 31; i>= 4; i -= 4) { 2560 if (i <= v7->encoding() || i >= v16->encoding()) 2561 st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1), 2562 as_FloatRegister(i), T1D, Address(post(sp, rscratch1))); 2563 } 2564 st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2), 2565 as_FloatRegister(3), T1D, Address(sp)); 2566 } 2567 2568 void MacroAssembler::pop_call_clobbered_registers() { 2569 for (int i = 0; i < 32; i += 4) { 2570 if (i <= v7->encoding() || i >= v16->encoding()) 2571 ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), 2572 as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize))); 2573 } 2574 2575 pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp); 2576 } 2577 2578 void MacroAssembler::push_CPU_state(bool save_vectors) { 2579 int step = (save_vectors ? 
8 : 4) * wordSize; 2580 push(0x3fffffff, sp); // integer registers except lr & sp 2581 mov(rscratch1, -step); 2582 sub(sp, sp, step); 2583 for (int i = 28; i >= 4; i -= 4) { 2584 st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), 2585 as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1))); 2586 } 2587 st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp); 2588 } 2589 2590 void MacroAssembler::pop_CPU_state(bool restore_vectors) { 2591 int step = (restore_vectors ? 8 : 4) * wordSize; 2592 for (int i = 0; i <= 28; i += 4) 2593 ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), 2594 as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step))); 2595 pop(0x3fffffff, sp); // integer registers except lr & sp 2596 } 2597 2598 /** 2599 * Helpers for multiply_to_len(). 2600 */ 2601 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo, 2602 Register src1, Register src2) { 2603 adds(dest_lo, dest_lo, src1); 2604 adc(dest_hi, dest_hi, zr); 2605 adds(dest_lo, dest_lo, src2); 2606 adc(final_dest_hi, dest_hi, zr); 2607 } 2608 2609 // Generate an address from (r + r1 extend offset). "size" is the 2610 // size of the operand. The result may be in rscratch2. 2611 Address MacroAssembler::offsetted_address(Register r, Register r1, 2612 Address::extend ext, int offset, int size) { 2613 if (offset || (ext.shift() % size != 0)) { 2614 lea(rscratch2, Address(r, r1, ext)); 2615 return Address(rscratch2, offset); 2616 } else { 2617 return Address(r, r1, ext); 2618 } 2619 } 2620 2621 Address MacroAssembler::spill_address(int size, int offset, Register tmp) 2622 { 2623 assert(offset >= 0, "spill to negative address?"); 2624 // Offset reachable ? 
2625 // Not aligned - 9 bits signed offset 2626 // Aligned - 12 bits unsigned offset shifted 2627 Register base = sp; 2628 if ((offset & (size-1)) && offset >= (1<<8)) { 2629 add(tmp, base, offset & ((1<<12)-1)); 2630 base = tmp; 2631 offset &= -1<<12; 2632 } 2633 2634 if (offset >= (1<<12) * size) { 2635 add(tmp, base, offset & (((1<<12)-1)<<12)); 2636 base = tmp; 2637 offset &= ~(((1<<12)-1)<<12); 2638 } 2639 2640 return Address(base, offset); 2641 } 2642 2643 // Checks whether offset is aligned. 2644 // Returns true if it is, else false. 2645 bool MacroAssembler::merge_alignment_check(Register base, 2646 size_t size, 2647 long cur_offset, 2648 long prev_offset) const { 2649 if (AvoidUnalignedAccesses) { 2650 if (base == sp) { 2651 // Checks whether low offset if aligned to pair of registers. 2652 long pair_mask = size * 2 - 1; 2653 long offset = prev_offset > cur_offset ? cur_offset : prev_offset; 2654 return (offset & pair_mask) == 0; 2655 } else { // If base is not sp, we can't guarantee the access is aligned. 2656 return false; 2657 } 2658 } else { 2659 long mask = size - 1; 2660 // Load/store pair instruction only supports element size aligned offset. 2661 return (cur_offset & mask) == 0 && (prev_offset & mask) == 0; 2662 } 2663 } 2664 2665 // Checks whether current and previous loads/stores can be merged. 2666 // Returns true if it can be merged, else false. 
// rt / adr / cur_size_in_bytes / is_store describe the access that is
// about to be emitted; the previous access is decoded from the
// instruction immediately before the current pc.
bool MacroAssembler::ldst_can_merge(Register rt,
                                    const Address &adr,
                                    size_t cur_size_in_bytes,
                                    bool is_store) const {
  address prev = pc() - NativeInstruction::instruction_size;
  address last = code()->last_insn();

  // The recorded last instruction must be an immediate-offset load/store...
  if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
    return false;
  }

  // ...it must be the instruction directly before pc, and the current
  // access must use base_plus_offset addressing.
  if (adr.getMode() != Address::base_plus_offset || prev != last) {
    return false;
  }

  NativeLdSt* prev_ldst = NativeLdSt_at(prev);
  size_t prev_size_in_bytes = prev_ldst->size_in_bytes();

  assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
  assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");

  // Same size and same direction (both loads or both stores).
  if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
    return false;
  }

  long max_offset = 63 * prev_size_in_bytes;
  long min_offset = -64 * prev_size_in_bytes;

  assert(prev_ldst->is_not_pre_post_index(), "pre-index or post-index is not supported to be merged.");

  // Only same base can be merged.
  if (adr.base() != prev_ldst->base()) {
    return false;
  }

  // The two accesses must be exactly one element apart.
  long cur_offset = adr.offset();
  long prev_offset = prev_ldst->offset();
  size_t diff = abs(cur_offset - prev_offset);
  if (diff != prev_size_in_bytes) {
    return false;
  }

  // Following cases can not be merged:
  // ldr x2, [x2, #8]
  // ldr x3, [x2, #16]
  // or:
  // ldr x2, [x3, #8]
  // ldr x2, [x3, #16]
  // If t1 and t2 is the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
  if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
    return false;
  }

  long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
  // Offset range must be in ldp/stp instruction's range.
  if (low_offset > max_offset || low_offset < min_offset) {
    return false;
  }

  if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
    return true;
  }

  return false;
}

// Merge current load/store with previous load/store into ldp/stp.
//
// The previous instruction is discarded by moving the end of the code
// section back to it, and a single pair instruction is emitted in its
// place.  The register whose access has the lower offset becomes the
// first register of the pair.
void MacroAssembler::merge_ldst(Register rt,
                                const Address &adr,
                                size_t cur_size_in_bytes,
                                bool is_store) {

  assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store) == true, "cur and prev must be able to be merged.");

  Register rt_low, rt_high;
  address prev = pc() - NativeInstruction::instruction_size;
  NativeLdSt* prev_ldst = NativeLdSt_at(prev);

  long offset;

  if (adr.offset() < prev_ldst->offset()) {
    offset = adr.offset();
    rt_low = rt;
    rt_high = prev_ldst->target();
  } else {
    offset = prev_ldst->offset();
    rt_low = prev_ldst->target();
    rt_high = rt;
  }

  Address adr_p = Address(prev_ldst->base(), offset);
  // Overwrite previous generated binary.
  code_section()->set_end(prev);

  const int sz = prev_ldst->size_in_bytes();
  assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
  if (!is_store) {
    BLOCK_COMMENT("merged ldr pair");
    if (sz == 8) {
      ldp(rt_low, rt_high, adr_p);
    } else {
      ldpw(rt_low, rt_high, adr_p);
    }
  } else {
    BLOCK_COMMENT("merged str pair");
    if (sz == 8) {
      stp(rt_low, rt_high, adr_p);
    } else {
      stpw(rt_low, rt_high, adr_p);
    }
  }
}

/**
 * Multiply 64 bit by 64 bit first loop.
*/
// Emits the "first loop" used by multiply_to_len (see the pseudo code below).
//
// Register roles, as used by the emitted code:
//   x, y, z   - base addresses of the int arrays
//   xstart    - decremented by one on entry; if the result is negative only
//               the single 32-bit word x[0] is loaded (L_one_x)
//   x_xstart  - receives the x operand (64-bit load rotated by 32, or x[0])
//   y_idx     - receives the current y operand (same scheme, L_one_y)
//   carry     - running carry, in and out
//   product   - scratch for the low 64 bits of each product
//   idx, kdx  - loop indices; both are decremented by 2 per 64-bit step
// rscratch1 is clobbered.
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
                                           Register y, Register y_idx, Register z,
                                           Register carry, Register product,
                                           Register idx, Register kdx) {
  //
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;
  //

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  // Step back one int; a negative result means only x[0] is left.
  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_one_x);

  lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
  ldr(x_xstart, Address(rscratch1));
  ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian

  bind(L_first_loop);
  // idx is decremented twice: the first test ends the loop, the second
  // detects that only a single 32-bit word of y remains.
  subsw(idx, idx, 1);
  br(Assembler::MI, L_first_loop_exit);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_one_y);
  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(y_idx, Address(rscratch1));
  ror(y_idx, y_idx, 32); // convert big-endian to little-endian
  bind(L_multiply);

  // AArch64 has a multiply-accumulate instruction that we can't use
  // here because it has no way to process carries, so we have to use
  // separate add and adc instructions.  Bah.
  umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
  mul(product, x_xstart, y_idx);
  adds(product, product, carry);
  adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product

  subw(kdx, kdx, 2);
  ror(product, product, 32); // back to big-endian
  str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));

  b(L_first_loop);

  // Only one 32-bit word of y is left: use y[0] zero-extended.
  bind(L_one_y);
  ldrw(y_idx, Address(y,  0));
  b(L_multiply);

  // Only one 32-bit word of x is left: use x[0] zero-extended.
  bind(L_one_x);
  ldrw(x_xstart, Address(x,  0));
  b(L_first_loop);

  bind(L_first_loop_exit);
}

/**
 * Multiply 128 bit by 128. Unrolled inner loop.
 *
 * Emits the "third loop" used by multiply_to_len (see the pseudo code in
 * the body). Each iteration of the unrolled loop handles four ints
 * (two 64-bit words) of y and z; the code after the loop handles the
 * remaining 0..3 ints. rscratch1 and rscratch2 are clobbered.
 */
void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
                                             Register carry, Register carry2,
                                             Register idx, Register jdx,
                                             Register yz_idx1, Register yz_idx2,
                                             Register tmp, Register tmp3, Register tmp4,
                                             Register tmp6, Register product_hi) {

  //   jlong carry, x[], y[], z[];
  //   int kdx = ystart+1;
  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
  //     jlong carry2  = (jlong)(tmp3 >>> 64);
  //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
  //     carry  = (jlong)(tmp4 >>> 64);
  //     z[kdx+idx+1] = (jlong)tmp3;
  //     z[kdx+idx] = (jlong)tmp4;
  //   }
  //   idx += 2;
  //   if (idx > 0) {
  //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
  //     z[kdx+idx] = (jlong)yz_idx1;
  //     carry  = (jlong)(yz_idx1 >>> 64);
  //   }
  //

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;

  // jdx = idx / 4: number of unrolled iterations (four ints each).
  lsrw(jdx, idx, 2);

  bind(L_third_loop);

  subsw(jdx, jdx, 1);
  br(Assembler::MI, L_third_loop_exit);
  subw(idx, idx, 4);

  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));

  ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));

// multiply_128_x_128_loop, continued: body of the unrolled loop, then the tail cases.
  // tmp6 = address of the two 64-bit z words updated by this iteration.
  lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
  ror(yz_idx2, yz_idx2, 32);

  ldp(rscratch2, rscratch1, Address(tmp6, 0));

  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
  umulh(tmp4, product_hi, yz_idx1);

  ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
  ror(rscratch2, rscratch2, 32);

  mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
  umulh(carry2, product_hi, yz_idx2);

  // propagate sum of both multiplications into carry:tmp4:tmp3
  adds(tmp3, tmp3, carry);
  adc(tmp4, tmp4, zr);
  adds(tmp3, tmp3, rscratch1);
  adcs(tmp4, tmp4, tmp);
  adc(carry, carry2, zr);
  adds(tmp4, tmp4, rscratch2);
  adc(carry, carry, zr);

  ror(tmp3, tmp3, 32); // convert little-endian to big-endian
  ror(tmp4, tmp4, 32);
  stp(tmp4, tmp3, Address(tmp6, 0));

  b(L_third_loop);
  bind (L_third_loop_exit);

  // 0..3 ints are left over after the unrolled loop.
  andw (idx, idx, 0x3);
  cbz(idx, L_post_third_loop_done);

  // At least two ints left? Then process one more 64-bit word.
  Label L_check_1;
  subsw(idx, idx, 2);
  br(Assembler::MI, L_check_1);

  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(yz_idx1, Address(rscratch1, 0));
  ror(yz_idx1, yz_idx1, 32);
  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
  umulh(tmp4, product_hi, yz_idx1);
  lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
  ldr(yz_idx2, Address(rscratch1, 0));
  ror(yz_idx2, yz_idx2, 32);

  add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);

  ror(tmp3, tmp3, 32);
  str(tmp3, Address(rscratch1, 0));

  bind (L_check_1);

  // A single 32-bit int may still be left.
  andw (idx, idx, 0x1);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_post_third_loop_done);
  ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
  umulh(carry2, tmp4, product_hi);
  ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  add2_with_carry(carry2, tmp3, tmp4, carry);

  strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
  // carry = bits 32..95 of carry2:tmp3
  extr(carry, carry2, tmp3, 32);

  bind(L_post_third_loop_done);
}

/**
 * Code for BigInteger::multiplyToLen() intrinsic.
 *
 * r0: x
 * r1: xlen
 * r2: y
 * r3: ylen
 * r4: z
 * r5: zlen
 * r10: tmp1
 * r11: tmp2
 * r12: tmp3
 * r13: tmp4
 * r14: tmp5
 * r15: tmp6
 * r16: tmp7
 *
 * NOTE(review): the last parameter is named product_hi in the signature;
 * the list above calls it tmp7.
 */
void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6, Register product_hi) {

  // NOTE(review): product_hi is not part of this check.
  assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  // xlen and zlen are copied into xstart/kdx below before being reused.
  const Register product  = xlen;
  const Register x_xstart = zlen;  // reuse register

  // First Loop.
//
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;
  //

  movw(idx, ylen);      // idx = ylen;
  movw(kdx, zlen);      // kdx = xlen+ylen;
  mov(carry, zr);       // carry = 0;

  Label L_done;

  // Nothing to do when xlen == 0.
  movw(xstart, xlen);
  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_done);

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);

  // Store the carry left by the first loop: one or two ints, depending on
  // how many z slots (kdx) remain.
  Label L_second_loop;
  cbzw(kdx, L_second_loop);

  Label L_carry;
  subw(kdx, kdx, 1);
  cbzw(kdx, L_carry);

  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
  lsr(carry, carry, 32);
  subw(kdx, kdx, 1);

  bind(L_carry);
  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));

  // Second and third (nested) loops.
  //
  // for (int i = xstart-1; i >= 0; i--) { // Second loop
  //   carry = 0;
  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                    (z[k] & LONG_MASK) + carry;
  //     z[k] = (int)product;
  //     carry = product >>> 32;
  //   }
  //   z[i] = (int)carry;
  // }
  //
  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi

  const Register jdx = tmp1;

  bind(L_second_loop);
  mov(carry, zr);                // carry = 0;
  movw(jdx, ylen);               // j = ystart+1

  subsw(xstart, xstart, 1);      // i = xstart-1;
  br(Assembler::MI, L_done);

  // Open a 4-word stack frame:
  //   [sp + 0 * wordSize] = z
  //   [sp + 1 * wordSize] = ylen
  //   [sp + 2 * wordSize] = x
  //   [sp + 3 * wordSize] = xstart
  // The registers are reused as temporaries by the third loop and restored
  // from this frame afterwards.
  str(z, Address(pre(sp, -4 * wordSize)));

  Label L_last_x;
  lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
  subsw(xstart, xstart, 1);       // i = xstart-1;
  br(Assembler::MI, L_last_x);

  lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
  ldr(product_hi, Address(rscratch1));
  ror(product_hi, product_hi, 32);  // convert big-endian to little-endian

  Label L_third_loop_prologue;
  bind(L_third_loop_prologue);

  str(ylen, Address(sp, wordSize));
  stp(x, xstart, Address(sp, 2 * wordSize));
  // x, ylen, product (= xlen), tmp2, x_xstart (= zlen), tmp3, tmp4 and tmp6
  // serve as scratch registers of the third loop.
  multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
                          tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
  // Pop the frame in two steps, restoring z/ylen and x/xstart.
  ldp(z, ylen, Address(post(sp, 2 * wordSize)));
  ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen

  // Store the carry of this row: low int at z[xstart+1], high int at
  // z[xstart] (skipped when xstart is already negative).
  addw(tmp3, xlen, 1);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  subsw(tmp3, tmp3, 1);
  br(Assembler::MI, L_done);

  lsr(carry, carry, 32);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  b(L_second_loop);

  // Next infrequent code is moved outside loops.
// multiply_to_len, continued: only x[0] is left, use it zero-extended.
  bind(L_last_x);
  ldrw(product_hi, Address(x,  0));
  b(L_third_loop_prologue);

  bind(L_done);
}

// Code for BigInteger::mulAdd intrinsic
// out     = r0
// in      = r1
// offset  = r2  (already out.length-offset)
// len     = r3
// k       = r4
//
// pseudo code from java implementation:
// carry = 0;
// offset = out.length-offset - 1;
// for (int j=len-1; j >= 0; j--) {
//     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
//     out[offset--] = (int)product;
//     carry = product >>> 32;
// }
// return (int)carry;
//
// On exit `out` holds the carry (0 when len == 0). The registers in, offset
// and len are modified, and rscratch1/rscratch2 are clobbered.
void MacroAssembler::mul_add(Register out, Register in, Register offset,
      Register len, Register k) {
    Label LOOP, END;
    // pre-loop
    cmp(len, zr); // cmp, not cbz/cbnz: to use condition twice => less branches
    csel(out, zr, out, Assembler::EQ);
    br(Assembler::EQ, END);
    add(in, in, len, LSL, 2); // in[j+1] address
    add(offset, out, offset, LSL, 2); // out[offset + 1] address
    mov(out, zr); // used to keep carry now
    BIND(LOOP);
    // Both pointers are pre-decremented by one int per iteration.
    ldrw(rscratch1, Address(pre(in, -4)));
    madd(rscratch1, rscratch1, k, out);
    ldrw(rscratch2, Address(pre(offset, -4)));
    add(rscratch1, rscratch1, rscratch2);
    strw(rscratch1, Address(offset));
    lsr(out, rscratch1, 32);
    subs(len, len, 1);
    br(Assembler::NE, LOOP);
    BIND(END);
}

/**
 * Emits code to update CRC-32 with a byte value according to constants in table
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]val       Register containing the byte to fold into the CRC.
 * @param [in]table     Register containing the table of crc constants.
*
 * uint32_t crc;
 * val = crc_table[(val ^ crc) & 0xFF];
 * crc = val ^ (crc >> 8);
 *
 * Note: val is overwritten with the table entry.
 */
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  eor(val, val, crc);
  andr(val, val, 0xff);
  ldrw(val, Address(table, val, Address::lsl(2)));
  eor(crc, val, crc, Assembler::LSR, 8);
}

/**
 * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]v         Register containing the 32-bit to fold into the CRC.
 * @param tmp           Scratch register, clobbered.
 * @param [in]table0    Register containing table 0 of crc constants.
 * @param [in]table1    Register containing table 1 of crc constants.
 * @param [in]table2    Register containing table 2 of crc constants.
 * @param [in]table3    Register containing table 3 of crc constants.
 * @param upper         If true, the word that is folded is v >> 32,
 *                      otherwise v itself (see the first eor below).
 *
 * uint32_t crc;
 * v = crc ^ v
 * crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
 *
 * Note: v is overwritten.
 */
void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
        Register table0, Register table1, Register table2, Register table3,
        bool upper) {
  eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
  uxtb(tmp, v);
  ldrw(crc, Address(table3, tmp, Address::lsl(2)));
  ubfx(tmp, v, 8, 8);
  ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
  ubfx(tmp, v, 16, 8);
  ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
  ubfx(tmp, v, 24, 8);
  ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
}

// CRC-32 kernel built on the crc32x/crc32w/crc32b instructions.
//
// crc is bit-inverted on entry and again on exit. buf and len are consumed
// (modified); tmp0..tmp3 are scratch. The code is organised as a software
// pipelined loop handling 64 bytes per iteration (entered when len >= 128),
// followed by a 32-byte loop, a 4-byte loop and a byte loop for the rest.
void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
        Register len, Register tmp0, Register tmp1, Register tmp2,
        Register tmp3) {
    Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
    assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);

    mvnw(crc, crc);

    // len is kept biased by the size of the next chunk so that the flags of
    // each subs/adds can be used directly for the dispatch branches.
    subs(len, len, 128);
    br(Assembler::GE, CRC_by64_pre);
  BIND(CRC_less64);
    adds(len, len, 128-32);
    br(Assembler::GE, CRC_by32_loop);
  BIND(CRC_less32);
    adds(len, len, 32-4);
    br(Assembler::GE, CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, CRC_by1_loop);
    b(L_exit);

  BIND(CRC_by32_loop);
    ldp(tmp0, tmp1, Address(post(buf, 16)));
    subs(len, len, 32);
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(post(buf, 8)));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(post(buf, 8)));
    crc32x(crc, crc, tmp2);
    crc32x(crc, crc, tmp3);
    br(Assembler::GE, CRC_by32_loop);
    // len == -32 here means the input was consumed exactly.
    cmn(len, 32);
    br(Assembler::NE, CRC_less32);
    b(L_exit);

  BIND(CRC_by4_loop);
    ldrw(tmp0, Address(post(buf, 4)));
    subs(len, len, 4);
    crc32w(crc, crc, tmp0);
    br(Assembler::GE, CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::LE, L_exit);
  BIND(CRC_by1_loop);
    ldrb(tmp0, Address(post(buf, 1)));
    subs(len, len, 1);
    crc32b(crc, crc, tmp0);
    br(Assembler::GT, CRC_by1_loop);
    b(L_exit);

  BIND(CRC_by64_pre);
    // Prologue of the pipelined loop: buf is moved back by 8 so that the
    // last load of each group can use pre-indexed addressing; the final two
    // loaded words (tmp2, tmp3) are folded by the next loop iteration.
    sub(buf, buf, 8);
    ldp(tmp0, tmp1, Address(buf, 8));
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 24));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(buf, 32));
    crc32x(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 40));
    crc32x(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 48));
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 56));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(pre(buf, 64)));

    b(CRC_by64_loop);

    align(CodeEntryAlignment);
  BIND(CRC_by64_loop);
    subs(len, len, 64);
    crc32x(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 8));
    crc32x(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 16));
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 24));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(buf, 32));
    crc32x(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 40));
    crc32x(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 48));
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 56));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(pre(buf, 64)));
    br(Assembler::GE, CRC_by64_loop);

    // post-loop
    crc32x(crc, crc, tmp2);
    crc32x(crc, crc, tmp3);

    sub(len, len, 64);
    add(buf, buf, 8);
    // len == -128 here means the input was consumed exactly.
    cmn(len, 128);
    br(Assembler::NE, CRC_less64);
  BIND(L_exit);
    mvnw(crc, crc);
}

/**
 * Emits the CRC-32 kernel.
 *
 * If UseCRC32 is set this delegates to kernel_crc32_using_crc32(), passing
 * table0..table3 as scratch registers. Otherwise a table driven
 * implementation is emitted, with an additional Neon (pmull based) folding
 * loop in front of it when UseNeon is set and len >= 64.
 *
 * @param crc     register containing existing CRC (32-bit); updated in place
 * @param buf     register pointing to input byte buffer (byte*); advanced
 * @param len     register containing number of bytes; modified
 * @param table0  set to the address of the CRC table
 *                (StubRoutines::crc_table_addr())
 * @param table1  set to table0 + 1*256*sizeof(juint)
 * @param table2  set to table0 + 2*256*sizeof(juint)
 * @param table3  set to table0 + 3*256*sizeof(juint)
 * @param tmp     scratch register
 * @param tmp2    scratch register
 * @param tmp3    scratch register
 */
void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
        Register table0, Register table1, Register table2, Register table3,
        Register tmp, Register tmp2, Register tmp3) {
  Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
  unsigned long offset;

  if (UseCRC32) {
      kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
      return;
  }

    mvnw(crc, crc);

    adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
    if (offset) add(table0, table0, offset);
    add(table1, table0, 1*256*sizeof(juint));
    add(table2, table0, 2*256*sizeof(juint));
    add(table3, table0, 3*256*sizeof(juint));

  if (UseNeon) {
      cmp(len, 64);
      br(Assembler::LT, L_by16);
      eor(v16, T16B, v16, v16);

    Label L_fold;

      add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants

      // v0/v1 hold 32 bytes of input, v4..v7 the four replicated constants.
      ld1(v0, v1, T2D, post(buf, 32));
      ld1r(v4, T2D, post(tmp, 8));
      ld1r(v5, T2D, post(tmp, 8));
      ld1r(v6, T2D, post(tmp, 8));
      ld1r(v7, T2D, post(tmp, 8));
      mov(v16, T4S, 0, crc);

      // Fold the incoming crc into the first word of the data.
      eor(v0, T16B, v0, v16);
      sub(len, len, 64);

    BIND(L_fold);
      // First half: process v0, result accumulated in v28.
      pmull(v22, T8H, v0, v5, T8B);
      pmull(v20, T8H, v0, v7, T8B);
      pmull(v23, T8H, v0, v4, T8B);
      pmull(v21, T8H, v0, v6, T8B);

      pmull2(v18, T8H, v0, v5, T16B);
      pmull2(v16, T8H, v0, v7, T16B);
      pmull2(v19, T8H, v0, v4, T16B);
      pmull2(v17, T8H, v0, v6, T16B);

      uzp1(v24, T8H, v20, v22);
      uzp2(v25, T8H, v20, v22);
      eor(v20, T16B, v24, v25);

      uzp1(v26, T8H, v16, v18);
      uzp2(v27, T8H, v16, v18);
      eor(v16, T16B, v26, v27);

      ushll2(v22, T4S, v20, T8H, 8);
      ushll(v20, T4S, v20, T4H, 8);

      ushll2(v18, T4S, v16, T8H, 8);
      ushll(v16, T4S, v16, T4H, 8);

      eor(v22, T16B, v23, v22);
      eor(v18, T16B, v19, v18);
      eor(v20, T16B, v21, v20);
      eor(v16, T16B, v17, v16);

      uzp1(v17, T2D, v16, v20);
      uzp2(v21, T2D, v16, v20);
      eor(v17, T16B, v17, v21);

      ushll2(v20, T2D, v17, T4S, 16);
      ushll(v16, T2D, v17, T2S, 16);

      eor(v20, T16B, v20, v22);
      eor(v16, T16B, v16, v18);

      uzp1(v17, T2D, v20, v16);
      uzp2(v21, T2D, v20, v16);
      eor(v28, T16B, v17, v21);

      // Second half: process v1, result accumulated in v20.
      pmull(v22, T8H, v1, v5, T8B);
      pmull(v20, T8H, v1, v7, T8B);
      pmull(v23, T8H, v1, v4, T8B);
      pmull(v21, T8H, v1, v6, T8B);

      pmull2(v18, T8H, v1, v5, T16B);
      pmull2(v16, T8H, v1, v7, T16B);
      pmull2(v19, T8H, v1, v4, T16B);
      pmull2(v17, T8H, v1, v6, T16B);

      // Both halves have been consumed: load the next 32 bytes.
      ld1(v0, v1, T2D, post(buf, 32));

      uzp1(v24, T8H, v20, v22);
      uzp2(v25, T8H, v20, v22);
      eor(v20, T16B, v24, v25);

      uzp1(v26, T8H, v16, v18);
      uzp2(v27, T8H, v16, v18);
      eor(v16, T16B, v26, v27);

      ushll2(v22, T4S, v20, T8H, 8);
      ushll(v20, T4S, v20, T4H, 8);

      ushll2(v18, T4S, v16, T8H, 8);
      ushll(v16, T4S, v16, T4H, 8);

      eor(v22, T16B, v23, v22);
      eor(v18, T16B, v19, v18);
      eor(v20, T16B, v21, v20);
      eor(v16, T16B, v17, v16);

      uzp1(v17, T2D, v16, v20);
      uzp2(v21, T2D, v16, v20);
      eor(v16, T16B, v17, v21);

      ushll2(v20, T2D, v16, T4S, 16);
      ushll(v16, T2D, v16, T2S, 16);

      eor(v20, T16B, v22, v20);
      eor(v16, T16B, v16, v18);

      uzp1(v17, T2D, v20, v16);
      uzp2(v21, T2D, v20, v16);
      eor(v20, T16B, v17, v21);

      shl(v16, T2D, v28, 1);
      shl(v17, T2D, v20, 1);

      // Fold the results of both halves into the freshly loaded data.
      eor(v0, T16B, v0, v16);
      eor(v1, T16B, v1, v17);

      subs(len, len, 32);
      br(Assembler::GE, L_fold);

      // Reduce the 32 bytes left in v0/v1 with the table driven code, one
      // 64-bit lane at a time (lower word first, then upper word).
      mov(crc, 0);
      mov(tmp, v0, T1D, 0);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
      mov(tmp, v0, T1D, 1);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
      mov(tmp, v1, T1D, 0);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
      mov(tmp, v1, T1D, 1);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);

      add(len, len, 32);
  }

  BIND(L_by16);
    subs(len, len, 16);
    br(Assembler::GE, L_by16_loop);
    adds(len, len, 16-4);
    br(Assembler::GE, L_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, L_by1_loop);
    b(L_exit);

  BIND(L_by4_loop);
    ldrw(tmp, Address(post(buf, 4)));
    // NOTE(review): relies on the default value of `upper` declared in the
    // header, which is not visible here.
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
    subs(len, len, 4);
    br(Assembler::GE, L_by4_loop);
    adds(len, len, 4);
    br(Assembler::LE, L_exit);
  BIND(L_by1_loop);
    subs(len, len, 1);
    ldrb(tmp, Address(post(buf, 1)));
    update_byte_crc32(crc, tmp, table0);
    br(Assembler::GT, L_by1_loop);
    b(L_exit);

    align(CodeEntryAlignment);
  BIND(L_by16_loop);
    subs(len, len, 16);
    ldp(tmp, tmp3, Address(post(buf, 16)));
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
    update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
    br(Assembler::GE, L_by16_loop);
    adds(len, len, 16-4);
    br(Assembler::GE, L_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, L_by1_loop);
  BIND(L_exit);
    mvnw(crc, crc);
}

// CRC-32C kernel built on the crc32cx/crc32cw/crc32cb instructions.
//
// Same structure as kernel_crc32_using_crc32(), except that crc is not
// bit-inverted on entry or exit. buf and len are consumed (modified);
// tmp0..tmp3 are scratch.
void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
        Register len, Register tmp0, Register tmp1, Register tmp2,
        Register tmp3) {
    Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
    assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);

    // len is kept biased by the size of the next chunk so that the flags of
    // each subs/adds can be used directly for the dispatch branches.
    subs(len, len, 128);
    br(Assembler::GE, CRC_by64_pre);
  BIND(CRC_less64);
    adds(len, len, 128-32);
    br(Assembler::GE, CRC_by32_loop);
  BIND(CRC_less32);
    adds(len, len, 32-4);
    br(Assembler::GE, CRC_by4_loop);
// kernel_crc32c_using_crc32c, continued: end of the dispatch, then the 32/4/1-byte loops and the 64-byte loop.
    adds(len, len, 4);
    br(Assembler::GT, CRC_by1_loop);
    b(L_exit);

  BIND(CRC_by32_loop);
    ldp(tmp0, tmp1, Address(post(buf, 16)));
    subs(len, len, 32);
    crc32cx(crc, crc, tmp0);
    ldr(tmp2, Address(post(buf, 8)));
    crc32cx(crc, crc, tmp1);
    ldr(tmp3, Address(post(buf, 8)));
    crc32cx(crc, crc, tmp2);
    crc32cx(crc, crc, tmp3);
    br(Assembler::GE, CRC_by32_loop);
    // len == -32 here means the input was consumed exactly.
    cmn(len, 32);
    br(Assembler::NE, CRC_less32);
    b(L_exit);

  BIND(CRC_by4_loop);
    ldrw(tmp0, Address(post(buf, 4)));
    subs(len, len, 4);
    crc32cw(crc, crc, tmp0);
    br(Assembler::GE, CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::LE, L_exit);
  BIND(CRC_by1_loop);
    ldrb(tmp0, Address(post(buf, 1)));
    subs(len, len, 1);
    crc32cb(crc, crc, tmp0);
    br(Assembler::GT, CRC_by1_loop);
    b(L_exit);

  BIND(CRC_by64_pre);
    // Prologue of the pipelined loop: buf is moved back by 8 so that the
    // last load of each group can use pre-indexed addressing; the final two
    // loaded words (tmp2, tmp3) are folded by the next loop iteration.
    sub(buf, buf, 8);
    ldp(tmp0, tmp1, Address(buf, 8));
    crc32cx(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 24));
    crc32cx(crc, crc, tmp1);
    ldr(tmp3, Address(buf, 32));
    crc32cx(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 40));
    crc32cx(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 48));
    crc32cx(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 56));
    crc32cx(crc, crc, tmp1);
    ldr(tmp3, Address(pre(buf, 64)));

    b(CRC_by64_loop);

    align(CodeEntryAlignment);
  BIND(CRC_by64_loop);
    subs(len, len, 64);
    crc32cx(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 8));
    crc32cx(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 16));
    crc32cx(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 24));
    crc32cx(crc, crc, tmp1);
    ldr(tmp3, Address(buf, 32));
    crc32cx(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 40));
    crc32cx(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 48));
    crc32cx(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 56));
    crc32cx(crc, crc, tmp1);
    ldr(tmp3, Address(pre(buf, 64)));
br(Assembler::GE, CRC_by64_loop); 3564 3565 // post-loop 3566 crc32cx(crc, crc, tmp2); 3567 crc32cx(crc, crc, tmp3); 3568 3569 sub(len, len, 64); 3570 add(buf, buf, 8); 3571 cmn(len, 128); 3572 br(Assembler::NE, CRC_less64); 3573 BIND(L_exit); 3574 } 3575 3576 /** 3577 * @param crc register containing existing CRC (32-bit) 3578 * @param buf register pointing to input byte buffer (byte*) 3579 * @param len register containing number of bytes 3580 * @param table register that will contain address of CRC table 3581 * @param tmp scratch register 3582 */ 3583 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len, 3584 Register table0, Register table1, Register table2, Register table3, 3585 Register tmp, Register tmp2, Register tmp3) { 3586 kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3); 3587 } 3588 3589 3590 SkipIfEqual::SkipIfEqual( 3591 MacroAssembler* masm, const bool* flag_addr, bool value) { 3592 _masm = masm; 3593 unsigned long offset; 3594 _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset); 3595 _masm->ldrb(rscratch1, Address(rscratch1, offset)); 3596 _masm->cbzw(rscratch1, _label); 3597 } 3598 3599 SkipIfEqual::~SkipIfEqual() { 3600 _masm->bind(_label); 3601 } 3602 3603 void MacroAssembler::addptr(const Address &dst, int32_t src) { 3604 Address adr; 3605 switch(dst.getMode()) { 3606 case Address::base_plus_offset: 3607 // This is the expected mode, although we allow all the other 3608 // forms below. 
// addptr, continued: form the effective address, then load / add / store.
    adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
    break;
  default:
    // Any other addressing mode: materialize the address in rscratch2.
    lea(rscratch2, dst);
    adr = Address(rscratch2);
    break;
  }
  ldr(rscratch1, adr);
  add(rscratch1, rscratch1, src);
  str(rscratch1, adr);
}

// Compares src1 with the 64-bit word loaded from src2; the result is left
// in the condition flags. rscratch1 is clobbered.
void MacroAssembler::cmpptr(Register src1, Address src2) {
  unsigned long offset;
  adrp(rscratch1, src2, offset);
  ldr(rscratch1, Address(rscratch1, offset));
  cmp(src1, rscratch1);
}

// Compares two oops by delegating to the barrier set assembler of the
// active barrier set.
void MacroAssembler::cmpoop(Register obj1, Register obj2) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->obj_equals(this, obj1, obj2);
}

// Loads the klass pointer of the object in src into dst. With
// UseCompressedClassPointers the 32-bit field is loaded and decoded.
void MacroAssembler::load_klass(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
    decode_klass_not_null(dst);
  } else {
    ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
  }
}

// ((OopHandle)result).resolve();
// result is both the handle (in) and the resolved oop (out); tmp is passed
// to the access as a temporary.
void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
  // OopHandle::resolve is an indirection.
3645 access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg); 3646 } 3647 3648 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) { 3649 const int mirror_offset = in_bytes(Klass::java_mirror_offset()); 3650 ldr(dst, Address(rmethod, Method::const_offset())); 3651 ldr(dst, Address(dst, ConstMethod::constants_offset())); 3652 ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes())); 3653 ldr(dst, Address(dst, mirror_offset)); 3654 resolve_oop_handle(dst, tmp); 3655 } 3656 3657 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) { 3658 if (UseCompressedClassPointers) { 3659 ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes())); 3660 if (Universe::narrow_klass_base() == NULL) { 3661 cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift()); 3662 return; 3663 } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0 3664 && Universe::narrow_klass_shift() == 0) { 3665 // Only the bottom 32 bits matter 3666 cmpw(trial_klass, tmp); 3667 return; 3668 } 3669 decode_klass_not_null(tmp); 3670 } else { 3671 ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes())); 3672 } 3673 cmp(trial_klass, tmp); 3674 } 3675 3676 void MacroAssembler::load_prototype_header(Register dst, Register src) { 3677 load_klass(dst, src); 3678 ldr(dst, Address(dst, Klass::prototype_header_offset())); 3679 } 3680 3681 void MacroAssembler::store_klass(Register dst, Register src) { 3682 // FIXME: Should this be a store release? concurrent gcs assumes 3683 // klass length is valid if klass field is not null. 
// store_klass, continued: with compressed class pointers src is encoded in place (and thereby modified).
  if (UseCompressedClassPointers) {
    encode_klass_not_null(src);
    strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
  } else {
    str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
  }
}

// Stores the 32-bit value in src into the klass gap of the object in dst.
// Emits nothing unless UseCompressedClassPointers is set.
void MacroAssembler::store_klass_gap(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    // Store to klass gap in destination
    strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
  }
}

// Algorithm must match CompressedOops::encode.
// Encodes the (possibly null) oop in s into d.
void MacroAssembler::encode_heap_oop(Register d, Register s) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
#endif
  verify_oop(s, "broken oop in encode_heap_oop");
  if (Universe::narrow_oop_base() == NULL) {
    if (Universe::narrow_oop_shift() != 0) {
      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
      lsr(d, s, LogMinObjAlignmentInBytes);
    } else {
      mov(d, s);
    }
  } else {
    // d = s - heapbase; if that borrows (s below the heap base, e.g. a null
    // oop) the result is replaced by zero, so null encodes to 0.
    subs(d, s, rheapbase);
    csel(d, d, zr, Assembler::HS);
    lsr(d, d, LogMinObjAlignmentInBytes);

    /*  Old algorithm: is this any worse?
    (NOTE(review): as written this sketch skips the subtraction for non-null values; cbz was presumably intended.)
    Label nonnull;
    cbnz(r, nonnull);
    sub(r, r, rheapbase);
    bind(nonnull);
    lsr(r, r, LogMinObjAlignmentInBytes);
    */
  }
}

// Encodes the oop in r in place. The oop must not be null; in ASSERT builds
// with CheckCompressedOops a null value stops the VM.
void MacroAssembler::encode_heap_oop_not_null(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    cbnz(r, ok);
    stop("null oop passed to encode_heap_oop_not_null");
    bind(ok);
  }
#endif
  verify_oop(r, "broken oop in encode_heap_oop_not_null");
  if (Universe::narrow_oop_base() != NULL) {
    sub(r, r, rheapbase);
  }
  if (Universe::narrow_oop_shift() != 0) {
    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    lsr(r, r, LogMinObjAlignmentInBytes);
  }
}

// Two-register variant: encodes the non-null oop in src into dst.
void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    cbnz(src, ok);
    stop("null oop passed to encode_heap_oop_not_null2");
    bind(ok);
  }
#endif
  verify_oop(src, "broken oop in encode_heap_oop_not_null2");

  // `data` tracks the register that currently holds the value; it stays
  // equal to src only if neither step below emitted an instruction.
  Register data = src;
  if (Universe::narrow_oop_base() != NULL) {
    sub(dst, src, rheapbase);
    data = dst;
  }
  if (Universe::narrow_oop_shift() != 0) {
    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    lsr(dst, data, LogMinObjAlignmentInBytes);
    data = dst;
  }
  // Nothing was emitted above: a plain move is still required.
  if (data == src)
    mov(dst, src);
}

// Decodes the (possibly null) narrow oop in s into d.
void MacroAssembler::decode_heap_oop(Register d, Register s) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
#endif
  if (Universe::narrow_oop_base() == NULL) {
    // Shift only; with a zero shift this is just a move, which is skipped
    // when d and s are the same register.
    if (Universe::narrow_oop_shift() != 0 || d != s) {
      lsl(d, s, Universe::narrow_oop_shift());
    }
  } else {
3782 Label done; 3783 if (d != s) 3784 mov(d, s); 3785 cbz(s, done); 3786 add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes); 3787 bind(done); 3788 } 3789 verify_oop(d, "broken oop in decode_heap_oop"); 3790 } 3791 3792 void MacroAssembler::decode_heap_oop_not_null(Register r) { 3793 assert (UseCompressedOops, "should only be used for compressed headers"); 3794 assert (Universe::heap() != NULL, "java heap should be initialized"); 3795 // Cannot assert, unverified entry point counts instructions (see .ad file) 3796 // vtableStubs also counts instructions in pd_code_size_limit. 3797 // Also do not verify_oop as this is called by verify_oop. 3798 if (Universe::narrow_oop_shift() != 0) { 3799 assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3800 if (Universe::narrow_oop_base() != NULL) { 3801 add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes); 3802 } else { 3803 add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes); 3804 } 3805 } else { 3806 assert (Universe::narrow_oop_base() == NULL, "sanity"); 3807 } 3808 } 3809 3810 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) { 3811 assert (UseCompressedOops, "should only be used for compressed headers"); 3812 assert (Universe::heap() != NULL, "java heap should be initialized"); 3813 // Cannot assert, unverified entry point counts instructions (see .ad file) 3814 // vtableStubs also counts instructions in pd_code_size_limit. 3815 // Also do not verify_oop as this is called by verify_oop. 
3816 if (Universe::narrow_oop_shift() != 0) { 3817 assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3818 if (Universe::narrow_oop_base() != NULL) { 3819 add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes); 3820 } else { 3821 add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes); 3822 } 3823 } else { 3824 assert (Universe::narrow_oop_base() == NULL, "sanity"); 3825 if (dst != src) { 3826 mov(dst, src); 3827 } 3828 } 3829 } 3830 3831 void MacroAssembler::encode_klass_not_null(Register dst, Register src) { 3832 if (Universe::narrow_klass_base() == NULL) { 3833 if (Universe::narrow_klass_shift() != 0) { 3834 assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); 3835 lsr(dst, src, LogKlassAlignmentInBytes); 3836 } else { 3837 if (dst != src) mov(dst, src); 3838 } 3839 return; 3840 } 3841 3842 if (use_XOR_for_compressed_class_base) { 3843 if (Universe::narrow_klass_shift() != 0) { 3844 eor(dst, src, (uint64_t)Universe::narrow_klass_base()); 3845 lsr(dst, dst, LogKlassAlignmentInBytes); 3846 } else { 3847 eor(dst, src, (uint64_t)Universe::narrow_klass_base()); 3848 } 3849 return; 3850 } 3851 3852 if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0 3853 && Universe::narrow_klass_shift() == 0) { 3854 movw(dst, src); 3855 return; 3856 } 3857 3858 #ifdef ASSERT 3859 verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?"); 3860 #endif 3861 3862 Register rbase = dst; 3863 if (dst == src) rbase = rheapbase; 3864 mov(rbase, (uint64_t)Universe::narrow_klass_base()); 3865 sub(dst, src, rbase); 3866 if (Universe::narrow_klass_shift() != 0) { 3867 assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); 3868 lsr(dst, dst, LogKlassAlignmentInBytes); 3869 } 3870 if (dst == src) reinit_heapbase(); 3871 } 3872 3873 void MacroAssembler::encode_klass_not_null(Register r) { 3874 encode_klass_not_null(r, r); 3875 } 3876 3877 void 
MacroAssembler::decode_klass_not_null(Register dst, Register src) { 3878 Register rbase = dst; 3879 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 3880 3881 if (Universe::narrow_klass_base() == NULL) { 3882 if (Universe::narrow_klass_shift() != 0) { 3883 assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); 3884 lsl(dst, src, LogKlassAlignmentInBytes); 3885 } else { 3886 if (dst != src) mov(dst, src); 3887 } 3888 return; 3889 } 3890 3891 if (use_XOR_for_compressed_class_base) { 3892 if (Universe::narrow_klass_shift() != 0) { 3893 lsl(dst, src, LogKlassAlignmentInBytes); 3894 eor(dst, dst, (uint64_t)Universe::narrow_klass_base()); 3895 } else { 3896 eor(dst, src, (uint64_t)Universe::narrow_klass_base()); 3897 } 3898 return; 3899 } 3900 3901 if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0 3902 && Universe::narrow_klass_shift() == 0) { 3903 if (dst != src) 3904 movw(dst, src); 3905 movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32); 3906 return; 3907 } 3908 3909 // Cannot assert, unverified entry point counts instructions (see .ad file) 3910 // vtableStubs also counts instructions in pd_code_size_limit. 3911 // Also do not verify_oop as this is called by verify_oop. 
3912 if (dst == src) rbase = rheapbase; 3913 mov(rbase, (uint64_t)Universe::narrow_klass_base()); 3914 if (Universe::narrow_klass_shift() != 0) { 3915 assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); 3916 add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes); 3917 } else { 3918 add(dst, rbase, src); 3919 } 3920 if (dst == src) reinit_heapbase(); 3921 } 3922 3923 void MacroAssembler::decode_klass_not_null(Register r) { 3924 decode_klass_not_null(r, r); 3925 } 3926 3927 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) { 3928 #ifdef ASSERT 3929 { 3930 ThreadInVMfromUnknown tiv; 3931 assert (UseCompressedOops, "should only be used for compressed oops"); 3932 assert (Universe::heap() != NULL, "java heap should be initialized"); 3933 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 3934 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop"); 3935 } 3936 #endif 3937 int oop_index = oop_recorder()->find_index(obj); 3938 InstructionMark im(this); 3939 RelocationHolder rspec = oop_Relocation::spec(oop_index); 3940 code_section()->relocate(inst_mark(), rspec); 3941 movz(dst, 0xDEAD, 16); 3942 movk(dst, 0xBEEF); 3943 } 3944 3945 void MacroAssembler::set_narrow_klass(Register dst, Klass* k) { 3946 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 3947 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 3948 int index = oop_recorder()->find_index(k); 3949 assert(! 
Universe::heap()->is_in_reserved(k), "should not be an oop"); 3950 3951 InstructionMark im(this); 3952 RelocationHolder rspec = metadata_Relocation::spec(index); 3953 code_section()->relocate(inst_mark(), rspec); 3954 narrowKlass nk = Klass::encode_klass(k); 3955 movz(dst, (nk >> 16), 16); 3956 movk(dst, nk & 0xffff); 3957 } 3958 3959 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, 3960 Register dst, Address src, 3961 Register tmp1, Register thread_tmp) { 3962 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3963 decorators = AccessInternal::decorator_fixup(decorators); 3964 bool as_raw = (decorators & AS_RAW) != 0; 3965 if (as_raw) { 3966 bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp); 3967 } else { 3968 bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp); 3969 } 3970 } 3971 3972 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, 3973 Address dst, Register src, 3974 Register tmp1, Register thread_tmp) { 3975 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3976 decorators = AccessInternal::decorator_fixup(decorators); 3977 bool as_raw = (decorators & AS_RAW) != 0; 3978 if (as_raw) { 3979 bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp); 3980 } else { 3981 bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp); 3982 } 3983 } 3984 3985 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1, 3986 Register thread_tmp, DecoratorSet decorators) { 3987 access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp); 3988 } 3989 3990 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1, 3991 Register thread_tmp, DecoratorSet decorators) { 3992 access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp); 3993 } 3994 3995 void MacroAssembler::store_heap_oop(Address dst, 
Register src, Register tmp1, 3996 Register thread_tmp, DecoratorSet decorators) { 3997 access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp); 3998 } 3999 4000 // Used for storing NULLs. 4001 void MacroAssembler::store_heap_oop_null(Address dst) { 4002 access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg); 4003 } 4004 4005 Address MacroAssembler::allocate_metadata_address(Metadata* obj) { 4006 assert(oop_recorder() != NULL, "this assembler needs a Recorder"); 4007 int index = oop_recorder()->allocate_metadata_index(obj); 4008 RelocationHolder rspec = metadata_Relocation::spec(index); 4009 return Address((address)obj, rspec); 4010 } 4011 4012 // Move an oop into a register. immediate is true if we want 4013 // immediate instrcutions, i.e. we are not going to patch this 4014 // instruction while the code is being executed by another thread. In 4015 // that case we can use move immediates rather than the constant pool. 4016 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) { 4017 int oop_index; 4018 if (obj == NULL) { 4019 oop_index = oop_recorder()->allocate_oop_index(obj); 4020 } else { 4021 #ifdef ASSERT 4022 { 4023 ThreadInVMfromUnknown tiv; 4024 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop"); 4025 } 4026 #endif 4027 oop_index = oop_recorder()->find_index(obj); 4028 } 4029 RelocationHolder rspec = oop_Relocation::spec(oop_index); 4030 if (! immediate) { 4031 address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address 4032 ldr_constant(dst, Address(dummy, rspec)); 4033 } else 4034 mov(dst, Address((address)obj, rspec)); 4035 } 4036 4037 // Move a metadata address into a register. 
// A NULL obj gets a freshly allocated metadata index; any other obj is looked
// up in the oop recorder.  The move carries a metadata relocation.
void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  int oop_index;
  if (obj == NULL) {
    oop_index = oop_recorder()->allocate_metadata_index(obj);
  } else {
    oop_index = oop_recorder()->find_index(obj);
  }
  RelocationHolder rspec = metadata_Relocation::spec(oop_index);
  mov(dst, Address((address)obj, rspec));
}

// Return an Address for the constant oop obj carrying an oop relocation for
// its index in the oop recorder.  Debug builds check that obj resolves to an
// address inside the reserved heap.
Address MacroAssembler::constant_oop_address(jobject obj) {
#ifdef ASSERT
  {
    ThreadInVMfromUnknown tiv;
    assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
    assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
  }
#endif
  int oop_index = oop_recorder()->find_index(obj);
  return Address((address)obj, oop_Relocation::spec(oop_index));
}

// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
// Forwards unchanged to the active BarrierSetAssembler's tlab_allocate.
void MacroAssembler::tlab_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Register t2,
                                   Label& slow_case) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
}

// Defines obj, preserves var_size_in_bytes
// Forwards unchanged to the active BarrierSetAssembler's eden_allocate.
void MacroAssembler::eden_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Label& slow_case) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
}

// Zero words; len is in bytes
// Clobbers len, t1, rscratch1 and rscratch2; addr is only read.
// len must be a nonzero multiple of wordSize
void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
  assert_different_registers(addr, len, t1, rscratch1, rscratch2);

#ifdef ASSERT
  { Label L;
    tst(len, BytesPerWord - 1);
    br(Assembler::EQ, L);
    stop("len is not a multiple of BytesPerWord");
    bind(L);
  }
#endif

#ifndef PRODUCT
  block_comment("zero memory");
#endif

  Label loop;
  Label entry;

//  Algorithm:
//
//    scratch1 = cnt & 7;
//    cnt -= scratch1;
//    p += scratch1;
//    switch (scratch1) {
//      do {
//        cnt -= 8;
//          p[-8] = 0;
//        case 7:
//          p[-7] = 0;
//        case 6:
//          p[-6] = 0;
//          // ...
//        case 1:
//          p[-1] = 0;
//        case 0:
//          p += 8;
//      } while (cnt);
//    }

  const int unroll = 8; // Number of str(zr) instructions we'll unroll

  lsr(len, len, LogBytesPerWord);    // len now counts words (cnt)
  andr(rscratch1, len, unroll - 1);  // rscratch1 = cnt % unroll
  sub(len, len, rscratch1);          // cnt -= cnt % unroll
  // t1 always points to the end of the region we're about to zero
  add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
  // Computed jump into the unrolled stores: start rscratch1 instructions
  // (4 bytes each, hence LSL 2) before 'entry' so that exactly the last
  // rscratch1 stores run on the first pass.
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
  br(rscratch2);
  bind(loop);
  sub(len, len, unroll);
  for (int i = -unroll; i < 0; i++)
    Assembler::str(zr, Address(t1, i * wordSize));
  bind(entry);
  add(t1, t1, unroll * wordSize);
  cbnz(len, loop);
}

// Debug-only (ASSERT builds with UseTLAB and VerifyOops): emit code checking
// tlab_start <= tlab_top <= tlab_end for the current thread, stopping the VM
// otherwise.  rscratch1 and rscratch2 are saved on the stack and restored.
void MacroAssembler::verify_tlab() {
#ifdef ASSERT
  if (UseTLAB && VerifyOops) {
    Label next, ok;

    stp(rscratch2, rscratch1, Address(pre(sp, -16)));

    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
    cmp(rscratch2, rscratch1);
    br(Assembler::HS, next);
    STOP("assert(top >= start)");
    should_not_reach_here();

    bind(next);
    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
    cmp(rscratch2, rscratch1);
    br(Assembler::HS, ok);
    STOP("assert(top <= end)");
    should_not_reach_here();

    bind(ok);
    ldp(rscratch2, rscratch1, Address(post(sp, 16)));
  }
#endif
}

// Writes to stack successive pages until offset reached to check for
// stack overflow + shadow pages.  This clobbers tmp, and also size (it is
// counted down by the loop) and rscratch1 (holds the page size).
void MacroAssembler::bang_stack_size(Register size, Register tmp) {
  assert_different_registers(tmp, size, rscratch1);
  mov(tmp, sp);
  // Bang stack for total size given plus shadow page size.
  // Bang one page at a time because large size can bang beyond yellow and
  // red zones.
  Label loop;
  mov(rscratch1, os::vm_page_size());
  bind(loop);
  lea(tmp, Address(tmp, -os::vm_page_size()));
  subsw(size, size, rscratch1);
  str(size, Address(tmp));
  br(Assembler::GT, loop);

  // Bang down shadow pages too.
  // At this point, (tmp-0) is the last address touched, so don't
  // touch it again.  Each iteration below first steps tmp down by one
  // page and then stores, so that address is skipped; the loop runs
  // (shadow zone size / page size) - 1 times.
  for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
    // this could be any sized move but this can be a debugging crumb
    // so the bigger the better.
    lea(tmp, Address(tmp, -os::vm_page_size()));
    str(size, Address(tmp));
  }
}


// Move the address of the polling page into dest.
4204 void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) { 4205 if (SafepointMechanism::uses_thread_local_poll()) { 4206 ldr(dest, Address(rthread, Thread::polling_page_offset())); 4207 } else { 4208 unsigned long off; 4209 adrp(dest, Address(page, rtype), off); 4210 assert(off == 0, "polling page must be page aligned"); 4211 } 4212 } 4213 4214 // Move the address of the polling page into r, then read the polling 4215 // page. 4216 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) { 4217 get_polling_page(r, page, rtype); 4218 return read_polling_page(r, rtype); 4219 } 4220 4221 // Read the polling page. The address of the polling page must 4222 // already be in r. 4223 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) { 4224 InstructionMark im(this); 4225 code_section()->relocate(inst_mark(), rtype); 4226 ldrw(zr, Address(r, 0)); 4227 return inst_mark(); 4228 } 4229 4230 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) { 4231 relocInfo::relocType rtype = dest.rspec().reloc()->type(); 4232 unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12; 4233 unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12; 4234 unsigned long dest_page = (unsigned long)dest.target() >> 12; 4235 long offset_low = dest_page - low_page; 4236 long offset_high = dest_page - high_page; 4237 4238 assert(is_valid_AArch64_address(dest.target()), "bad address"); 4239 assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address"); 4240 4241 InstructionMark im(this); 4242 code_section()->relocate(inst_mark(), dest.rspec()); 4243 // 8143067: Ensure that the adrp can reach the dest from anywhere within 4244 // the code cache so that if it is relocated we know it will still reach 4245 if (offset_high >= -(1<<20) && offset_low < (1<<20)) { 4246 _adrp(reg1, dest.target()); 4247 } else 
{ 4248 unsigned long target = (unsigned long)dest.target(); 4249 unsigned long adrp_target 4250 = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL); 4251 4252 _adrp(reg1, (address)adrp_target); 4253 movk(reg1, target >> 32, 32); 4254 } 4255 byte_offset = (unsigned long)dest.target() & 0xfff; 4256 } 4257 4258 void MacroAssembler::load_byte_map_base(Register reg) { 4259 jbyte *byte_map_base = 4260 ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base(); 4261 4262 if (is_valid_AArch64_address((address)byte_map_base)) { 4263 // Strictly speaking the byte_map_base isn't an address at all, 4264 // and it might even be negative. 4265 unsigned long offset; 4266 adrp(reg, ExternalAddress((address)byte_map_base), offset); 4267 // We expect offset to be zero with most collectors. 4268 if (offset != 0) { 4269 add(reg, reg, offset); 4270 } 4271 } else { 4272 mov(reg, (uint64_t)byte_map_base); 4273 } 4274 } 4275 4276 void MacroAssembler::build_frame(int framesize) { 4277 assert(framesize > 0, "framesize must be > 0"); 4278 if (framesize < ((1 << 9) + 2 * wordSize)) { 4279 sub(sp, sp, framesize); 4280 stp(rfp, lr, Address(sp, framesize - 2 * wordSize)); 4281 if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize); 4282 } else { 4283 stp(rfp, lr, Address(pre(sp, -2 * wordSize))); 4284 if (PreserveFramePointer) mov(rfp, sp); 4285 if (framesize < ((1 << 12) + 2 * wordSize)) 4286 sub(sp, sp, framesize - 2 * wordSize); 4287 else { 4288 mov(rscratch1, framesize - 2 * wordSize); 4289 sub(sp, sp, rscratch1); 4290 } 4291 } 4292 } 4293 4294 void MacroAssembler::remove_frame(int framesize) { 4295 assert(framesize > 0, "framesize must be > 0"); 4296 if (framesize < ((1 << 9) + 2 * wordSize)) { 4297 ldp(rfp, lr, Address(sp, framesize - 2 * wordSize)); 4298 add(sp, sp, framesize); 4299 } else { 4300 if (framesize < ((1 << 12) + 2 * wordSize)) 4301 add(sp, sp, framesize - 2 * wordSize); 4302 else { 4303 mov(rscratch1, framesize - 2 * 
wordSize); 4304 add(sp, sp, rscratch1); 4305 } 4306 ldp(rfp, lr, Address(post(sp, 2 * wordSize))); 4307 } 4308 } 4309 4310 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 4311 4312 // Search for str1 in str2 and return index or -1 4313 void MacroAssembler::string_indexof(Register str2, Register str1, 4314 Register cnt2, Register cnt1, 4315 Register tmp1, Register tmp2, 4316 Register tmp3, Register tmp4, 4317 Register tmp5, Register tmp6, 4318 int icnt1, Register result, int ae) { 4319 // NOTE: tmp5, tmp6 can be zr depending on specific method version 4320 Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH; 4321 4322 Register ch1 = rscratch1; 4323 Register ch2 = rscratch2; 4324 Register cnt1tmp = tmp1; 4325 Register cnt2tmp = tmp2; 4326 Register cnt1_neg = cnt1; 4327 Register cnt2_neg = cnt2; 4328 Register result_tmp = tmp4; 4329 4330 bool isL = ae == StrIntrinsicNode::LL; 4331 4332 bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL; 4333 bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU; 4334 int str1_chr_shift = str1_isL ? 0:1; 4335 int str2_chr_shift = str2_isL ? 0:1; 4336 int str1_chr_size = str1_isL ? 1:2; 4337 int str2_chr_size = str2_isL ? 1:2; 4338 chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb : 4339 (chr_insn)&MacroAssembler::ldrh; 4340 chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb : 4341 (chr_insn)&MacroAssembler::ldrh; 4342 chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw; 4343 chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr; 4344 4345 // Note, inline_string_indexOf() generates checks: 4346 // if (substr.count > string.count) return -1; 4347 // if (substr.count == 0) return 0; 4348 4349 // We have two strings, a source string in str2, cnt2 and a pattern string 4350 // in str1, cnt1. 
Find the 1st occurence of pattern in source or return -1. 4351 4352 // For larger pattern and source we use a simplified Boyer Moore algorithm. 4353 // With a small pattern and source we use linear scan. 4354 4355 if (icnt1 == -1) { 4356 sub(result_tmp, cnt2, cnt1); 4357 cmp(cnt1, 8); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256 4358 br(LT, LINEARSEARCH); 4359 dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty 4360 cmp_imm12(cnt1, 256); 4361 lsr(tmp1, cnt2, 2); 4362 ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM 4363 br(GE, LINEARSTUB); 4364 } 4365 4366 // The Boyer Moore alogorithm is based on the description here:- 4367 // 4368 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm 4369 // 4370 // This describes and algorithm with 2 shift rules. The 'Bad Character' rule 4371 // and the 'Good Suffix' rule. 4372 // 4373 // These rules are essentially heuristics for how far we can shift the 4374 // pattern along the search string. 4375 // 4376 // The implementation here uses the 'Bad Character' rule only because of the 4377 // complexity of initialisation for the 'Good Suffix' rule. 4378 // 4379 // This is also known as the Boyer-Moore-Horspool algorithm:- 4380 // 4381 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm 4382 // 4383 // This particular implementation has few java-specific optimizations. 
4384 // 4385 // #define ASIZE 256 4386 // 4387 // int bm(unsigned char *x, int m, unsigned char *y, int n) { 4388 // int i, j; 4389 // unsigned c; 4390 // unsigned char bc[ASIZE]; 4391 // 4392 // /* Preprocessing */ 4393 // for (i = 0; i < ASIZE; ++i) 4394 // bc[i] = m; 4395 // for (i = 0; i < m - 1; ) { 4396 // c = x[i]; 4397 // ++i; 4398 // // c < 256 for Latin1 string, so, no need for branch 4399 // #ifdef PATTERN_STRING_IS_LATIN1 4400 // bc[c] = m - i; 4401 // #else 4402 // if (c < ASIZE) bc[c] = m - i; 4403 // #endif 4404 // } 4405 // 4406 // /* Searching */ 4407 // j = 0; 4408 // while (j <= n - m) { 4409 // c = y[i+j]; 4410 // if (x[m-1] == c) 4411 // for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i); 4412 // if (i < 0) return j; 4413 // // c < 256 for Latin1 string, so, no need for branch 4414 // #ifdef SOURCE_STRING_IS_LATIN1 4415 // // LL case: (c< 256) always true. Remove branch 4416 // j += bc[y[j+m-1]]; 4417 // #endif 4418 // #ifndef PATTERN_STRING_IS_UTF 4419 // // UU case: need if (c<ASIZE) check. Skip 1 character if not. 4420 // if (c < ASIZE) 4421 // j += bc[y[j+m-1]]; 4422 // else 4423 // j += 1 4424 // #endif 4425 // #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF 4426 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not. 4427 // if (c < ASIZE) 4428 // j += bc[y[j+m-1]]; 4429 // else 4430 // j += m 4431 // #endif 4432 // } 4433 // } 4434 4435 if (icnt1 == -1) { 4436 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH, 4437 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP; 4438 Register cnt1end = tmp2; 4439 Register str2end = cnt2; 4440 Register skipch = tmp2; 4441 4442 // str1 length is >=8, so, we can read at least 1 register for cases when 4443 // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for 4444 // UL case. We'll re-read last character in inner pre-loop code to have 4445 // single outer pre-loop load 4446 const int firstStep = isL ? 
7 : 3; 4447 4448 const int ASIZE = 256; 4449 const int STORED_BYTES = 32; // amount of bytes stored per instruction 4450 sub(sp, sp, ASIZE); 4451 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations 4452 mov(ch1, sp); 4453 BIND(BM_INIT_LOOP); 4454 stpq(v0, v0, Address(post(ch1, STORED_BYTES))); 4455 subs(tmp5, tmp5, 1); 4456 br(GT, BM_INIT_LOOP); 4457 4458 sub(cnt1tmp, cnt1, 1); 4459 mov(tmp5, str2); 4460 add(str2end, str2, result_tmp, LSL, str2_chr_shift); 4461 sub(ch2, cnt1, 1); 4462 mov(tmp3, str1); 4463 BIND(BCLOOP); 4464 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size))); 4465 if (!str1_isL) { 4466 cmp_imm12(ch1, ASIZE); 4467 br(HS, BCSKIP); 4468 } 4469 strb(ch2, Address(sp, ch1)); 4470 BIND(BCSKIP); 4471 subs(ch2, ch2, 1); 4472 br(GT, BCLOOP); 4473 4474 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1 4475 if (str1_isL == str2_isL) { 4476 // load last 8 bytes (8LL/4UU symbols) 4477 ldr(tmp6, Address(tmp6, -wordSize)); 4478 } else { 4479 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols) 4480 // convert Latin1 to UTF. We'll have to wait until load completed, but 4481 // it's still faster than per-character loads+checks 4482 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1] 4483 ubfx(ch1, tmp6, 8, 8); // str1[N-2] 4484 ubfx(ch2, tmp6, 16, 8); // str1[N-3] 4485 andr(tmp6, tmp6, 0xFF); // str1[N-4] 4486 orr(ch2, ch1, ch2, LSL, 16); 4487 orr(tmp6, tmp6, tmp3, LSL, 48); 4488 orr(tmp6, tmp6, ch2, LSL, 16); 4489 } 4490 BIND(BMLOOPSTR2); 4491 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 4492 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8 4493 if (str1_isL == str2_isL) { 4494 // re-init tmp3. It's for free because it's executed in parallel with 4495 // load above. 
Alternative is to initialize it before loop, but it'll 4496 // affect performance on in-order systems with 2 or more ld/st pipelines 4497 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size)); 4498 } 4499 if (!isL) { // UU/UL case 4500 lsl(ch2, cnt1tmp, 1); // offset in bytes 4501 } 4502 cmp(tmp3, skipch); 4503 br(NE, BMSKIP); 4504 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2)); 4505 mov(ch1, tmp6); 4506 if (isL) { 4507 b(BMLOOPSTR1_AFTER_LOAD); 4508 } else { 4509 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8 4510 b(BMLOOPSTR1_CMP); 4511 } 4512 BIND(BMLOOPSTR1); 4513 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 4514 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 4515 BIND(BMLOOPSTR1_AFTER_LOAD); 4516 subs(cnt1tmp, cnt1tmp, 1); 4517 br(LT, BMLOOPSTR1_LASTCMP); 4518 BIND(BMLOOPSTR1_CMP); 4519 cmp(ch1, ch2); 4520 br(EQ, BMLOOPSTR1); 4521 BIND(BMSKIP); 4522 if (!isL) { 4523 // if we've met UTF symbol while searching Latin1 pattern, then we can 4524 // skip cnt1 symbols 4525 if (str1_isL != str2_isL) { 4526 mov(result_tmp, cnt1); 4527 } else { 4528 mov(result_tmp, 1); 4529 } 4530 cmp_imm12(skipch, ASIZE); 4531 br(HS, BMADV); 4532 } 4533 ldrb(result_tmp, Address(sp, skipch)); // load skip distance 4534 BIND(BMADV); 4535 sub(cnt1tmp, cnt1, 1); 4536 add(str2, str2, result_tmp, LSL, str2_chr_shift); 4537 cmp(str2, str2end); 4538 br(LE, BMLOOPSTR2); 4539 add(sp, sp, ASIZE); 4540 b(NOMATCH); 4541 BIND(BMLOOPSTR1_LASTCMP); 4542 cmp(ch1, ch2); 4543 br(NE, BMSKIP); 4544 BIND(BMMATCH); 4545 sub(result, str2, tmp5); 4546 if (!str2_isL) lsr(result, result, 1); 4547 add(sp, sp, ASIZE); 4548 b(DONE); 4549 4550 BIND(LINEARSTUB); 4551 cmp(cnt1, 16); // small patterns still should be handled by simple algorithm 4552 br(LT, LINEAR_MEDIUM); 4553 mov(result, zr); 4554 RuntimeAddress stub = NULL; 4555 if (isL) { 4556 stub = 
RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll()); 4557 assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated"); 4558 } else if (str1_isL) { 4559 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul()); 4560 assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated"); 4561 } else { 4562 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu()); 4563 assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated"); 4564 } 4565 trampoline_call(stub); 4566 b(DONE); 4567 } 4568 4569 BIND(LINEARSEARCH); 4570 { 4571 Label DO1, DO2, DO3; 4572 4573 Register str2tmp = tmp2; 4574 Register first = tmp3; 4575 4576 if (icnt1 == -1) 4577 { 4578 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT; 4579 4580 cmp(cnt1, str1_isL == str2_isL ? 4 : 2); 4581 br(LT, DOSHORT); 4582 BIND(LINEAR_MEDIUM); 4583 (this->*str1_load_1chr)(first, Address(str1)); 4584 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift))); 4585 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift); 4586 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4587 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4588 4589 BIND(FIRST_LOOP); 4590 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 4591 cmp(first, ch2); 4592 br(EQ, STR1_LOOP); 4593 BIND(STR2_NEXT); 4594 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4595 br(LE, FIRST_LOOP); 4596 b(NOMATCH); 4597 4598 BIND(STR1_LOOP); 4599 adds(cnt1tmp, cnt1_neg, str1_chr_size); 4600 add(cnt2tmp, cnt2_neg, str2_chr_size); 4601 br(GE, MATCH); 4602 4603 BIND(STR1_NEXT); 4604 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp)); 4605 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 4606 cmp(ch1, ch2); 4607 br(NE, STR2_NEXT); 4608 adds(cnt1tmp, cnt1tmp, str1_chr_size); 4609 add(cnt2tmp, cnt2tmp, str2_chr_size); 4610 br(LT, STR1_NEXT); 4611 b(MATCH); 4612 4613 BIND(DOSHORT); 4614 if (str1_isL == str2_isL) { 4615 cmp(cnt1, 2); 4616 
br(LT, DO1); 4617 br(GT, DO3); 4618 } 4619 } 4620 4621 if (icnt1 == 4) { 4622 Label CH1_LOOP; 4623 4624 (this->*load_4chr)(ch1, str1); 4625 sub(result_tmp, cnt2, 4); 4626 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4627 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4628 4629 BIND(CH1_LOOP); 4630 (this->*load_4chr)(ch2, Address(str2, cnt2_neg)); 4631 cmp(ch1, ch2); 4632 br(EQ, MATCH); 4633 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4634 br(LE, CH1_LOOP); 4635 b(NOMATCH); 4636 } 4637 4638 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) { 4639 Label CH1_LOOP; 4640 4641 BIND(DO2); 4642 (this->*load_2chr)(ch1, str1); 4643 if (icnt1 == 2) { 4644 sub(result_tmp, cnt2, 2); 4645 } 4646 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4647 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4648 BIND(CH1_LOOP); 4649 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 4650 cmp(ch1, ch2); 4651 br(EQ, MATCH); 4652 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4653 br(LE, CH1_LOOP); 4654 b(NOMATCH); 4655 } 4656 4657 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) { 4658 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP; 4659 4660 BIND(DO3); 4661 (this->*load_2chr)(first, str1); 4662 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size)); 4663 if (icnt1 == 3) { 4664 sub(result_tmp, cnt2, 3); 4665 } 4666 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4667 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4668 BIND(FIRST_LOOP); 4669 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 4670 cmpw(first, ch2); 4671 br(EQ, STR1_LOOP); 4672 BIND(STR2_NEXT); 4673 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4674 br(LE, FIRST_LOOP); 4675 b(NOMATCH); 4676 4677 BIND(STR1_LOOP); 4678 add(cnt2tmp, cnt2_neg, 2*str2_chr_size); 4679 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 4680 cmp(ch1, ch2); 4681 br(NE, STR2_NEXT); 4682 b(MATCH); 4683 } 4684 4685 if (icnt1 == -1 || icnt1 == 1) { 4686 Label CH1_LOOP, HAS_ZERO, 
DO1_SHORT, DO1_LOOP;

      BIND(DO1);
        (this->*str1_load_1chr)(ch1, str1);
        cmp(cnt2, 8);
        br(LT, DO1_SHORT);

        sub(result_tmp, cnt2, 8/str2_chr_size);
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
        mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

        if (str2_isL) {
          orr(ch1, ch1, ch1, LSL, 8);
        }
        orr(ch1, ch1, ch1, LSL, 16);
        orr(ch1, ch1, ch1, LSL, 32);
      BIND(CH1_LOOP);
        ldr(ch2, Address(str2, cnt2_neg));
        eor(ch2, ch1, ch2);
        sub(tmp1, ch2, tmp3);
        orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
        bics(tmp1, tmp1, tmp2);
        br(NE, HAS_ZERO);
        adds(cnt2_neg, cnt2_neg, 8);
        br(LT, CH1_LOOP);

        cmp(cnt2_neg, 8);
        mov(cnt2_neg, 0);
        br(LT, CH1_LOOP);
        b(NOMATCH);

      BIND(HAS_ZERO);
        rev(tmp1, tmp1);
        clz(tmp1, tmp1);
        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
        b(MATCH);

      BIND(DO1_SHORT);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmpw(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

// Pointer-to-member types used to pick, at code-generation time, which
// load (chr_insn) or zero-extend (uxt_insn) emitter a routine calls.
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

// Emit code that searches str1 for the 16-bit character ch.
//
// str1:   address of the first character; modified.
// cnt1:   number of characters; reused as a negative byte offset (cnt1_neg).
// ch:     character to find; in the "4 or more" path it is overwritten with
//         the character replicated into all four 16-bit lanes.
// result: index of the first match, or -1 if there is none.
// tmp1, tmp2, tmp3, rscratch1 and rscratch2 are used as scratch.
//
// NOTE(review): DO1_LOOP loads a character before testing the count, so
// cnt1 is presumably > 0 on entry -- confirm against the callers.
void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                         Register ch, Register result,
                                         Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  // Fewer than 4 characters: compare one character at a time.
  cmp(cnt1, 4);
  br(LT, DO1_SHORT);

  // Replicate the character into all four halfwords of ch.
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  // Point str1 at the last 4 characters and index backwards from there
  // with a negative byte offset that counts up towards zero.
  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    // A halfword of ch1 becomes zero exactly where the input matched ch.
    eor(ch1, ch, ch1);
    // tmp1 = (ch1 - 0x0001...) & ~(ch1 | 0x7fff...): non-zero iff some
    // halfword of ch1 is zero.
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // The offset reached or passed zero.  If it overshot by less than 8
    // bytes, examine the final 4 characters once more at offset 0.  This
    // relies on mov leaving the flags set by cmp intact for the branch.
    cmp(cnt1_neg, 8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    // Byte-reverse so the lowest-addressed marker becomes the most
    // significant one, then turn its bit position into a byte offset.
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // index = base index + (negative byte offset / 2)
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}

// Compare strings.
4809 void MacroAssembler::string_compare(Register str1, Register str2, 4810 Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2, 4811 FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) { 4812 Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB, 4813 DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, 4814 SHORT_LOOP_START, TAIL_CHECK; 4815 4816 const int STUB_THRESHOLD = 64 + 8; 4817 bool isLL = ae == StrIntrinsicNode::LL; 4818 bool isLU = ae == StrIntrinsicNode::LU; 4819 bool isUL = ae == StrIntrinsicNode::UL; 4820 4821 bool str1_isL = isLL || isLU; 4822 bool str2_isL = isLL || isUL; 4823 4824 int str1_chr_shift = str1_isL ? 0 : 1; 4825 int str2_chr_shift = str2_isL ? 0 : 1; 4826 int str1_chr_size = str1_isL ? 1 : 2; 4827 int str2_chr_size = str2_isL ? 1 : 2; 4828 int minCharsInWord = isLL ? wordSize : wordSize/2; 4829 4830 FloatRegister vtmpZ = vtmp1, vtmp = vtmp2; 4831 chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb : 4832 (chr_insn)&MacroAssembler::ldrh; 4833 chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb : 4834 (chr_insn)&MacroAssembler::ldrh; 4835 uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw : 4836 (uxt_insn)&MacroAssembler::uxthw; 4837 4838 BLOCK_COMMENT("string_compare {"); 4839 4840 // Bizzarely, the counts are passed in bytes, regardless of whether they 4841 // are L or U strings, however the result is always in characters. 4842 if (!str1_isL) asrw(cnt1, cnt1, 1); 4843 if (!str2_isL) asrw(cnt2, cnt2, 1); 4844 4845 // Compute the minimum of the string lengths and save the difference. 
4846 subsw(result, cnt1, cnt2); 4847 cselw(cnt2, cnt1, cnt2, Assembler::LE); // min 4848 4849 // A very short string 4850 cmpw(cnt2, minCharsInWord); 4851 br(Assembler::LT, SHORT_STRING); 4852 4853 // Compare longwords 4854 // load first parts of strings and finish initialization while loading 4855 { 4856 if (str1_isL == str2_isL) { // LL or UU 4857 ldr(tmp1, Address(str1)); 4858 cmp(str1, str2); 4859 br(Assembler::EQ, DONE); 4860 ldr(tmp2, Address(str2)); 4861 cmp(cnt2, STUB_THRESHOLD); 4862 br(GE, STUB); 4863 subsw(cnt2, cnt2, minCharsInWord); 4864 br(EQ, TAIL_CHECK); 4865 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 4866 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 4867 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 4868 } else if (isLU) { 4869 ldrs(vtmp, Address(str1)); 4870 cmp(str1, str2); 4871 br(Assembler::EQ, DONE); 4872 ldr(tmp2, Address(str2)); 4873 cmp(cnt2, STUB_THRESHOLD); 4874 br(GE, STUB); 4875 subsw(cnt2, cnt2, 4); 4876 br(EQ, TAIL_CHECK); 4877 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 4878 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 4879 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 4880 zip1(vtmp, T8B, vtmp, vtmpZ); 4881 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 4882 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 4883 add(cnt1, cnt1, 4); 4884 fmovd(tmp1, vtmp); 4885 } else { // UL case 4886 ldr(tmp1, Address(str1)); 4887 cmp(str1, str2); 4888 br(Assembler::EQ, DONE); 4889 ldrs(vtmp, Address(str2)); 4890 cmp(cnt2, STUB_THRESHOLD); 4891 br(GE, STUB); 4892 subsw(cnt2, cnt2, 4); 4893 br(EQ, TAIL_CHECK); 4894 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 4895 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 4896 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 4897 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 4898 zip1(vtmp, T8B, vtmp, vtmpZ); 4899 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 4900 add(cnt1, cnt1, 8); 4901 fmovd(tmp2, vtmp); 4902 } 4903 adds(cnt2, cnt2, isUL ? 
4 : 8); 4904 br(GE, TAIL); 4905 eor(rscratch2, tmp1, tmp2); 4906 cbnz(rscratch2, DIFFERENCE); 4907 // main loop 4908 bind(NEXT_WORD); 4909 if (str1_isL == str2_isL) { 4910 ldr(tmp1, Address(str1, cnt2)); 4911 ldr(tmp2, Address(str2, cnt2)); 4912 adds(cnt2, cnt2, 8); 4913 } else if (isLU) { 4914 ldrs(vtmp, Address(str1, cnt1)); 4915 ldr(tmp2, Address(str2, cnt2)); 4916 add(cnt1, cnt1, 4); 4917 zip1(vtmp, T8B, vtmp, vtmpZ); 4918 fmovd(tmp1, vtmp); 4919 adds(cnt2, cnt2, 8); 4920 } else { // UL 4921 ldrs(vtmp, Address(str2, cnt2)); 4922 ldr(tmp1, Address(str1, cnt1)); 4923 zip1(vtmp, T8B, vtmp, vtmpZ); 4924 add(cnt1, cnt1, 8); 4925 fmovd(tmp2, vtmp); 4926 adds(cnt2, cnt2, 4); 4927 } 4928 br(GE, TAIL); 4929 4930 eor(rscratch2, tmp1, tmp2); 4931 cbz(rscratch2, NEXT_WORD); 4932 b(DIFFERENCE); 4933 bind(TAIL); 4934 eor(rscratch2, tmp1, tmp2); 4935 cbnz(rscratch2, DIFFERENCE); 4936 // Last longword. In the case where length == 4 we compare the 4937 // same longword twice, but that's still faster than another 4938 // conditional branch. 4939 if (str1_isL == str2_isL) { 4940 ldr(tmp1, Address(str1)); 4941 ldr(tmp2, Address(str2)); 4942 } else if (isLU) { 4943 ldrs(vtmp, Address(str1)); 4944 ldr(tmp2, Address(str2)); 4945 zip1(vtmp, T8B, vtmp, vtmpZ); 4946 fmovd(tmp1, vtmp); 4947 } else { // UL 4948 ldrs(vtmp, Address(str2)); 4949 ldr(tmp1, Address(str1)); 4950 zip1(vtmp, T8B, vtmp, vtmpZ); 4951 fmovd(tmp2, vtmp); 4952 } 4953 bind(TAIL_CHECK); 4954 eor(rscratch2, tmp1, tmp2); 4955 cbz(rscratch2, DONE); 4956 4957 // Find the first different characters in the longwords and 4958 // compute their difference. 4959 bind(DIFFERENCE); 4960 rev(rscratch2, rscratch2); 4961 clz(rscratch2, rscratch2); 4962 andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 4963 lsrv(tmp1, tmp1, rscratch2); 4964 (this->*ext_chr)(tmp1, tmp1); 4965 lsrv(tmp2, tmp2, rscratch2); 4966 (this->*ext_chr)(tmp2, tmp2); 4967 subw(result, tmp1, tmp2); 4968 b(DONE); 4969 } 4970 4971 bind(STUB); 4972 RuntimeAddress stub = NULL; 4973 switch(ae) { 4974 case StrIntrinsicNode::LL: 4975 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL()); 4976 break; 4977 case StrIntrinsicNode::UU: 4978 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU()); 4979 break; 4980 case StrIntrinsicNode::LU: 4981 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU()); 4982 break; 4983 case StrIntrinsicNode::UL: 4984 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL()); 4985 break; 4986 default: 4987 ShouldNotReachHere(); 4988 } 4989 assert(stub.target() != NULL, "compare_long_string stub has not been generated"); 4990 trampoline_call(stub); 4991 b(DONE); 4992 4993 bind(SHORT_STRING); 4994 // Is the minimum length zero? 4995 cbz(cnt2, DONE); 4996 // arrange code to do most branches while loading and loading next characters 4997 // while comparing previous 4998 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 4999 subs(cnt2, cnt2, 1); 5000 br(EQ, SHORT_LAST_INIT); 5001 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 5002 b(SHORT_LOOP_START); 5003 bind(SHORT_LOOP); 5004 subs(cnt2, cnt2, 1); 5005 br(EQ, SHORT_LAST); 5006 bind(SHORT_LOOP_START); 5007 (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size))); 5008 (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size))); 5009 cmp(tmp1, cnt1); 5010 br(NE, SHORT_LOOP_TAIL); 5011 subs(cnt2, cnt2, 1); 5012 br(EQ, SHORT_LAST2); 5013 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 5014 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 5015 cmp(tmp2, rscratch1); 5016 br(EQ, SHORT_LOOP); 5017 sub(result, tmp2, rscratch1); 5018 b(DONE); 5019 bind(SHORT_LOOP_TAIL); 5020 
sub(result, tmp1, cnt1); 5021 b(DONE); 5022 bind(SHORT_LAST2); 5023 cmp(tmp2, rscratch1); 5024 br(EQ, DONE); 5025 sub(result, tmp2, rscratch1); 5026 5027 b(DONE); 5028 bind(SHORT_LAST_INIT); 5029 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 5030 bind(SHORT_LAST); 5031 cmp(tmp1, cnt1); 5032 br(EQ, DONE); 5033 sub(result, tmp1, cnt1); 5034 5035 bind(DONE); 5036 5037 BLOCK_COMMENT("} string_compare"); 5038 } 5039 5040 // This method checks if provided byte array contains byte with highest bit set. 5041 void MacroAssembler::has_negatives(Register ary1, Register len, Register result) { 5042 // Simple and most common case of aligned small array which is not at the 5043 // end of memory page is placed here. All other cases are in stub. 5044 Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE; 5045 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 5046 assert_different_registers(ary1, len, result); 5047 5048 cmpw(len, 0); 5049 br(LE, SET_RESULT); 5050 cmpw(len, 4 * wordSize); 5051 br(GE, STUB_LONG); // size > 32 then go to stub 5052 5053 int shift = 64 - exact_log2(os::vm_page_size()); 5054 lsl(rscratch1, ary1, shift); 5055 mov(rscratch2, (size_t)(4 * wordSize) << shift); 5056 adds(rscratch2, rscratch1, rscratch2); // At end of page? 
5057 br(CS, STUB); // at the end of page then go to stub 5058 subs(len, len, wordSize); 5059 br(LT, END); 5060 5061 BIND(LOOP); 5062 ldr(rscratch1, Address(post(ary1, wordSize))); 5063 tst(rscratch1, UPPER_BIT_MASK); 5064 br(NE, SET_RESULT); 5065 subs(len, len, wordSize); 5066 br(GE, LOOP); 5067 cmpw(len, -wordSize); 5068 br(EQ, SET_RESULT); 5069 5070 BIND(END); 5071 ldr(result, Address(ary1)); 5072 sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes 5073 lslv(result, result, len); 5074 tst(result, UPPER_BIT_MASK); 5075 b(SET_RESULT); 5076 5077 BIND(STUB); 5078 RuntimeAddress has_neg = RuntimeAddress(StubRoutines::aarch64::has_negatives()); 5079 assert(has_neg.target() != NULL, "has_negatives stub has not been generated"); 5080 trampoline_call(has_neg); 5081 b(DONE); 5082 5083 BIND(STUB_LONG); 5084 RuntimeAddress has_neg_long = RuntimeAddress( 5085 StubRoutines::aarch64::has_negatives_long()); 5086 assert(has_neg_long.target() != NULL, "has_negatives stub has not been generated"); 5087 trampoline_call(has_neg_long); 5088 b(DONE); 5089 5090 BIND(SET_RESULT); 5091 cset(result, NE); // set true or false 5092 5093 BIND(DONE); 5094 } 5095 5096 void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3, 5097 Register tmp4, Register tmp5, Register result, 5098 Register cnt1, int elem_size) { 5099 Label DONE, SAME; 5100 Register tmp1 = rscratch1; 5101 Register tmp2 = rscratch2; 5102 Register cnt2 = tmp2; // cnt2 only used in array length compare 5103 int elem_per_word = wordSize/elem_size; 5104 int log_elem_size = exact_log2(elem_size); 5105 int length_offset = arrayOopDesc::length_offset_in_bytes(); 5106 int base_offset 5107 = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE); 5108 int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 
0 : 16); 5109 5110 assert(elem_size == 1 || elem_size == 2, "must be char or byte"); 5111 assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2); 5112 5113 #ifndef PRODUCT 5114 { 5115 const char kind = (elem_size == 2) ? 'U' : 'L'; 5116 char comment[64]; 5117 snprintf(comment, sizeof comment, "array_equals%c{", kind); 5118 BLOCK_COMMENT(comment); 5119 } 5120 #endif 5121 5122 // if (a1 == a2) 5123 // return true; 5124 cmpoop(a1, a2); // May have read barriers for a1 and a2. 5125 br(EQ, SAME); 5126 5127 if (UseSimpleArrayEquals) { 5128 Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL; 5129 // if (a1 == null || a2 == null) 5130 // return false; 5131 // a1 & a2 == 0 means (some-pointer is null) or 5132 // (very-rare-or-even-probably-impossible-pointer-values) 5133 // so, we can save one branch in most cases 5134 tst(a1, a2); 5135 mov(result, false); 5136 br(EQ, A_MIGHT_BE_NULL); 5137 // if (a1.length != a2.length) 5138 // return false; 5139 bind(A_IS_NOT_NULL); 5140 ldrw(cnt1, Address(a1, length_offset)); 5141 ldrw(cnt2, Address(a2, length_offset)); 5142 eorw(tmp5, cnt1, cnt2); 5143 cbnzw(tmp5, DONE); 5144 lea(a1, Address(a1, base_offset)); 5145 lea(a2, Address(a2, base_offset)); 5146 // Check for short strings, i.e. smaller than wordSize. 5147 subs(cnt1, cnt1, elem_per_word); 5148 br(Assembler::LT, SHORT); 5149 // Main 8 byte comparison loop. 5150 bind(NEXT_WORD); { 5151 ldr(tmp1, Address(post(a1, wordSize))); 5152 ldr(tmp2, Address(post(a2, wordSize))); 5153 subs(cnt1, cnt1, elem_per_word); 5154 eor(tmp5, tmp1, tmp2); 5155 cbnz(tmp5, DONE); 5156 } br(GT, NEXT_WORD); 5157 // Last longword. In the case where length == 4 we compare the 5158 // same longword twice, but that's still faster than another 5159 // conditional branch. 5160 // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when 5161 // length == 4. 
5162 if (log_elem_size > 0) 5163 lsl(cnt1, cnt1, log_elem_size); 5164 ldr(tmp3, Address(a1, cnt1)); 5165 ldr(tmp4, Address(a2, cnt1)); 5166 eor(tmp5, tmp3, tmp4); 5167 cbnz(tmp5, DONE); 5168 b(SAME); 5169 bind(A_MIGHT_BE_NULL); 5170 // in case both a1 and a2 are not-null, proceed with loads 5171 cbz(a1, DONE); 5172 cbz(a2, DONE); 5173 b(A_IS_NOT_NULL); 5174 bind(SHORT); 5175 5176 tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left. 5177 { 5178 ldrw(tmp1, Address(post(a1, 4))); 5179 ldrw(tmp2, Address(post(a2, 4))); 5180 eorw(tmp5, tmp1, tmp2); 5181 cbnzw(tmp5, DONE); 5182 } 5183 bind(TAIL03); 5184 tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left. 5185 { 5186 ldrh(tmp3, Address(post(a1, 2))); 5187 ldrh(tmp4, Address(post(a2, 2))); 5188 eorw(tmp5, tmp3, tmp4); 5189 cbnzw(tmp5, DONE); 5190 } 5191 bind(TAIL01); 5192 if (elem_size == 1) { // Only needed when comparing byte arrays. 5193 tbz(cnt1, 0, SAME); // 0-1 bytes left. 5194 { 5195 ldrb(tmp1, a1); 5196 ldrb(tmp2, a2); 5197 eorw(tmp5, tmp1, tmp2); 5198 cbnzw(tmp5, DONE); 5199 } 5200 } 5201 } else { 5202 Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT, 5203 CSET_EQ, LAST_CHECK; 5204 mov(result, false); 5205 cbz(a1, DONE); 5206 ldrw(cnt1, Address(a1, length_offset)); 5207 cbz(a2, DONE); 5208 ldrw(cnt2, Address(a2, length_offset)); 5209 // on most CPUs a2 is still "locked"(surprisingly) in ldrw and it's 5210 // faster to perform another branch before comparing a1 and a2 5211 cmp(cnt1, elem_per_word); 5212 br(LE, SHORT); // short or same 5213 ldr(tmp3, Address(pre(a1, base_offset))); 5214 cmp(cnt1, stubBytesThreshold); 5215 br(GE, STUB); 5216 ldr(tmp4, Address(pre(a2, base_offset))); 5217 sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size); 5218 cmp(cnt2, cnt1); 5219 br(NE, DONE); 5220 5221 // Main 16 byte comparison loop with 2 exits 5222 bind(NEXT_DWORD); { 5223 ldr(tmp1, Address(pre(a1, wordSize))); 5224 ldr(tmp2, Address(pre(a2, wordSize))); 5225 subs(cnt1, cnt1, 2 * elem_per_word); 5226 br(LE, TAIL); 5227 
eor(tmp4, tmp3, tmp4); 5228 cbnz(tmp4, DONE); 5229 ldr(tmp3, Address(pre(a1, wordSize))); 5230 ldr(tmp4, Address(pre(a2, wordSize))); 5231 cmp(cnt1, elem_per_word); 5232 br(LE, TAIL2); 5233 cmp(tmp1, tmp2); 5234 } br(EQ, NEXT_DWORD); 5235 b(DONE); 5236 5237 bind(TAIL); 5238 eor(tmp4, tmp3, tmp4); 5239 eor(tmp2, tmp1, tmp2); 5240 lslv(tmp2, tmp2, tmp5); 5241 orr(tmp5, tmp4, tmp2); 5242 cmp(tmp5, zr); 5243 b(CSET_EQ); 5244 5245 bind(TAIL2); 5246 eor(tmp2, tmp1, tmp2); 5247 cbnz(tmp2, DONE); 5248 b(LAST_CHECK); 5249 5250 bind(STUB); 5251 ldr(tmp4, Address(pre(a2, base_offset))); 5252 cmp(cnt2, cnt1); 5253 br(NE, DONE); 5254 if (elem_size == 2) { // convert to byte counter 5255 lsl(cnt1, cnt1, 1); 5256 } 5257 eor(tmp5, tmp3, tmp4); 5258 cbnz(tmp5, DONE); 5259 RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals()); 5260 assert(stub.target() != NULL, "array_equals_long stub has not been generated"); 5261 trampoline_call(stub); 5262 b(DONE); 5263 5264 bind(EARLY_OUT); 5265 // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2) 5266 // so, if a2 == null => return false(0), else return true, so we can return a2 5267 mov(result, a2); 5268 b(DONE); 5269 bind(SHORT); 5270 cmp(cnt2, cnt1); 5271 br(NE, DONE); 5272 cbz(cnt1, SAME); 5273 sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size); 5274 ldr(tmp3, Address(a1, base_offset)); 5275 ldr(tmp4, Address(a2, base_offset)); 5276 bind(LAST_CHECK); 5277 eor(tmp4, tmp3, tmp4); 5278 lslv(tmp5, tmp4, tmp5); 5279 cmp(tmp5, zr); 5280 bind(CSET_EQ); 5281 cset(result, EQ); 5282 b(DONE); 5283 } 5284 5285 bind(SAME); 5286 mov(result, true); 5287 // That's it. 5288 bind(DONE); 5289 5290 BLOCK_COMMENT("} array_equals"); 5291 } 5292 5293 // Compare Strings 5294 5295 // For Strings we're passed the address of the first characters in a1 5296 // and a2 and the length in cnt1. 5297 // elem_size is the element size in bytes: either 1 or 2. 5298 // There are two implementations. 
For arrays >= 8 bytes, all 5299 // comparisons (including the final one, which may overlap) are 5300 // performed 8 bytes at a time. For strings < 8 bytes, we compare a 5301 // halfword, then a short, and then a byte. 5302 5303 void MacroAssembler::string_equals(Register a1, Register a2, 5304 Register result, Register cnt1, int elem_size) 5305 { 5306 Label SAME, DONE, SHORT, NEXT_WORD; 5307 Register tmp1 = rscratch1; 5308 Register tmp2 = rscratch2; 5309 Register cnt2 = tmp2; // cnt2 only used in array length compare 5310 5311 assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte"); 5312 assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2); 5313 5314 #ifndef PRODUCT 5315 { 5316 const char kind = (elem_size == 2) ? 'U' : 'L'; 5317 char comment[64]; 5318 snprintf(comment, sizeof comment, "{string_equals%c", kind); 5319 BLOCK_COMMENT(comment); 5320 } 5321 #endif 5322 5323 mov(result, false); 5324 5325 // Check for short strings, i.e. smaller than wordSize. 5326 subs(cnt1, cnt1, wordSize); 5327 br(Assembler::LT, SHORT); 5328 // Main 8 byte comparison loop. 5329 bind(NEXT_WORD); { 5330 ldr(tmp1, Address(post(a1, wordSize))); 5331 ldr(tmp2, Address(post(a2, wordSize))); 5332 subs(cnt1, cnt1, wordSize); 5333 eor(tmp1, tmp1, tmp2); 5334 cbnz(tmp1, DONE); 5335 } br(GT, NEXT_WORD); 5336 // Last longword. In the case where length == 4 we compare the 5337 // same longword twice, but that's still faster than another 5338 // conditional branch. 5339 // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when 5340 // length == 4. 5341 ldr(tmp1, Address(a1, cnt1)); 5342 ldr(tmp2, Address(a2, cnt1)); 5343 eor(tmp2, tmp1, tmp2); 5344 cbnz(tmp2, DONE); 5345 b(SAME); 5346 5347 bind(SHORT); 5348 Label TAIL03, TAIL01; 5349 5350 tbz(cnt1, 2, TAIL03); // 0-7 bytes left. 
5351 { 5352 ldrw(tmp1, Address(post(a1, 4))); 5353 ldrw(tmp2, Address(post(a2, 4))); 5354 eorw(tmp1, tmp1, tmp2); 5355 cbnzw(tmp1, DONE); 5356 } 5357 bind(TAIL03); 5358 tbz(cnt1, 1, TAIL01); // 0-3 bytes left. 5359 { 5360 ldrh(tmp1, Address(post(a1, 2))); 5361 ldrh(tmp2, Address(post(a2, 2))); 5362 eorw(tmp1, tmp1, tmp2); 5363 cbnzw(tmp1, DONE); 5364 } 5365 bind(TAIL01); 5366 if (elem_size == 1) { // Only needed when comparing 1-byte elements 5367 tbz(cnt1, 0, SAME); // 0-1 bytes left. 5368 { 5369 ldrb(tmp1, a1); 5370 ldrb(tmp2, a2); 5371 eorw(tmp1, tmp1, tmp2); 5372 cbnzw(tmp1, DONE); 5373 } 5374 } 5375 // Arrays are equal. 5376 bind(SAME); 5377 mov(result, true); 5378 5379 // That's it. 5380 bind(DONE); 5381 BLOCK_COMMENT("} string_equals"); 5382 } 5383 5384 5385 // The size of the blocks erased by the zero_blocks stub. We must 5386 // handle anything smaller than this ourselves in zero_words(). 5387 const int MacroAssembler::zero_words_block_size = 8; 5388 5389 // zero_words() is used by C2 ClearArray patterns. It is as small as 5390 // possible, handling small word counts locally and delegating 5391 // anything larger to the zero_blocks stub. It is expanded many times 5392 // in compiled code, so it is important to keep it short. 5393 5394 // ptr: Address of a buffer to be zeroed. 5395 // cnt: Count in HeapWords. 5396 // 5397 // ptr, cnt, rscratch1, and rscratch2 are clobbered. 
5398 void MacroAssembler::zero_words(Register ptr, Register cnt) 5399 { 5400 assert(is_power_of_2(zero_words_block_size), "adjust this"); 5401 assert(ptr == r10 && cnt == r11, "mismatch in register usage"); 5402 5403 BLOCK_COMMENT("zero_words {"); 5404 cmp(cnt, zero_words_block_size); 5405 Label around, done, done16; 5406 br(LO, around); 5407 { 5408 RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks()); 5409 assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated"); 5410 if (StubRoutines::aarch64::complete()) { 5411 trampoline_call(zero_blocks); 5412 } else { 5413 bl(zero_blocks); 5414 } 5415 } 5416 bind(around); 5417 for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) { 5418 Label l; 5419 tbz(cnt, exact_log2(i), l); 5420 for (int j = 0; j < i; j += 2) { 5421 stp(zr, zr, post(ptr, 16)); 5422 } 5423 bind(l); 5424 } 5425 { 5426 Label l; 5427 tbz(cnt, 0, l); 5428 str(zr, Address(ptr)); 5429 bind(l); 5430 } 5431 BLOCK_COMMENT("} zero_words"); 5432 } 5433 5434 // base: Address of a buffer to be zeroed, 8 bytes aligned. 5435 // cnt: Immediate count in HeapWords. 
// Size in bytes up to which zero_words(base, cnt) emits straight-line
// stores instead of a loop.
#define SmallArraySize (18 * BytesPerLong)
void MacroAssembler::zero_words(Register base, u_int64_t cnt)
{
  // cnt is known at code-generation time, so the C++ loops below decide
  // how many store instructions are emitted.  base itself is not modified;
  // rscratch1 and rscratch2 are used only in the looping variant.
  BLOCK_COMMENT("zero_words {");
  int i = cnt & 1; // store any odd word to start
  if (i) str(zr, Address(base));

  if (cnt <= SmallArraySize / BytesPerLong) {
    // Small count: one stp per remaining pair of words, no loop.
    for (; i < (int)cnt; i += 2)
      stp(zr, zr, Address(base, i * wordSize));
  } else {
    const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
    // Words that do not fill a whole unrolled iteration are stored first.
    int remainder = cnt % (2 * unroll);
    for (; i < remainder; i += 2)
      stp(zr, zr, Address(base, i * wordSize));

    Label loop;
    Register cnt_reg = rscratch1;
    Register loop_base = rscratch2;
    cnt = cnt - remainder;
    mov(cnt_reg, cnt);
    // adjust base and prebias by -2 * wordSize so we can pre-increment
    add(loop_base, base, (remainder - 2) * wordSize);
    bind(loop);
    sub(cnt_reg, cnt_reg, 2 * unroll);
    for (i = 1; i < unroll; i++)
      stp(zr, zr, Address(loop_base, 2 * i * wordSize));
    // The last store of the iteration also advances loop_base.
    stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
    cbnz(cnt_reg, loop);
  }
  BLOCK_COMMENT("} zero_words");
}

// Zero blocks of memory by using DC ZVA.
//
// Aligns the base address first sufficiently for DC ZVA, then uses
// DC ZVA repeatedly for every full block.  cnt is the size to be
// zeroed in HeapWords.  Returns the count of words left to be zeroed
// in cnt.
//
// NOTE: This is intended to be used in the zero_blocks() stub.  If
// you want to use it elsewhere, note that cnt must be >= 2*zva_length.
//
// base is advanced past the zeroed region; rscratch1 and rscratch2 are
// clobbered.
void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
  Register tmp = rscratch1;
  Register tmp2 = rscratch2;
  int zva_length = VM_Version::zva_length();
  Label initial_table_end, loop_zva;
  Label fini;

  // Base must be 16 byte aligned. If not just return and let caller handle it
  tst(base, 0x0f);
  br(Assembler::NE, fini);
  // Align base with ZVA length.
  neg(tmp, base);
  andr(tmp, tmp, zva_length - 1);

  // tmp: the number of bytes to be filled to align the base with ZVA length.
  add(base, base, tmp);
  sub(cnt, cnt, tmp, Assembler::ASR, 3);
  // Jump into the table of stp instructions below so that exactly tmp
  // bytes are stored: each stp covers 16 bytes and occupies 4 bytes of
  // code, hence the shift right by 2.
  adr(tmp2, initial_table_end);
  sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
  br(tmp2);

  for (int i = -zva_length + 16; i < 0; i += 16)
    stp(zr, zr, Address(base, i));
  bind(initial_table_end);

  // Pre-subtract one block so the loop can test the count with GE.
  sub(cnt, cnt, zva_length >> 3);
  bind(loop_zva);
  dc(Assembler::ZVA, base);
  subs(cnt, cnt, zva_length >> 3);
  add(base, base, zva_length);
  br(Assembler::GE, loop_zva);
  add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
  bind(fini);
}

// base:   Address of a buffer to be filled, 8 bytes aligned.
// cnt:    Count in 8-byte unit.
// value:  Value to be filled with.
// base will point to the end of the buffer after filling.
//
// cnt, rscratch1 and rscratch2 are clobbered.
void MacroAssembler::fill_words(Register base, Register cnt, Register value)
{
//  Algorithm:
//
//    scratch1 = cnt & 7;
//    cnt -= scratch1;
//    p += scratch1;
//    switch (scratch1) {
//      do {
//        cnt -= 8;
//          p[-8] = v;
//        case 7:
//          p[-7] = v;
//        case 6:
//          p[-6] = v;
//          // ...
//        case 1:
//          p[-1] = v;
//        case 0:
//          p += 8;
//      } while (cnt);
//    }
//
//  NOTE(review): the pseudo-code above is only a sketch.  The emitted code
//  works in pairs of words (stp): it masks cnt with (unroll-1) * 2, steps
//  by unroll * 2 words per iteration, and handles a leading misaligned
//  word and a trailing odd word separately.

  assert_different_registers(base, cnt, value, rscratch1, rscratch2);

  Label fini, skip, entry, loop;
  const int unroll = 8; // Number of stp instructions we'll unroll

  cbz(cnt, fini);
  // If bit 3 of base is set, store one word first so that the stp
  // sequence starts on a 16-byte boundary.
  tbz(base, 3, skip);
  str(value, Address(post(base, 8)));
  sub(cnt, cnt, 1);
  bind(skip);

  // rscratch1 = number of words (even) stored by the partial first pass.
  andr(rscratch1, cnt, (unroll-1) * 2);
  sub(cnt, cnt, rscratch1);
  add(base, base, rscratch1, Assembler::LSL, 3);
  // Enter the unrolled stp sequence part-way: two words per 4-byte stp
  // means stepping back rscratch1 * 2 bytes of code from 'entry'.
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
  br(rscratch2);

  bind(loop);
  add(base, base, unroll * 16);
  for (int i = -unroll; i < 0; i++)
    stp(value, value, Address(base, i * 16));
  bind(entry);
  subs(cnt, cnt, unroll * 2);
  br(Assembler::GE, loop);

  // Bit 0 of cnt still tells whether one odd word remains.
  tbz(cnt, 0, fini);
  str(value, Address(post(base, 8)));
  bind(fini);
}

// Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
// java/lang/StringUTF16.compress.
void MacroAssembler::encode_iso_array(Register src, Register dst,
                      Register len, Register result,
                      FloatRegister Vtmp1, FloatRegister Vtmp2,
                      FloatRegister Vtmp3, FloatRegister Vtmp4)
{
    // Copies the low byte of each 16-bit character at src to dst, stopping
    // at the first character whose high byte is non-zero.
    //
    // src, dst: advanced as characters are consumed / bytes are written.
    // len:      number of characters; on exit the count not yet processed
    //           (0 if every character was encoded).
    // result:   initial len minus remaining len, i.e. the index where
    //           encoding stopped.
    // rscratch1, rscratch2 and Vtmp1..Vtmp4 are clobbered.
    //
    // NOTE(review): v4 and v5 are also written directly below although they
    // are not parameters -- confirm callers treat them as clobbered.
    Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
        NEXT_32_START, NEXT_32_PRFM_START;
    Register tmp1 = rscratch1, tmp2 = rscratch2;

      mov(result, len); // Save initial len

#ifndef BUILTIN_SIM
      cmp(len, 8); // handle shortest strings first
      br(LT, LOOP_1);
      cmp(len, 32);
      br(LT, NEXT_8);
      // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
      // to convert chars to bytes
      if (SoftwarePrefetchHintDistance >= 0) {
        ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        cmp(len, SoftwarePrefetchHintDistance/2 + 16);
        br(LE, NEXT_32_START);
        b(NEXT_32_PRFM_START);
        // 32 characters per iteration, with prefetch, while enough input
        // remains beyond the prefetch distance.
        BIND(NEXT_32_PRFM);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_PRFM_START);
          prfm(Address(src, SoftwarePrefetchHintDistance));
          orr(v4, T16B, Vtmp1, Vtmp2);
          orr(v5, T16B, Vtmp3, Vtmp4);
          uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
          uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
          // The 32 low bytes are stored before the high bytes are checked;
          // dst and len are only advanced if the check passes.
          stpq(Vtmp1, Vtmp3, dst);
          uzp2(v5, T16B, v4, v5); // high bytes
          umov(tmp2, v5, D, 1);
          fmovd(tmp1, v5);
          orr(tmp1, tmp1, tmp2);
          cbnz(tmp1, LOOP_8);
          sub(len, len, 32);
          add(dst, dst, 32);
          add(src, src, 64);
          cmp(len, SoftwarePrefetchHintDistance/2 + 16);
          br(GE, NEXT_32_PRFM);
          cmp(len, 32);
          br(LT, LOOP_8);
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_START);
      } else {
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
      }
      // NOTE(review): this prfm is emitted even when
      // SoftwarePrefetchHintDistance is negative -- confirm that is intended.
      prfm(Address(src, SoftwarePrefetchHintDistance));
      uzp1(v4, T16B, Vtmp1, Vtmp2);
      uzp1(v5, T16B, Vtmp3, Vtmp4);
      stpq(v4, v5, dst);
      orr(Vtmp1, T16B, Vtmp1, Vtmp2);
      orr(Vtmp3, T16B, Vtmp3, Vtmp4);
      uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
      umov(tmp2, Vtmp1, D, 1);
      fmovd(tmp1, Vtmp1);
      orr(tmp1, tmp1, tmp2);
      cbnz(tmp1, LOOP_8);
      sub(len, len, 32);
      add(dst, dst, 32);
      add(src, src, 64);
      cmp(len, 32);
      br(GE, NEXT_32);
      cbz(len, DONE);

    // 8 characters per iteration.
    BIND(LOOP_8);
      cmp(len, 8);
      br(LT, LOOP_1);
    BIND(NEXT_8);
      ld1(Vtmp1, T8H, src);
      uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
      uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
      strd(Vtmp2, dst);
      fmovd(tmp1, Vtmp3);
      cbnz(tmp1, NEXT_1);

      sub(len, len, 8);
      add(dst, dst, 8);
      add(src, src, 16);
      cmp(len, 8);
      br(GE, NEXT_8);

    BIND(LOOP_1);
#endif
    // One character at a time; this loop pinpoints the first character
    // that does not fit in a byte.
    cbz(len, DONE);
    BIND(NEXT_1);
      ldrh(tmp1, Address(post(src, 2)));
      strb(tmp1, Address(post(dst, 1)));
      tst(tmp1, 0xff00);
      br(NE, SET_RESULT);
      subs(len, len, 1);
      br(GT, NEXT_1);

    BIND(SET_RESULT);
      sub(result, result, len); // Return index where we stopped
                                // Return len == 0 if we processed all
                                // characters
    BIND(DONE);
}


// Inflate byte[] array to char[].
//
// src, dst: source bytes / destination chars; both are advanced.
// len:      number of bytes to inflate; clobbered.
// vtmp1 is zeroed and used as the interleave source; vtmp2, vtmp3 and tmp4
// are scratch.
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
                                        FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                        Register tmp4) {
  Label big, done, after_init, to_stub;

  assert_different_registers(src, dst, len, tmp4, rscratch1);

  fmovd(vtmp1, zr);
  // tmp4 = number of whole 8-byte groups.
  lsrw(tmp4, len, 3);
  bind(after_init);
  cbnzw(tmp4, big);
  // Short string: less than 8 bytes.
  {
    Label loop, tiny;

    cmpw(len, 4);
    br(LT, tiny);
    // Use SIMD to do 4 bytes.
    ldrs(vtmp2, post(src, 4));
    zip1(vtmp3, T8B, vtmp2, vtmp1);
    subw(len, len, 4);
    strd(vtmp3, post(dst, 8));

    cbzw(len, done);

    // Do the remaining bytes by steam.
    bind(loop);
    ldrb(tmp4, post(src, 1));
    strh(tmp4, post(dst, 2));
    subw(len, len, 1);

    bind(tiny);
    cbnz(len, loop);

    b(done);
  }

  if (SoftwarePrefetchHintDistance >= 0) {
    // Large inputs are handed to the stub; control then re-enters at
    // after_init to deal with whatever the stub left in tmp4 and len.
    bind(to_stub);
      RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
      assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
      trampoline_call(stub);
      b(after_init);
  }

  // Unpack the bytes 8 at a time.
  bind(big);
  {
    Label loop, around, loop_last, loop_start;

    if (SoftwarePrefetchHintDistance >= 0) {
      // Threshold is expressed in 8-byte groups, like tmp4.
      const int large_loop_threshold = (64 + 16)/8;
      ldrd(vtmp2, post(src, 8));
      // From here on len holds only the tail length (0..7 bytes).
      andw(len, len, 7);
      cmp(tmp4, large_loop_threshold);
      br(GE, to_stub);
      b(loop_start);

      // Two groups per iteration, with loads interleaved between the
      // zip/store of the previous group.
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      bind(loop_start);
      subs(tmp4, tmp4, 1);
      br(EQ, loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      ldrd(vtmp3, post(src, 8));
      st1(vtmp2, T8H, post(dst, 16));
      subs(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp3, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      br(NE, loop);
      b(around);
      bind(loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      st1(vtmp2, T8H, post(dst, 16));
      bind(around);
      cbz(len, done);
    } else {
      andw(len, len, 7);
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      sub(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp2, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      cbnz(tmp4, loop);
    }
  }

  // Do the tail of up to 8 bytes.
  // The last 8 source bytes are re-read ending exactly at the end of the
  // input, so this store may overlap characters that were already written.
  add(src, src, len);
  ldrd(vtmp3, Address(src, -8));
  add(dst, dst, len, ext::uxtw, 1);
  zip1(vtmp3, T16B, vtmp3, vtmp1);
  strq(vtmp3, Address(dst, -16));

  bind(done);
}

// Compress char[] array to byte[].
5777 void MacroAssembler::char_array_compress(Register src, Register dst, Register len, 5778 FloatRegister tmp1Reg, FloatRegister tmp2Reg, 5779 FloatRegister tmp3Reg, FloatRegister tmp4Reg, 5780 Register result) { 5781 encode_iso_array(src, dst, len, result, 5782 tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg); 5783 cmp(len, zr); 5784 csel(result, result, zr, EQ); 5785 } 5786 5787 // get_thread() can be called anywhere inside generated code so we 5788 // need to save whatever non-callee save context might get clobbered 5789 // by the call to JavaThread::aarch64_get_thread_helper() or, indeed, 5790 // the call setup code. 5791 // 5792 // aarch64_get_thread_helper() clobbers only r0, r1, and flags. 5793 // 5794 void MacroAssembler::get_thread(Register dst) { 5795 RegSet saved_regs = RegSet::range(r0, r1) + lr - dst; 5796 push(saved_regs, sp); 5797 5798 mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper)); 5799 blrt(lr, 1, 0, 1); 5800 if (dst != c_rarg0) { 5801 mov(dst, c_rarg0); 5802 } 5803 5804 pop(saved_regs, sp); 5805 }