1 /* 2 * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 
23 * 24 */ 25 26 #include <sys/types.h> 27 28 #include "precompiled.hpp" 29 #include "jvm.h" 30 #include "asm/assembler.hpp" 31 #include "asm/assembler.inline.hpp" 32 #include "gc/shared/barrierSet.hpp" 33 #include "gc/shared/cardTable.hpp" 34 #include "gc/shared/barrierSetAssembler.hpp" 35 #include "gc/shared/cardTableBarrierSet.hpp" 36 #include "interpreter/interpreter.hpp" 37 #include "compiler/disassembler.hpp" 38 #include "memory/resourceArea.hpp" 39 #include "nativeInst_aarch64.hpp" 40 #include "oops/accessDecorators.hpp" 41 #include "oops/compressedOops.inline.hpp" 42 #include "oops/klass.inline.hpp" 43 #include "oops/oop.hpp" 44 #include "opto/compile.hpp" 45 #include "opto/intrinsicnode.hpp" 46 #include "opto/node.hpp" 47 #include "runtime/biasedLocking.hpp" 48 #include "runtime/icache.hpp" 49 #include "runtime/interfaceSupport.inline.hpp" 50 #include "runtime/jniHandles.inline.hpp" 51 #include "runtime/sharedRuntime.hpp" 52 #include "runtime/thread.hpp" 53 54 #ifdef PRODUCT 55 #define BLOCK_COMMENT(str) /* nothing */ 56 #define STOP(error) stop(error) 57 #else 58 #define BLOCK_COMMENT(str) block_comment(str) 59 #define STOP(error) block_comment(error); stop(error) 60 #endif 61 62 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 63 64 // Patch any kind of instruction; there may be several instructions. 65 // Return the total length (in bytes) of the instructions. 
66 int MacroAssembler::pd_patch_instruction_size(address branch, address target) { 67 int instructions = 1; 68 assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant"); 69 long offset = (target - branch) >> 2; 70 unsigned insn = *(unsigned*)branch; 71 if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) { 72 // Load register (literal) 73 Instruction_aarch64::spatch(branch, 23, 5, offset); 74 } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) { 75 // Unconditional branch (immediate) 76 Instruction_aarch64::spatch(branch, 25, 0, offset); 77 } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) { 78 // Conditional branch (immediate) 79 Instruction_aarch64::spatch(branch, 23, 5, offset); 80 } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) { 81 // Compare & branch (immediate) 82 Instruction_aarch64::spatch(branch, 23, 5, offset); 83 } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) { 84 // Test & branch (immediate) 85 Instruction_aarch64::spatch(branch, 18, 5, offset); 86 } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) { 87 // PC-rel. addressing 88 offset = target-branch; 89 int shift = Instruction_aarch64::extract(insn, 31, 31); 90 if (shift) { 91 u_int64_t dest = (u_int64_t)target; 92 uint64_t pc_page = (uint64_t)branch >> 12; 93 uint64_t adr_page = (uint64_t)target >> 12; 94 unsigned offset_lo = dest & 0xfff; 95 offset = adr_page - pc_page; 96 97 // We handle 4 types of PC relative addressing 98 // 1 - adrp Rx, target_page 99 // ldr/str Ry, [Rx, #offset_in_page] 100 // 2 - adrp Rx, target_page 101 // add Ry, Rx, #offset_in_page 102 // 3 - adrp Rx, target_page (page aligned reloc, offset == 0) 103 // movk Rx, #imm16<<32 104 // 4 - adrp Rx, target_page (page aligned reloc, offset == 0) 105 // In the first 3 cases we must check that Rx is the same in the adrp and the 106 // subsequent ldr/str, add or movk instruction. 
Otherwise we could accidentally end 107 // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened 108 // to be followed by a random unrelated ldr/str, add or movk instruction. 109 // 110 unsigned insn2 = ((unsigned*)branch)[1]; 111 if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 && 112 Instruction_aarch64::extract(insn, 4, 0) == 113 Instruction_aarch64::extract(insn2, 9, 5)) { 114 // Load/store register (unsigned immediate) 115 unsigned size = Instruction_aarch64::extract(insn2, 31, 30); 116 Instruction_aarch64::patch(branch + sizeof (unsigned), 117 21, 10, offset_lo >> size); 118 guarantee(((dest >> size) << size) == dest, "misaligned target"); 119 instructions = 2; 120 } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 && 121 Instruction_aarch64::extract(insn, 4, 0) == 122 Instruction_aarch64::extract(insn2, 4, 0)) { 123 // add (immediate) 124 Instruction_aarch64::patch(branch + sizeof (unsigned), 125 21, 10, offset_lo); 126 instructions = 2; 127 } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 && 128 Instruction_aarch64::extract(insn, 4, 0) == 129 Instruction_aarch64::extract(insn2, 4, 0)) { 130 // movk #imm16<<32 131 Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32); 132 long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L); 133 long pc_page = (long)branch >> 12; 134 long adr_page = (long)dest >> 12; 135 offset = adr_page - pc_page; 136 instructions = 2; 137 } 138 } 139 int offset_lo = offset & 3; 140 offset >>= 2; 141 Instruction_aarch64::spatch(branch, 23, 5, offset); 142 Instruction_aarch64::patch(branch, 30, 29, offset_lo); 143 } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) { 144 u_int64_t dest = (u_int64_t)target; 145 // Move wide constant 146 assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch"); 147 assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch"); 148 
Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff); 149 Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff); 150 Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff); 151 assert(target_addr_for_insn(branch) == target, "should be"); 152 instructions = 3; 153 } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 && 154 Instruction_aarch64::extract(insn, 4, 0) == 0b11111) { 155 // nothing to do 156 assert(target == 0, "did not expect to relocate target for polling page load"); 157 } else { 158 ShouldNotReachHere(); 159 } 160 return instructions * NativeInstruction::instruction_size; 161 } 162 163 int MacroAssembler::patch_oop(address insn_addr, address o) { 164 int instructions; 165 unsigned insn = *(unsigned*)insn_addr; 166 assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch"); 167 168 // OOPs are either narrow (32 bits) or wide (48 bits). We encode 169 // narrow OOPs by setting the upper 16 bits in the first 170 // instruction. 171 if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) { 172 // Move narrow OOP 173 narrowOop n = CompressedOops::encode((oop)o); 174 Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16); 175 Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff); 176 instructions = 2; 177 } else { 178 // Move wide OOP 179 assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch"); 180 uintptr_t dest = (uintptr_t)o; 181 Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff); 182 Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff); 183 Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff); 184 instructions = 3; 185 } 186 return instructions * NativeInstruction::instruction_size; 187 } 188 189 int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) { 190 // Metatdata pointers are either narrow (32 bits) or wide (48 bits). 
191 // We encode narrow ones by setting the upper 16 bits in the first 192 // instruction. 193 NativeInstruction *insn = nativeInstruction_at(insn_addr); 194 assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 && 195 nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch"); 196 197 Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16); 198 Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff); 199 return 2 * NativeInstruction::instruction_size; 200 } 201 202 address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) { 203 long offset = 0; 204 if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) { 205 // Load register (literal) 206 offset = Instruction_aarch64::sextract(insn, 23, 5); 207 return address(((uint64_t)insn_addr + (offset << 2))); 208 } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) { 209 // Unconditional branch (immediate) 210 offset = Instruction_aarch64::sextract(insn, 25, 0); 211 } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) { 212 // Conditional branch (immediate) 213 offset = Instruction_aarch64::sextract(insn, 23, 5); 214 } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) { 215 // Compare & branch (immediate) 216 offset = Instruction_aarch64::sextract(insn, 23, 5); 217 } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) { 218 // Test & branch (immediate) 219 offset = Instruction_aarch64::sextract(insn, 18, 5); 220 } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) { 221 // PC-rel. addressing 222 offset = Instruction_aarch64::extract(insn, 30, 29); 223 offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2; 224 int shift = Instruction_aarch64::extract(insn, 31, 31) ? 
12 : 0; 225 if (shift) { 226 offset <<= shift; 227 uint64_t target_page = ((uint64_t)insn_addr) + offset; 228 target_page &= ((uint64_t)-1) << shift; 229 // Return the target address for the following sequences 230 // 1 - adrp Rx, target_page 231 // ldr/str Ry, [Rx, #offset_in_page] 232 // 2 - adrp Rx, target_page 233 // add Ry, Rx, #offset_in_page 234 // 3 - adrp Rx, target_page (page aligned reloc, offset == 0) 235 // movk Rx, #imm12<<32 236 // 4 - adrp Rx, target_page (page aligned reloc, offset == 0) 237 // 238 // In the first two cases we check that the register is the same and 239 // return the target_page + the offset within the page. 240 // Otherwise we assume it is a page aligned relocation and return 241 // the target page only. 242 // 243 unsigned insn2 = ((unsigned*)insn_addr)[1]; 244 if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 && 245 Instruction_aarch64::extract(insn, 4, 0) == 246 Instruction_aarch64::extract(insn2, 9, 5)) { 247 // Load/store register (unsigned immediate) 248 unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10); 249 unsigned int size = Instruction_aarch64::extract(insn2, 31, 30); 250 return address(target_page + (byte_offset << size)); 251 } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 && 252 Instruction_aarch64::extract(insn, 4, 0) == 253 Instruction_aarch64::extract(insn2, 4, 0)) { 254 // add (immediate) 255 unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10); 256 return address(target_page + byte_offset); 257 } else { 258 if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 && 259 Instruction_aarch64::extract(insn, 4, 0) == 260 Instruction_aarch64::extract(insn2, 4, 0)) { 261 target_page = (target_page & 0xffffffff) | 262 ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32); 263 } 264 return (address)target_page; 265 } 266 } else { 267 ShouldNotReachHere(); 268 } 269 } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) { 
270 u_int32_t *insns = (u_int32_t *)insn_addr; 271 // Move wide constant: movz, movk, movk. See movptr(). 272 assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch"); 273 assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch"); 274 return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5)) 275 + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16) 276 + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32)); 277 } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 && 278 Instruction_aarch64::extract(insn, 4, 0) == 0b11111) { 279 return 0; 280 } else { 281 ShouldNotReachHere(); 282 } 283 return address(((uint64_t)insn_addr + (offset << 2))); 284 } 285 286 void MacroAssembler::serialize_memory(Register thread, Register tmp) { 287 dsb(Assembler::SY); 288 } 289 290 void MacroAssembler::safepoint_poll(Label& slow_path) { 291 if (SafepointMechanism::uses_thread_local_poll()) { 292 ldr(rscratch1, Address(rthread, Thread::polling_page_offset())); 293 tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path); 294 } else { 295 unsigned long offset; 296 adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset); 297 ldrw(rscratch1, Address(rscratch1, offset)); 298 assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code"); 299 cbnz(rscratch1, slow_path); 300 } 301 } 302 303 // Just like safepoint_poll, but use an acquiring load for thread- 304 // local polling. 305 // 306 // We need an acquire here to ensure that any subsequent load of the 307 // global SafepointSynchronize::_state flag is ordered after this load 308 // of the local Thread::_polling page. We don't want this poll to 309 // return false (i.e. not safepointing) and a later poll of the global 310 // SafepointSynchronize::_state spuriously to return true. 
311 // 312 // This is to avoid a race when we're in a native->Java transition 313 // racing the code which wakes up from a safepoint. 314 // 315 void MacroAssembler::safepoint_poll_acquire(Label& slow_path) { 316 if (SafepointMechanism::uses_thread_local_poll()) { 317 lea(rscratch1, Address(rthread, Thread::polling_page_offset())); 318 ldar(rscratch1, rscratch1); 319 tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path); 320 } else { 321 safepoint_poll(slow_path); 322 } 323 } 324 325 void MacroAssembler::reset_last_Java_frame(bool clear_fp) { 326 // we must set sp to zero to clear frame 327 str(zr, Address(rthread, JavaThread::last_Java_sp_offset())); 328 329 // must clear fp, so that compiled frames are not confused; it is 330 // possible that we need it only for debugging 331 if (clear_fp) { 332 str(zr, Address(rthread, JavaThread::last_Java_fp_offset())); 333 } 334 335 // Always clear the pc because it could have been set by make_walkable() 336 str(zr, Address(rthread, JavaThread::last_Java_pc_offset())); 337 } 338 339 // Calls to C land 340 // 341 // When entering C land, the rfp, & resp of the last Java frame have to be recorded 342 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp 343 // has to be reset to 0. This is required to allow proper stack traversal. 
344 void MacroAssembler::set_last_Java_frame(Register last_java_sp, 345 Register last_java_fp, 346 Register last_java_pc, 347 Register scratch) { 348 349 if (last_java_pc->is_valid()) { 350 str(last_java_pc, Address(rthread, 351 JavaThread::frame_anchor_offset() 352 + JavaFrameAnchor::last_Java_pc_offset())); 353 } 354 355 // determine last_java_sp register 356 if (last_java_sp == sp) { 357 mov(scratch, sp); 358 last_java_sp = scratch; 359 } else if (!last_java_sp->is_valid()) { 360 last_java_sp = esp; 361 } 362 363 str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset())); 364 365 // last_java_fp is optional 366 if (last_java_fp->is_valid()) { 367 str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset())); 368 } 369 } 370 371 void MacroAssembler::set_last_Java_frame(Register last_java_sp, 372 Register last_java_fp, 373 address last_java_pc, 374 Register scratch) { 375 if (last_java_pc != NULL) { 376 adr(scratch, last_java_pc); 377 } else { 378 // FIXME: This is almost never correct. We should delete all 379 // cases of set_last_Java_frame with last_java_pc=NULL and use the 380 // correct return address instead. 
381 adr(scratch, pc()); 382 } 383 384 str(scratch, Address(rthread, 385 JavaThread::frame_anchor_offset() 386 + JavaFrameAnchor::last_Java_pc_offset())); 387 388 set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch); 389 } 390 391 void MacroAssembler::set_last_Java_frame(Register last_java_sp, 392 Register last_java_fp, 393 Label &L, 394 Register scratch) { 395 if (L.is_bound()) { 396 set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch); 397 } else { 398 InstructionMark im(this); 399 L.add_patch_at(code(), locator()); 400 set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch); 401 } 402 } 403 404 void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) { 405 assert(ReservedCodeCacheSize < 4*G, "branch out of range"); 406 assert(CodeCache::find_blob(entry.target()) != NULL, 407 "destination of far call not found in code cache"); 408 if (far_branches()) { 409 unsigned long offset; 410 // We can use ADRP here because we know that the total size of 411 // the code cache cannot exceed 2Gb. 412 adrp(tmp, entry, offset); 413 add(tmp, tmp, offset); 414 if (cbuf) cbuf->set_insts_mark(); 415 blr(tmp); 416 } else { 417 if (cbuf) cbuf->set_insts_mark(); 418 bl(entry); 419 } 420 } 421 422 void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) { 423 assert(ReservedCodeCacheSize < 4*G, "branch out of range"); 424 assert(CodeCache::find_blob(entry.target()) != NULL, 425 "destination of far call not found in code cache"); 426 if (far_branches()) { 427 unsigned long offset; 428 // We can use ADRP here because we know that the total size of 429 // the code cache cannot exceed 2Gb. 
430 adrp(tmp, entry, offset); 431 add(tmp, tmp, offset); 432 if (cbuf) cbuf->set_insts_mark(); 433 br(tmp); 434 } else { 435 if (cbuf) cbuf->set_insts_mark(); 436 b(entry); 437 } 438 } 439 440 void MacroAssembler::reserved_stack_check() { 441 // testing if reserved zone needs to be enabled 442 Label no_reserved_zone_enabling; 443 444 ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset())); 445 cmp(sp, rscratch1); 446 br(Assembler::LO, no_reserved_zone_enabling); 447 448 enter(); // LR and FP are live. 449 lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone)); 450 mov(c_rarg0, rthread); 451 blr(rscratch1); 452 leave(); 453 454 // We have already removed our own frame. 455 // throw_delayed_StackOverflowError will think that it's been 456 // called by our caller. 457 lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry())); 458 br(rscratch1); 459 should_not_reach_here(); 460 461 bind(no_reserved_zone_enabling); 462 } 463 464 int MacroAssembler::biased_locking_enter(Register lock_reg, 465 Register obj_reg, 466 Register swap_reg, 467 Register tmp_reg, 468 bool swap_reg_contains_mark, 469 Label& done, 470 Label* slow_case, 471 BiasedLockingCounters* counters) { 472 assert(UseBiasedLocking, "why call this otherwise?"); 473 assert_different_registers(lock_reg, obj_reg, swap_reg); 474 475 if (PrintBiasedLockingStatistics && counters == NULL) 476 counters = BiasedLocking::counters(); 477 478 assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg); 479 assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout"); 480 Address mark_addr (obj_reg, oopDesc::mark_offset_in_bytes()); 481 Address klass_addr (obj_reg, oopDesc::klass_offset_in_bytes()); 482 Address saved_mark_addr(lock_reg, 0); 483 484 // Biased locking 485 // See whether the lock is currently biased toward our 
thread and 486 // whether the epoch is still valid 487 // Note that the runtime guarantees sufficient alignment of JavaThread 488 // pointers to allow age to be placed into low bits 489 // First check to see whether biasing is even enabled for this object 490 Label cas_label; 491 int null_check_offset = -1; 492 if (!swap_reg_contains_mark) { 493 null_check_offset = offset(); 494 ldr(swap_reg, mark_addr); 495 } 496 andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place); 497 cmp(tmp_reg, markOopDesc::biased_lock_pattern); 498 br(Assembler::NE, cas_label); 499 // The bias pattern is present in the object's header. Need to check 500 // whether the bias owner and the epoch are both still current. 501 load_prototype_header(tmp_reg, obj_reg); 502 orr(tmp_reg, tmp_reg, rthread); 503 eor(tmp_reg, swap_reg, tmp_reg); 504 andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place)); 505 if (counters != NULL) { 506 Label around; 507 cbnz(tmp_reg, around); 508 atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2); 509 b(done); 510 bind(around); 511 } else { 512 cbz(tmp_reg, done); 513 } 514 515 Label try_revoke_bias; 516 Label try_rebias; 517 518 // At this point we know that the header has the bias pattern and 519 // that we are not the bias owner in the current epoch. We need to 520 // figure out more details about the state of the header in order to 521 // know what operations can be legally performed on the object's 522 // header. 523 524 // If the low three bits in the xor result aren't clear, that means 525 // the prototype header is no longer biased and we have to revoke 526 // the bias on this object. 527 andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place); 528 cbnz(rscratch1, try_revoke_bias); 529 530 // Biasing is still enabled for this data type. 
See whether the 531 // epoch of the current bias is still valid, meaning that the epoch 532 // bits of the mark word are equal to the epoch bits of the 533 // prototype header. (Note that the prototype header's epoch bits 534 // only change at a safepoint.) If not, attempt to rebias the object 535 // toward the current thread. Note that we must be absolutely sure 536 // that the current epoch is invalid in order to do this because 537 // otherwise the manipulations it performs on the mark word are 538 // illegal. 539 andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place); 540 cbnz(rscratch1, try_rebias); 541 542 // The epoch of the current bias is still valid but we know nothing 543 // about the owner; it might be set or it might be clear. Try to 544 // acquire the bias of the object using an atomic operation. If this 545 // fails we will go in to the runtime to revoke the object's bias. 546 // Note that we first construct the presumed unbiased header so we 547 // don't accidentally blow away another thread's valid bias. 548 { 549 Label here; 550 mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place); 551 andr(swap_reg, swap_reg, rscratch1); 552 orr(tmp_reg, swap_reg, rthread); 553 cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case); 554 // If the biasing toward our thread failed, this means that 555 // another thread succeeded in biasing it toward itself and we 556 // need to revoke that bias. The revocation will occur in the 557 // interpreter runtime in the slow case. 558 bind(here); 559 if (counters != NULL) { 560 atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()), 561 tmp_reg, rscratch1, rscratch2); 562 } 563 } 564 b(done); 565 566 bind(try_rebias); 567 // At this point we know the epoch has expired, meaning that the 568 // current "bias owner", if any, is actually invalid. 
Under these 569 // circumstances _only_, we are allowed to use the current header's 570 // value as the comparison value when doing the cas to acquire the 571 // bias in the current epoch. In other words, we allow transfer of 572 // the bias from one thread to another directly in this situation. 573 // 574 // FIXME: due to a lack of registers we currently blow away the age 575 // bits in this situation. Should attempt to preserve them. 576 { 577 Label here; 578 load_prototype_header(tmp_reg, obj_reg); 579 orr(tmp_reg, rthread, tmp_reg); 580 cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case); 581 // If the biasing toward our thread failed, then another thread 582 // succeeded in biasing it toward itself and we need to revoke that 583 // bias. The revocation will occur in the runtime in the slow case. 584 bind(here); 585 if (counters != NULL) { 586 atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()), 587 tmp_reg, rscratch1, rscratch2); 588 } 589 } 590 b(done); 591 592 bind(try_revoke_bias); 593 // The prototype mark in the klass doesn't have the bias bit set any 594 // more, indicating that objects of this data type are not supposed 595 // to be biased any more. We are going to try to reset the mark of 596 // this object to the prototype value and fall through to the 597 // CAS-based locking scheme. Note that if our CAS fails, it means 598 // that another thread raced us for the privilege of revoking the 599 // bias of this particular object, so it's okay to continue in the 600 // normal locking code. 601 // 602 // FIXME: due to a lack of registers we currently blow away the age 603 // bits in this situation. Should attempt to preserve them. 
604 { 605 Label here, nope; 606 load_prototype_header(tmp_reg, obj_reg); 607 cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope); 608 bind(here); 609 610 // Fall through to the normal CAS-based lock, because no matter what 611 // the result of the above CAS, some thread must have succeeded in 612 // removing the bias bit from the object's header. 613 if (counters != NULL) { 614 atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg, 615 rscratch1, rscratch2); 616 } 617 bind(nope); 618 } 619 620 bind(cas_label); 621 622 return null_check_offset; 623 } 624 625 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) { 626 assert(UseBiasedLocking, "why call this otherwise?"); 627 628 // Check for biased locking unlock case, which is a no-op 629 // Note: we do not have to check the thread ID for two reasons. 630 // First, the interpreter checks for IllegalMonitorStateException at 631 // a higher level. Second, if the bias was revoked while we held the 632 // lock, the object could not be rebiased toward another thread, so 633 // the bias bit would be clear. 
634 ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 635 andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 636 cmp(temp_reg, markOopDesc::biased_lock_pattern); 637 br(Assembler::EQ, done); 638 } 639 640 static void pass_arg0(MacroAssembler* masm, Register arg) { 641 if (c_rarg0 != arg ) { 642 masm->mov(c_rarg0, arg); 643 } 644 } 645 646 static void pass_arg1(MacroAssembler* masm, Register arg) { 647 if (c_rarg1 != arg ) { 648 masm->mov(c_rarg1, arg); 649 } 650 } 651 652 static void pass_arg2(MacroAssembler* masm, Register arg) { 653 if (c_rarg2 != arg ) { 654 masm->mov(c_rarg2, arg); 655 } 656 } 657 658 static void pass_arg3(MacroAssembler* masm, Register arg) { 659 if (c_rarg3 != arg ) { 660 masm->mov(c_rarg3, arg); 661 } 662 } 663 664 void MacroAssembler::call_VM_base(Register oop_result, 665 Register java_thread, 666 Register last_java_sp, 667 address entry_point, 668 int number_of_arguments, 669 bool check_exceptions) { 670 // determine java_thread register 671 if (!java_thread->is_valid()) { 672 java_thread = rthread; 673 } 674 675 // determine last_java_sp register 676 if (!last_java_sp->is_valid()) { 677 last_java_sp = esp; 678 } 679 680 // debugging support 681 assert(number_of_arguments >= 0 , "cannot have negative number of arguments"); 682 assert(java_thread == rthread, "unexpected register"); 683 #ifdef ASSERT 684 // TraceBytecodes does not use r12 but saves it over the call, so don't verify 685 // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?"); 686 #endif // ASSERT 687 688 assert(java_thread != oop_result , "cannot use the same register for java_thread & oop_result"); 689 assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp"); 690 691 // push java thread (becomes first argument of C function) 692 693 mov(c_rarg0, java_thread); 694 695 // set last Java frame before call 696 assert(last_java_sp != 
rfp, "can't use rfp"); 697 698 Label l; 699 set_last_Java_frame(last_java_sp, rfp, l, rscratch1); 700 701 // do the call, remove parameters 702 MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l); 703 704 // reset last Java frame 705 // Only interpreter should have to clear fp 706 reset_last_Java_frame(true); 707 708 // C++ interp handles this in the interpreter 709 check_and_handle_popframe(java_thread); 710 check_and_handle_earlyret(java_thread); 711 712 if (check_exceptions) { 713 // check for pending exceptions (java_thread is set upon return) 714 ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset()))); 715 Label ok; 716 cbz(rscratch1, ok); 717 lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry())); 718 br(rscratch1); 719 bind(ok); 720 } 721 722 // get oop result if there is one and reset the value in the thread 723 if (oop_result->is_valid()) { 724 get_vm_result(oop_result, java_thread); 725 } 726 } 727 728 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) { 729 call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions); 730 } 731 732 // Maybe emit a call via a trampoline. If the code cache is small 733 // trampolines won't be emitted. 734 735 address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) { 736 assert(JavaThread::current()->is_Compiler_thread(), "just checking"); 737 assert(entry.rspec().type() == relocInfo::runtime_call_type 738 || entry.rspec().type() == relocInfo::opt_virtual_call_type 739 || entry.rspec().type() == relocInfo::static_call_type 740 || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type"); 741 742 // We need a trampoline if branches are far. 743 if (far_branches()) { 744 // We don't want to emit a trampoline if C2 is generating dummy 745 // code during its branch shortening phase. 
746 CompileTask* task = ciEnv::current()->task(); 747 bool in_scratch_emit_size = 748 (task != NULL && is_c2_compile(task->comp_level()) && 749 Compile::current()->in_scratch_emit_size()); 750 if (!in_scratch_emit_size) { 751 address stub = emit_trampoline_stub(offset(), entry.target()); 752 if (stub == NULL) { 753 return NULL; // CodeCache is full 754 } 755 } 756 } 757 758 if (cbuf) cbuf->set_insts_mark(); 759 relocate(entry.rspec()); 760 if (!far_branches()) { 761 bl(entry.target()); 762 } else { 763 bl(pc()); 764 } 765 // just need to return a non-null address 766 return pc(); 767 } 768 769 770 // Emit a trampoline stub for a call to a target which is too far away. 771 // 772 // code sequences: 773 // 774 // call-site: 775 // branch-and-link to <destination> or <trampoline stub> 776 // 777 // Related trampoline stub for this call site in the stub section: 778 // load the call target from the constant pool 779 // branch (LR still points to the call site above) 780 781 address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset, 782 address dest) { 783 address stub = start_a_stub(Compile::MAX_stubs_size/2); 784 if (stub == NULL) { 785 return NULL; // CodeBuffer::expand failed 786 } 787 788 // Create a trampoline stub relocation which relates this trampoline stub 789 // with the call instruction at insts_call_instruction_offset in the 790 // instructions code-section. 
791 align(wordSize); 792 relocate(trampoline_stub_Relocation::spec(code()->insts()->start() 793 + insts_call_instruction_offset)); 794 const int stub_start_offset = offset(); 795 796 // Now, create the trampoline stub's code: 797 // - load the call 798 // - call 799 Label target; 800 ldr(rscratch1, target); 801 br(rscratch1); 802 bind(target); 803 assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset, 804 "should be"); 805 emit_int64((int64_t)dest); 806 807 const address stub_start_addr = addr_at(stub_start_offset); 808 809 assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline"); 810 811 end_a_stub(); 812 return stub_start_addr; 813 } 814 815 address MacroAssembler::ic_call(address entry, jint method_index) { 816 RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index); 817 // address const_ptr = long_constant((jlong)Universe::non_oop_word()); 818 // unsigned long offset; 819 // ldr_constant(rscratch2, const_ptr); 820 movptr(rscratch2, (uintptr_t)Universe::non_oop_word()); 821 return trampoline_call(Address(entry, rh)); 822 } 823 824 // Implementation of call_VM versions 825 826 void MacroAssembler::call_VM(Register oop_result, 827 address entry_point, 828 bool check_exceptions) { 829 call_VM_helper(oop_result, entry_point, 0, check_exceptions); 830 } 831 832 void MacroAssembler::call_VM(Register oop_result, 833 address entry_point, 834 Register arg_1, 835 bool check_exceptions) { 836 pass_arg1(this, arg_1); 837 call_VM_helper(oop_result, entry_point, 1, check_exceptions); 838 } 839 840 void MacroAssembler::call_VM(Register oop_result, 841 address entry_point, 842 Register arg_1, 843 Register arg_2, 844 bool check_exceptions) { 845 assert(arg_1 != c_rarg2, "smashed arg"); 846 pass_arg2(this, arg_2); 847 pass_arg1(this, arg_1); 848 call_VM_helper(oop_result, entry_point, 2, check_exceptions); 849 } 850 851 void MacroAssembler::call_VM(Register oop_result, 852 address entry_point, 853 Register 
arg_1, 854 Register arg_2, 855 Register arg_3, 856 bool check_exceptions) { 857 assert(arg_1 != c_rarg3, "smashed arg"); 858 assert(arg_2 != c_rarg3, "smashed arg"); 859 pass_arg3(this, arg_3); 860 861 assert(arg_1 != c_rarg2, "smashed arg"); 862 pass_arg2(this, arg_2); 863 864 pass_arg1(this, arg_1); 865 call_VM_helper(oop_result, entry_point, 3, check_exceptions); 866 } 867 868 void MacroAssembler::call_VM(Register oop_result, 869 Register last_java_sp, 870 address entry_point, 871 int number_of_arguments, 872 bool check_exceptions) { 873 call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions); 874 } 875 876 void MacroAssembler::call_VM(Register oop_result, 877 Register last_java_sp, 878 address entry_point, 879 Register arg_1, 880 bool check_exceptions) { 881 pass_arg1(this, arg_1); 882 call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions); 883 } 884 885 void MacroAssembler::call_VM(Register oop_result, 886 Register last_java_sp, 887 address entry_point, 888 Register arg_1, 889 Register arg_2, 890 bool check_exceptions) { 891 892 assert(arg_1 != c_rarg2, "smashed arg"); 893 pass_arg2(this, arg_2); 894 pass_arg1(this, arg_1); 895 call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions); 896 } 897 898 void MacroAssembler::call_VM(Register oop_result, 899 Register last_java_sp, 900 address entry_point, 901 Register arg_1, 902 Register arg_2, 903 Register arg_3, 904 bool check_exceptions) { 905 assert(arg_1 != c_rarg3, "smashed arg"); 906 assert(arg_2 != c_rarg3, "smashed arg"); 907 pass_arg3(this, arg_3); 908 assert(arg_1 != c_rarg2, "smashed arg"); 909 pass_arg2(this, arg_2); 910 pass_arg1(this, arg_1); 911 call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions); 912 } 913 914 915 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) { 916 ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset())); 917 str(zr, Address(java_thread, 
JavaThread::vm_result_offset())); 918 verify_oop(oop_result, "broken oop in call_VM_base"); 919 } 920 921 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) { 922 ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset())); 923 str(zr, Address(java_thread, JavaThread::vm_result_2_offset())); 924 } 925 926 void MacroAssembler::align(int modulus) { 927 while (offset() % modulus != 0) nop(); 928 } 929 930 // these are no-ops overridden by InterpreterMacroAssembler 931 932 void MacroAssembler::check_and_handle_earlyret(Register java_thread) { } 933 934 void MacroAssembler::check_and_handle_popframe(Register java_thread) { } 935 936 937 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr, 938 Register tmp, 939 int offset) { 940 intptr_t value = *delayed_value_addr; 941 if (value != 0) 942 return RegisterOrConstant(value + offset); 943 944 // load indirectly to solve generation ordering problem 945 ldr(tmp, ExternalAddress((address) delayed_value_addr)); 946 947 if (offset != 0) 948 add(tmp, tmp, offset); 949 950 return RegisterOrConstant(tmp); 951 } 952 953 954 void MacroAssembler:: notify(int type) { 955 if (type == bytecode_start) { 956 // set_last_Java_frame(esp, rfp, (address)NULL); 957 Assembler:: notify(type); 958 // reset_last_Java_frame(true); 959 } 960 else 961 Assembler:: notify(type); 962 } 963 964 // Look up the method for a megamorphic invokeinterface call. 965 // The target method is determined by <intf_klass, itable_index>. 966 // The receiver klass is in recv_klass. 967 // On success, the result will be in method_result, and execution falls through. 968 // On failure, execution transfers to the given label. 
969 void MacroAssembler::lookup_interface_method(Register recv_klass, 970 Register intf_klass, 971 RegisterOrConstant itable_index, 972 Register method_result, 973 Register scan_temp, 974 Label& L_no_such_interface, 975 bool return_method) { 976 assert_different_registers(recv_klass, intf_klass, scan_temp); 977 assert_different_registers(method_result, intf_klass, scan_temp); 978 assert(recv_klass != method_result || !return_method, 979 "recv_klass can be destroyed when method isn't needed"); 980 assert(itable_index.is_constant() || itable_index.as_register() == method_result, 981 "caller must use same register for non-constant itable index as for method"); 982 983 // Compute start of first itableOffsetEntry (which is at the end of the vtable) 984 int vtable_base = in_bytes(Klass::vtable_start_offset()); 985 int itentry_off = itableMethodEntry::method_offset_in_bytes(); 986 int scan_step = itableOffsetEntry::size() * wordSize; 987 int vte_size = vtableEntry::size_in_bytes(); 988 assert(vte_size == wordSize, "else adjust times_vte_scale"); 989 990 ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset())); 991 992 // %%% Could store the aligned, prescaled offset in the klassoop. 993 // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base)); 994 lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3))); 995 add(scan_temp, scan_temp, vtable_base); 996 997 if (return_method) { 998 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 
999 assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 1000 // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off)); 1001 lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3))); 1002 if (itentry_off) 1003 add(recv_klass, recv_klass, itentry_off); 1004 } 1005 1006 // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) { 1007 // if (scan->interface() == intf) { 1008 // result = (klass + scan->offset() + itable_index); 1009 // } 1010 // } 1011 Label search, found_method; 1012 1013 for (int peel = 1; peel >= 0; peel--) { 1014 ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes())); 1015 cmp(intf_klass, method_result); 1016 1017 if (peel) { 1018 br(Assembler::EQ, found_method); 1019 } else { 1020 br(Assembler::NE, search); 1021 // (invert the test to fall through to found_method...) 1022 } 1023 1024 if (!peel) break; 1025 1026 bind(search); 1027 1028 // Check that the previous entry is non-null. A null entry means that 1029 // the receiver class doesn't implement the interface, and wasn't the 1030 // same as when the caller was compiled. 1031 cbz(method_result, L_no_such_interface); 1032 add(scan_temp, scan_temp, scan_step); 1033 } 1034 1035 bind(found_method); 1036 1037 // Got a hit. 
1038 if (return_method) { 1039 ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes())); 1040 ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0))); 1041 } 1042 } 1043 1044 // virtual method calling 1045 void MacroAssembler::lookup_virtual_method(Register recv_klass, 1046 RegisterOrConstant vtable_index, 1047 Register method_result) { 1048 const int base = in_bytes(Klass::vtable_start_offset()); 1049 assert(vtableEntry::size() * wordSize == 8, 1050 "adjust the scaling in the code below"); 1051 int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes(); 1052 1053 if (vtable_index.is_register()) { 1054 lea(method_result, Address(recv_klass, 1055 vtable_index.as_register(), 1056 Address::lsl(LogBytesPerWord))); 1057 ldr(method_result, Address(method_result, vtable_offset_in_bytes)); 1058 } else { 1059 vtable_offset_in_bytes += vtable_index.as_constant() * wordSize; 1060 ldr(method_result, 1061 form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0)); 1062 } 1063 } 1064 1065 void MacroAssembler::check_klass_subtype(Register sub_klass, 1066 Register super_klass, 1067 Register temp_reg, 1068 Label& L_success) { 1069 Label L_failure; 1070 check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL); 1071 check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL); 1072 bind(L_failure); 1073 } 1074 1075 1076 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 1077 Register super_klass, 1078 Register temp_reg, 1079 Label* L_success, 1080 Label* L_failure, 1081 Label* L_slow_path, 1082 RegisterOrConstant super_check_offset) { 1083 assert_different_registers(sub_klass, super_klass, temp_reg); 1084 bool must_load_sco = (super_check_offset.constant_or_zero() == -1); 1085 if (super_check_offset.is_register()) { 1086 assert_different_registers(sub_klass, super_klass, 1087 super_check_offset.as_register()); 1088 } else if (must_load_sco) { 
1089 assert(temp_reg != noreg, "supply either a temp or a register offset"); 1090 } 1091 1092 Label L_fallthrough; 1093 int label_nulls = 0; 1094 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 1095 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 1096 if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; } 1097 assert(label_nulls <= 1, "at most one NULL in the batch"); 1098 1099 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1100 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1101 Address super_check_offset_addr(super_klass, sco_offset); 1102 1103 // Hacked jmp, which may only be used just before L_fallthrough. 1104 #define final_jmp(label) \ 1105 if (&(label) == &L_fallthrough) { /*do nothing*/ } \ 1106 else b(label) /*omit semi*/ 1107 1108 // If the pointers are equal, we are done (e.g., String[] elements). 1109 // This self-check enables sharing of secondary supertype arrays among 1110 // non-primary types such as array-of-interface. Otherwise, each such 1111 // type would need its own customized SSA. 1112 // We move this check to the front of the fast path because many 1113 // type checks are in fact trivially successful in this manner, 1114 // so we get a nicely predicted branch right at the start of the check. 1115 cmp(sub_klass, super_klass); 1116 br(Assembler::EQ, *L_success); 1117 1118 // Check the supertype display: 1119 if (must_load_sco) { 1120 ldrw(temp_reg, super_check_offset_addr); 1121 super_check_offset = RegisterOrConstant(temp_reg); 1122 } 1123 Address super_check_addr(sub_klass, super_check_offset); 1124 ldr(rscratch1, super_check_addr); 1125 cmp(super_klass, rscratch1); // load displayed supertype 1126 1127 // This check has worked decisively for primary supers. 1128 // Secondary supers are sought in the super_cache ('super_cache_addr'). 1129 // (Secondary supers are interfaces and very deeply nested subtypes.) 
1130 // This works in the same check above because of a tricky aliasing 1131 // between the super_cache and the primary super display elements. 1132 // (The 'super_check_addr' can address either, as the case requires.) 1133 // Note that the cache is updated below if it does not help us find 1134 // what we need immediately. 1135 // So if it was a primary super, we can just fail immediately. 1136 // Otherwise, it's the slow path for us (no success at this point). 1137 1138 if (super_check_offset.is_register()) { 1139 br(Assembler::EQ, *L_success); 1140 cmp(super_check_offset.as_register(), sc_offset); 1141 if (L_failure == &L_fallthrough) { 1142 br(Assembler::EQ, *L_slow_path); 1143 } else { 1144 br(Assembler::NE, *L_failure); 1145 final_jmp(*L_slow_path); 1146 } 1147 } else if (super_check_offset.as_constant() == sc_offset) { 1148 // Need a slow path; fast failure is impossible. 1149 if (L_slow_path == &L_fallthrough) { 1150 br(Assembler::EQ, *L_success); 1151 } else { 1152 br(Assembler::NE, *L_slow_path); 1153 final_jmp(*L_success); 1154 } 1155 } else { 1156 // No slow path; it's a fast decision. 
1157 if (L_failure == &L_fallthrough) { 1158 br(Assembler::EQ, *L_success); 1159 } else { 1160 br(Assembler::NE, *L_failure); 1161 final_jmp(*L_success); 1162 } 1163 } 1164 1165 bind(L_fallthrough); 1166 1167 #undef final_jmp 1168 } 1169 1170 // These two are taken from x86, but they look generally useful 1171 1172 // scans count pointer sized words at [addr] for occurence of value, 1173 // generic 1174 void MacroAssembler::repne_scan(Register addr, Register value, Register count, 1175 Register scratch) { 1176 Label Lloop, Lexit; 1177 cbz(count, Lexit); 1178 bind(Lloop); 1179 ldr(scratch, post(addr, wordSize)); 1180 cmp(value, scratch); 1181 br(EQ, Lexit); 1182 sub(count, count, 1); 1183 cbnz(count, Lloop); 1184 bind(Lexit); 1185 } 1186 1187 // scans count 4 byte words at [addr] for occurence of value, 1188 // generic 1189 void MacroAssembler::repne_scanw(Register addr, Register value, Register count, 1190 Register scratch) { 1191 Label Lloop, Lexit; 1192 cbz(count, Lexit); 1193 bind(Lloop); 1194 ldrw(scratch, post(addr, wordSize)); 1195 cmpw(value, scratch); 1196 br(EQ, Lexit); 1197 sub(count, count, 1); 1198 cbnz(count, Lloop); 1199 bind(Lexit); 1200 } 1201 1202 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 1203 Register super_klass, 1204 Register temp_reg, 1205 Register temp2_reg, 1206 Label* L_success, 1207 Label* L_failure, 1208 bool set_cond_codes) { 1209 assert_different_registers(sub_klass, super_klass, temp_reg); 1210 if (temp2_reg != noreg) 1211 assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1); 1212 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg) 1213 1214 Label L_fallthrough; 1215 int label_nulls = 0; 1216 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 1217 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 1218 assert(label_nulls <= 1, "at most one NULL in the batch"); 1219 1220 // a couple of useful fields in sub_klass: 1221 int 
ss_offset = in_bytes(Klass::secondary_supers_offset()); 1222 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1223 Address secondary_supers_addr(sub_klass, ss_offset); 1224 Address super_cache_addr( sub_klass, sc_offset); 1225 1226 BLOCK_COMMENT("check_klass_subtype_slow_path"); 1227 1228 // Do a linear scan of the secondary super-klass chain. 1229 // This code is rarely used, so simplicity is a virtue here. 1230 // The repne_scan instruction uses fixed registers, which we must spill. 1231 // Don't worry too much about pre-existing connections with the input regs. 1232 1233 assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super) 1234 assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter) 1235 1236 RegSet pushed_registers; 1237 if (!IS_A_TEMP(r2)) pushed_registers += r2; 1238 if (!IS_A_TEMP(r5)) pushed_registers += r5; 1239 1240 if (super_klass != r0 || UseCompressedOops) { 1241 if (!IS_A_TEMP(r0)) pushed_registers += r0; 1242 } 1243 1244 push(pushed_registers, sp); 1245 1246 // Get super_klass value into r0 (even if it was in r5 or r2). 1247 if (super_klass != r0) { 1248 mov(r0, super_klass); 1249 } 1250 1251 #ifndef PRODUCT 1252 mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr); 1253 Address pst_counter_addr(rscratch2); 1254 ldr(rscratch1, pst_counter_addr); 1255 add(rscratch1, rscratch1, 1); 1256 str(rscratch1, pst_counter_addr); 1257 #endif //PRODUCT 1258 1259 // We will consult the secondary-super array. 1260 ldr(r5, secondary_supers_addr); 1261 // Load the array length. 1262 ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes())); 1263 // Skip to start of data. 1264 add(r5, r5, Array<Klass*>::base_offset_in_bytes()); 1265 1266 cmp(sp, zr); // Clear Z flag; SP is never zero 1267 // Scan R2 words at [R5] for an occurrence of R0. 1268 // Set NZ/Z based on last compare. 1269 repne_scan(r5, r0, r2, rscratch1); 1270 1271 // Unspill the temp. 
registers: 1272 pop(pushed_registers, sp); 1273 1274 br(Assembler::NE, *L_failure); 1275 1276 // Success. Cache the super we found and proceed in triumph. 1277 str(super_klass, super_cache_addr); 1278 1279 if (L_success != &L_fallthrough) { 1280 b(*L_success); 1281 } 1282 1283 #undef IS_A_TEMP 1284 1285 bind(L_fallthrough); 1286 } 1287 1288 1289 void MacroAssembler::verify_oop(Register reg, const char* s) { 1290 if (!VerifyOops) return; 1291 1292 // Pass register number to verify_oop_subroutine 1293 const char* b = NULL; 1294 { 1295 ResourceMark rm; 1296 stringStream ss; 1297 ss.print("verify_oop: %s: %s", reg->name(), s); 1298 b = code_string(ss.as_string()); 1299 } 1300 BLOCK_COMMENT("verify_oop {"); 1301 1302 stp(r0, rscratch1, Address(pre(sp, -2 * wordSize))); 1303 stp(rscratch2, lr, Address(pre(sp, -2 * wordSize))); 1304 1305 mov(r0, reg); 1306 mov(rscratch1, (address)b); 1307 1308 // call indirectly to solve generation ordering problem 1309 lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address())); 1310 ldr(rscratch2, Address(rscratch2)); 1311 blr(rscratch2); 1312 1313 ldp(rscratch2, lr, Address(post(sp, 2 * wordSize))); 1314 ldp(r0, rscratch1, Address(post(sp, 2 * wordSize))); 1315 1316 BLOCK_COMMENT("} verify_oop"); 1317 } 1318 1319 void MacroAssembler::verify_oop_addr(Address addr, const char* s) { 1320 if (!VerifyOops) return; 1321 1322 const char* b = NULL; 1323 { 1324 ResourceMark rm; 1325 stringStream ss; 1326 ss.print("verify_oop_addr: %s", s); 1327 b = code_string(ss.as_string()); 1328 } 1329 BLOCK_COMMENT("verify_oop_addr {"); 1330 1331 stp(r0, rscratch1, Address(pre(sp, -2 * wordSize))); 1332 stp(rscratch2, lr, Address(pre(sp, -2 * wordSize))); 1333 1334 // addr may contain sp so we will have to adjust it based on the 1335 // pushes that we just did. 
1336 if (addr.uses(sp)) { 1337 lea(r0, addr); 1338 ldr(r0, Address(r0, 4 * wordSize)); 1339 } else { 1340 ldr(r0, addr); 1341 } 1342 mov(rscratch1, (address)b); 1343 1344 // call indirectly to solve generation ordering problem 1345 lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address())); 1346 ldr(rscratch2, Address(rscratch2)); 1347 blr(rscratch2); 1348 1349 ldp(rscratch2, lr, Address(post(sp, 2 * wordSize))); 1350 ldp(r0, rscratch1, Address(post(sp, 2 * wordSize))); 1351 1352 BLOCK_COMMENT("} verify_oop_addr"); 1353 } 1354 1355 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot, 1356 int extra_slot_offset) { 1357 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 1358 int stackElementSize = Interpreter::stackElementSize; 1359 int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0); 1360 #ifdef ASSERT 1361 int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1); 1362 assert(offset1 - offset == stackElementSize, "correct arithmetic"); 1363 #endif 1364 if (arg_slot.is_constant()) { 1365 return Address(esp, arg_slot.as_constant() * stackElementSize 1366 + offset); 1367 } else { 1368 add(rscratch1, esp, arg_slot.as_register(), 1369 ext::uxtx, exact_log2(stackElementSize)); 1370 return Address(rscratch1, offset); 1371 } 1372 } 1373 1374 void MacroAssembler::call_VM_leaf_base(address entry_point, 1375 int number_of_arguments, 1376 Label *retaddr) { 1377 call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr); 1378 } 1379 1380 void MacroAssembler::call_VM_leaf_base1(address entry_point, 1381 int number_of_gp_arguments, 1382 int number_of_fp_arguments, 1383 ret_type type, 1384 Label *retaddr) { 1385 Label E, L; 1386 1387 stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize))); 1388 1389 // We add 1 to number_of_arguments because the thread in arg0 is 1390 // not counted 1391 mov(rscratch1, entry_point); 1392 blrt(rscratch1, number_of_gp_arguments + 1, 
number_of_fp_arguments, type); 1393 if (retaddr) 1394 bind(*retaddr); 1395 1396 ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize))); 1397 maybe_isb(); 1398 } 1399 1400 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) { 1401 call_VM_leaf_base(entry_point, number_of_arguments); 1402 } 1403 1404 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) { 1405 pass_arg0(this, arg_0); 1406 call_VM_leaf_base(entry_point, 1); 1407 } 1408 1409 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) { 1410 pass_arg0(this, arg_0); 1411 pass_arg1(this, arg_1); 1412 call_VM_leaf_base(entry_point, 2); 1413 } 1414 1415 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, 1416 Register arg_1, Register arg_2) { 1417 pass_arg0(this, arg_0); 1418 pass_arg1(this, arg_1); 1419 pass_arg2(this, arg_2); 1420 call_VM_leaf_base(entry_point, 3); 1421 } 1422 1423 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) { 1424 pass_arg0(this, arg_0); 1425 MacroAssembler::call_VM_leaf_base(entry_point, 1); 1426 } 1427 1428 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) { 1429 1430 assert(arg_0 != c_rarg1, "smashed arg"); 1431 pass_arg1(this, arg_1); 1432 pass_arg0(this, arg_0); 1433 MacroAssembler::call_VM_leaf_base(entry_point, 2); 1434 } 1435 1436 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) { 1437 assert(arg_0 != c_rarg2, "smashed arg"); 1438 assert(arg_1 != c_rarg2, "smashed arg"); 1439 pass_arg2(this, arg_2); 1440 assert(arg_0 != c_rarg1, "smashed arg"); 1441 pass_arg1(this, arg_1); 1442 pass_arg0(this, arg_0); 1443 MacroAssembler::call_VM_leaf_base(entry_point, 3); 1444 } 1445 1446 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) { 1447 assert(arg_0 != c_rarg3, "smashed arg"); 1448 
assert(arg_1 != c_rarg3, "smashed arg"); 1449 assert(arg_2 != c_rarg3, "smashed arg"); 1450 pass_arg3(this, arg_3); 1451 assert(arg_0 != c_rarg2, "smashed arg"); 1452 assert(arg_1 != c_rarg2, "smashed arg"); 1453 pass_arg2(this, arg_2); 1454 assert(arg_0 != c_rarg1, "smashed arg"); 1455 pass_arg1(this, arg_1); 1456 pass_arg0(this, arg_0); 1457 MacroAssembler::call_VM_leaf_base(entry_point, 4); 1458 } 1459 1460 void MacroAssembler::null_check(Register reg, int offset) { 1461 if (needs_explicit_null_check(offset)) { 1462 // provoke OS NULL exception if reg = NULL by 1463 // accessing M[reg] w/o changing any registers 1464 // NOTE: this is plenty to provoke a segv 1465 ldr(zr, Address(reg)); 1466 } else { 1467 // nothing to do, (later) access of M[reg + offset] 1468 // will provoke OS NULL exception if reg = NULL 1469 } 1470 } 1471 1472 // MacroAssembler protected routines needed to implement 1473 // public methods 1474 1475 void MacroAssembler::mov(Register r, Address dest) { 1476 code_section()->relocate(pc(), dest.rspec()); 1477 u_int64_t imm64 = (u_int64_t)dest.target(); 1478 movptr(r, imm64); 1479 } 1480 1481 // Move a constant pointer into r. In AArch64 mode the virtual 1482 // address space is 48 bits in size, so we only need three 1483 // instructions to create a patchable instruction sequence that can 1484 // reach anywhere. 1485 void MacroAssembler::movptr(Register r, uintptr_t imm64) { 1486 #ifndef PRODUCT 1487 { 1488 char buffer[64]; 1489 snprintf(buffer, sizeof(buffer), "0x%"PRIX64, imm64); 1490 block_comment(buffer); 1491 } 1492 #endif 1493 assert(imm64 < (1ul << 48), "48-bit overflow in address constant"); 1494 movz(r, imm64 & 0xffff); 1495 imm64 >>= 16; 1496 movk(r, imm64 & 0xffff, 16); 1497 imm64 >>= 16; 1498 movk(r, imm64 & 0xffff, 32); 1499 } 1500 1501 // Macro to mov replicated immediate to vector register. 
1502 // Vd will get the following values for different arrangements in T 1503 // imm32 == hex 000000gh T8B: Vd = ghghghghghghghgh 1504 // imm32 == hex 000000gh T16B: Vd = ghghghghghghghghghghghghghghghgh 1505 // imm32 == hex 0000efgh T4H: Vd = efghefghefghefgh 1506 // imm32 == hex 0000efgh T8H: Vd = efghefghefghefghefghefghefghefgh 1507 // imm32 == hex abcdefgh T2S: Vd = abcdefghabcdefgh 1508 // imm32 == hex abcdefgh T4S: Vd = abcdefghabcdefghabcdefghabcdefgh 1509 // T1D/T2D: invalid 1510 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) { 1511 assert(T != T1D && T != T2D, "invalid arrangement"); 1512 if (T == T8B || T == T16B) { 1513 assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)"); 1514 movi(Vd, T, imm32 & 0xff, 0); 1515 return; 1516 } 1517 u_int32_t nimm32 = ~imm32; 1518 if (T == T4H || T == T8H) { 1519 assert((imm32 & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)"); 1520 imm32 &= 0xffff; 1521 nimm32 &= 0xffff; 1522 } 1523 u_int32_t x = imm32; 1524 int movi_cnt = 0; 1525 int movn_cnt = 0; 1526 while (x) { if (x & 0xff) movi_cnt++; x >>= 8; } 1527 x = nimm32; 1528 while (x) { if (x & 0xff) movn_cnt++; x >>= 8; } 1529 if (movn_cnt < movi_cnt) imm32 = nimm32; 1530 unsigned lsl = 0; 1531 while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; } 1532 if (movn_cnt < movi_cnt) 1533 mvni(Vd, T, imm32 & 0xff, lsl); 1534 else 1535 movi(Vd, T, imm32 & 0xff, lsl); 1536 imm32 >>= 8; lsl += 8; 1537 while (imm32) { 1538 while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; } 1539 if (movn_cnt < movi_cnt) 1540 bici(Vd, T, imm32 & 0xff, lsl); 1541 else 1542 orri(Vd, T, imm32 & 0xff, lsl); 1543 lsl += 8; imm32 >>= 8; 1544 } 1545 } 1546 1547 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64) 1548 { 1549 #ifndef PRODUCT 1550 { 1551 char buffer[64]; 1552 snprintf(buffer, sizeof(buffer), "0x%"PRIX64, imm64); 1553 block_comment(buffer); 1554 } 1555 #endif 1556 if 
(operand_valid_for_logical_immediate(false, imm64)) { 1557 orr(dst, zr, imm64); 1558 } else { 1559 // we can use a combination of MOVZ or MOVN with 1560 // MOVK to build up the constant 1561 u_int64_t imm_h[4]; 1562 int zero_count = 0; 1563 int neg_count = 0; 1564 int i; 1565 for (i = 0; i < 4; i++) { 1566 imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL); 1567 if (imm_h[i] == 0) { 1568 zero_count++; 1569 } else if (imm_h[i] == 0xffffL) { 1570 neg_count++; 1571 } 1572 } 1573 if (zero_count == 4) { 1574 // one MOVZ will do 1575 movz(dst, 0); 1576 } else if (neg_count == 4) { 1577 // one MOVN will do 1578 movn(dst, 0); 1579 } else if (zero_count == 3) { 1580 for (i = 0; i < 4; i++) { 1581 if (imm_h[i] != 0L) { 1582 movz(dst, (u_int32_t)imm_h[i], (i << 4)); 1583 break; 1584 } 1585 } 1586 } else if (neg_count == 3) { 1587 // one MOVN will do 1588 for (int i = 0; i < 4; i++) { 1589 if (imm_h[i] != 0xffffL) { 1590 movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4)); 1591 break; 1592 } 1593 } 1594 } else if (zero_count == 2) { 1595 // one MOVZ and one MOVK will do 1596 for (i = 0; i < 3; i++) { 1597 if (imm_h[i] != 0L) { 1598 movz(dst, (u_int32_t)imm_h[i], (i << 4)); 1599 i++; 1600 break; 1601 } 1602 } 1603 for (;i < 4; i++) { 1604 if (imm_h[i] != 0L) { 1605 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1606 } 1607 } 1608 } else if (neg_count == 2) { 1609 // one MOVN and one MOVK will do 1610 for (i = 0; i < 4; i++) { 1611 if (imm_h[i] != 0xffffL) { 1612 movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4)); 1613 i++; 1614 break; 1615 } 1616 } 1617 for (;i < 4; i++) { 1618 if (imm_h[i] != 0xffffL) { 1619 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1620 } 1621 } 1622 } else if (zero_count == 1) { 1623 // one MOVZ and two MOVKs will do 1624 for (i = 0; i < 4; i++) { 1625 if (imm_h[i] != 0L) { 1626 movz(dst, (u_int32_t)imm_h[i], (i << 4)); 1627 i++; 1628 break; 1629 } 1630 } 1631 for (;i < 4; i++) { 1632 if (imm_h[i] != 0x0L) { 1633 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1634 } 1635 } 
1636 } else if (neg_count == 1) { 1637 // one MOVN and two MOVKs will do 1638 for (i = 0; i < 4; i++) { 1639 if (imm_h[i] != 0xffffL) { 1640 movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4)); 1641 i++; 1642 break; 1643 } 1644 } 1645 for (;i < 4; i++) { 1646 if (imm_h[i] != 0xffffL) { 1647 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1648 } 1649 } 1650 } else { 1651 // use a MOVZ and 3 MOVKs (makes it easier to debug) 1652 movz(dst, (u_int32_t)imm_h[0], 0); 1653 for (i = 1; i < 4; i++) { 1654 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1655 } 1656 } 1657 } 1658 } 1659 1660 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32) 1661 { 1662 #ifndef PRODUCT 1663 { 1664 char buffer[64]; 1665 snprintf(buffer, sizeof(buffer), "0x%"PRIX32, imm32); 1666 block_comment(buffer); 1667 } 1668 #endif 1669 if (operand_valid_for_logical_immediate(true, imm32)) { 1670 orrw(dst, zr, imm32); 1671 } else { 1672 // we can use MOVZ, MOVN or two calls to MOVK to build up the 1673 // constant 1674 u_int32_t imm_h[2]; 1675 imm_h[0] = imm32 & 0xffff; 1676 imm_h[1] = ((imm32 >> 16) & 0xffff); 1677 if (imm_h[0] == 0) { 1678 movzw(dst, imm_h[1], 16); 1679 } else if (imm_h[0] == 0xffff) { 1680 movnw(dst, imm_h[1] ^ 0xffff, 16); 1681 } else if (imm_h[1] == 0) { 1682 movzw(dst, imm_h[0], 0); 1683 } else if (imm_h[1] == 0xffff) { 1684 movnw(dst, imm_h[0] ^ 0xffff, 0); 1685 } else { 1686 // use a MOVZ and MOVK (makes it easier to debug) 1687 movzw(dst, imm_h[0], 0); 1688 movkw(dst, imm_h[1], 16); 1689 } 1690 } 1691 } 1692 1693 // Form an address from base + offset in Rd. Rd may or may 1694 // not actually be used: you must use the Address that is returned. 1695 // It is up to you to ensure that the shift provided matches the size 1696 // of your data. 
1697 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) { 1698 if (Address::offset_ok_for_immed(byte_offset, shift)) 1699 // It fits; no need for any heroics 1700 return Address(base, byte_offset); 1701 1702 // Don't do anything clever with negative or misaligned offsets 1703 unsigned mask = (1 << shift) - 1; 1704 if (byte_offset < 0 || byte_offset & mask) { 1705 mov(Rd, byte_offset); 1706 add(Rd, base, Rd); 1707 return Address(Rd); 1708 } 1709 1710 // See if we can do this with two 12-bit offsets 1711 { 1712 unsigned long word_offset = byte_offset >> shift; 1713 unsigned long masked_offset = word_offset & 0xfff000; 1714 if (Address::offset_ok_for_immed(word_offset - masked_offset) 1715 && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) { 1716 add(Rd, base, masked_offset << shift); 1717 word_offset -= masked_offset; 1718 return Address(Rd, word_offset << shift); 1719 } 1720 } 1721 1722 // Do it the hard way 1723 mov(Rd, byte_offset); 1724 add(Rd, base, Rd); 1725 return Address(Rd); 1726 } 1727 1728 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) { 1729 if (UseLSE) { 1730 mov(tmp, 1); 1731 ldadd(Assembler::word, tmp, zr, counter_addr); 1732 return; 1733 } 1734 Label retry_load; 1735 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) 1736 prfm(Address(counter_addr), PSTL1STRM); 1737 bind(retry_load); 1738 // flush and load exclusive from the memory location 1739 ldxrw(tmp, counter_addr); 1740 addw(tmp, tmp, 1); 1741 // if we store+flush with no intervening write tmp wil be zero 1742 stxrw(tmp2, tmp, counter_addr); 1743 cbnzw(tmp2, retry_load); 1744 } 1745 1746 1747 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb, 1748 bool want_remainder, Register scratch) 1749 { 1750 // Full implementation of Java idiv and irem. 
The function 1751 // returns the (pc) offset of the div instruction - may be needed 1752 // for implicit exceptions. 1753 // 1754 // constraint : ra/rb =/= scratch 1755 // normal case 1756 // 1757 // input : ra: dividend 1758 // rb: divisor 1759 // 1760 // result: either 1761 // quotient (= ra idiv rb) 1762 // remainder (= ra irem rb) 1763 1764 assert(ra != scratch && rb != scratch, "reg cannot be scratch"); 1765 1766 int idivl_offset = offset(); 1767 if (! want_remainder) { 1768 sdivw(result, ra, rb); 1769 } else { 1770 sdivw(scratch, ra, rb); 1771 Assembler::msubw(result, scratch, rb, ra); 1772 } 1773 1774 return idivl_offset; 1775 } 1776 1777 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb, 1778 bool want_remainder, Register scratch) 1779 { 1780 // Full implementation of Java ldiv and lrem. The function 1781 // returns the (pc) offset of the div instruction - may be needed 1782 // for implicit exceptions. 1783 // 1784 // constraint : ra/rb =/= scratch 1785 // normal case 1786 // 1787 // input : ra: dividend 1788 // rb: divisor 1789 // 1790 // result: either 1791 // quotient (= ra idiv rb) 1792 // remainder (= ra irem rb) 1793 1794 assert(ra != scratch && rb != scratch, "reg cannot be scratch"); 1795 1796 int idivq_offset = offset(); 1797 if (! want_remainder) { 1798 sdiv(result, ra, rb); 1799 } else { 1800 sdiv(scratch, ra, rb); 1801 Assembler::msub(result, scratch, rb, ra); 1802 } 1803 1804 return idivq_offset; 1805 } 1806 1807 void MacroAssembler::membar(Membar_mask_bits order_constraint) { 1808 address prev = pc() - NativeMembar::instruction_size; 1809 address last = code()->last_insn(); 1810 if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) { 1811 NativeMembar *bar = NativeMembar_at(prev); 1812 // We are merging two memory barrier instructions. On AArch64 we 1813 // can do this simply by ORing them together. 
1814 bar->set_kind(bar->get_kind() | order_constraint); 1815 BLOCK_COMMENT("merged membar"); 1816 } else { 1817 code()->set_last_insn(pc()); 1818 dmb(Assembler::barrier(order_constraint)); 1819 } 1820 } 1821 1822 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) { 1823 if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) { 1824 merge_ldst(rt, adr, size_in_bytes, is_store); 1825 code()->clear_last_insn(); 1826 return true; 1827 } else { 1828 assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported."); 1829 const unsigned mask = size_in_bytes - 1; 1830 if (adr.getMode() == Address::base_plus_offset && 1831 (adr.offset() & mask) == 0) { // only supports base_plus_offset. 1832 code()->set_last_insn(pc()); 1833 } 1834 return false; 1835 } 1836 } 1837 1838 void MacroAssembler::ldr(Register Rx, const Address &adr) { 1839 // We always try to merge two adjacent loads into one ldp. 1840 if (!try_merge_ldst(Rx, adr, 8, false)) { 1841 Assembler::ldr(Rx, adr); 1842 } 1843 } 1844 1845 void MacroAssembler::ldrw(Register Rw, const Address &adr) { 1846 // We always try to merge two adjacent loads into one ldp. 1847 if (!try_merge_ldst(Rw, adr, 4, false)) { 1848 Assembler::ldrw(Rw, adr); 1849 } 1850 } 1851 1852 void MacroAssembler::str(Register Rx, const Address &adr) { 1853 // We always try to merge two adjacent stores into one stp. 1854 if (!try_merge_ldst(Rx, adr, 8, true)) { 1855 Assembler::str(Rx, adr); 1856 } 1857 } 1858 1859 void MacroAssembler::strw(Register Rw, const Address &adr) { 1860 // We always try to merge two adjacent stores into one stp. 
1861 if (!try_merge_ldst(Rw, adr, 4, true)) { 1862 Assembler::strw(Rw, adr); 1863 } 1864 } 1865 1866 // MacroAssembler routines found actually to be needed 1867 1868 void MacroAssembler::push(Register src) 1869 { 1870 str(src, Address(pre(esp, -1 * wordSize))); 1871 } 1872 1873 void MacroAssembler::pop(Register dst) 1874 { 1875 ldr(dst, Address(post(esp, 1 * wordSize))); 1876 } 1877 1878 // Note: load_unsigned_short used to be called load_unsigned_word. 1879 int MacroAssembler::load_unsigned_short(Register dst, Address src) { 1880 int off = offset(); 1881 ldrh(dst, src); 1882 return off; 1883 } 1884 1885 int MacroAssembler::load_unsigned_byte(Register dst, Address src) { 1886 int off = offset(); 1887 ldrb(dst, src); 1888 return off; 1889 } 1890 1891 int MacroAssembler::load_signed_short(Register dst, Address src) { 1892 int off = offset(); 1893 ldrsh(dst, src); 1894 return off; 1895 } 1896 1897 int MacroAssembler::load_signed_byte(Register dst, Address src) { 1898 int off = offset(); 1899 ldrsb(dst, src); 1900 return off; 1901 } 1902 1903 int MacroAssembler::load_signed_short32(Register dst, Address src) { 1904 int off = offset(); 1905 ldrshw(dst, src); 1906 return off; 1907 } 1908 1909 int MacroAssembler::load_signed_byte32(Register dst, Address src) { 1910 int off = offset(); 1911 ldrsbw(dst, src); 1912 return off; 1913 } 1914 1915 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) { 1916 switch (size_in_bytes) { 1917 case 8: ldr(dst, src); break; 1918 case 4: ldrw(dst, src); break; 1919 case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break; 1920 case 1: is_signed ? 
load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break; 1921 default: ShouldNotReachHere(); 1922 } 1923 } 1924 1925 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) { 1926 switch (size_in_bytes) { 1927 case 8: str(src, dst); break; 1928 case 4: strw(src, dst); break; 1929 case 2: strh(src, dst); break; 1930 case 1: strb(src, dst); break; 1931 default: ShouldNotReachHere(); 1932 } 1933 } 1934 1935 void MacroAssembler::decrementw(Register reg, int value) 1936 { 1937 if (value < 0) { incrementw(reg, -value); return; } 1938 if (value == 0) { return; } 1939 if (value < (1 << 12)) { subw(reg, reg, value); return; } 1940 /* else */ { 1941 guarantee(reg != rscratch2, "invalid dst for register decrement"); 1942 movw(rscratch2, (unsigned)value); 1943 subw(reg, reg, rscratch2); 1944 } 1945 } 1946 1947 void MacroAssembler::decrement(Register reg, int value) 1948 { 1949 if (value < 0) { increment(reg, -value); return; } 1950 if (value == 0) { return; } 1951 if (value < (1 << 12)) { sub(reg, reg, value); return; } 1952 /* else */ { 1953 assert(reg != rscratch2, "invalid dst for register decrement"); 1954 mov(rscratch2, (unsigned long)value); 1955 sub(reg, reg, rscratch2); 1956 } 1957 } 1958 1959 void MacroAssembler::decrementw(Address dst, int value) 1960 { 1961 assert(!dst.uses(rscratch1), "invalid dst for address decrement"); 1962 if (dst.getMode() == Address::literal) { 1963 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 1964 lea(rscratch2, dst); 1965 dst = Address(rscratch2); 1966 } 1967 ldrw(rscratch1, dst); 1968 decrementw(rscratch1, value); 1969 strw(rscratch1, dst); 1970 } 1971 1972 void MacroAssembler::decrement(Address dst, int value) 1973 { 1974 assert(!dst.uses(rscratch1), "invalid address for decrement"); 1975 if (dst.getMode() == Address::literal) { 1976 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 1977 lea(rscratch2, dst); 1978 dst = 
Address(rscratch2); 1979 } 1980 ldr(rscratch1, dst); 1981 decrement(rscratch1, value); 1982 str(rscratch1, dst); 1983 } 1984 1985 void MacroAssembler::incrementw(Register reg, int value) 1986 { 1987 if (value < 0) { decrementw(reg, -value); return; } 1988 if (value == 0) { return; } 1989 if (value < (1 << 12)) { addw(reg, reg, value); return; } 1990 /* else */ { 1991 assert(reg != rscratch2, "invalid dst for register increment"); 1992 movw(rscratch2, (unsigned)value); 1993 addw(reg, reg, rscratch2); 1994 } 1995 } 1996 1997 void MacroAssembler::increment(Register reg, int value) 1998 { 1999 if (value < 0) { decrement(reg, -value); return; } 2000 if (value == 0) { return; } 2001 if (value < (1 << 12)) { add(reg, reg, value); return; } 2002 /* else */ { 2003 assert(reg != rscratch2, "invalid dst for register increment"); 2004 movw(rscratch2, (unsigned)value); 2005 add(reg, reg, rscratch2); 2006 } 2007 } 2008 2009 void MacroAssembler::incrementw(Address dst, int value) 2010 { 2011 assert(!dst.uses(rscratch1), "invalid dst for address increment"); 2012 if (dst.getMode() == Address::literal) { 2013 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 2014 lea(rscratch2, dst); 2015 dst = Address(rscratch2); 2016 } 2017 ldrw(rscratch1, dst); 2018 incrementw(rscratch1, value); 2019 strw(rscratch1, dst); 2020 } 2021 2022 void MacroAssembler::increment(Address dst, int value) 2023 { 2024 assert(!dst.uses(rscratch1), "invalid dst for address increment"); 2025 if (dst.getMode() == Address::literal) { 2026 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 2027 lea(rscratch2, dst); 2028 dst = Address(rscratch2); 2029 } 2030 ldr(rscratch1, dst); 2031 increment(rscratch1, value); 2032 str(rscratch1, dst); 2033 } 2034 2035 2036 void MacroAssembler::pusha() { 2037 push(0x7fffffff, sp); 2038 } 2039 2040 void MacroAssembler::popa() { 2041 pop(0x7fffffff, sp); 2042 } 2043 2044 // Push lots of registers in the bit set supplied. 
Don't push sp. 2045 // Return the number of words pushed 2046 int MacroAssembler::push(unsigned int bitset, Register stack) { 2047 int words_pushed = 0; 2048 2049 // Scan bitset to accumulate register pairs 2050 unsigned char regs[32]; 2051 int count = 0; 2052 for (int reg = 0; reg <= 30; reg++) { 2053 if (1 & bitset) 2054 regs[count++] = reg; 2055 bitset >>= 1; 2056 } 2057 regs[count++] = zr->encoding_nocheck(); 2058 count &= ~1; // Only push an even nuber of regs 2059 2060 if (count) { 2061 stp(as_Register(regs[0]), as_Register(regs[1]), 2062 Address(pre(stack, -count * wordSize))); 2063 words_pushed += 2; 2064 } 2065 for (int i = 2; i < count; i += 2) { 2066 stp(as_Register(regs[i]), as_Register(regs[i+1]), 2067 Address(stack, i * wordSize)); 2068 words_pushed += 2; 2069 } 2070 2071 assert(words_pushed == count, "oops, pushed != count"); 2072 2073 return count; 2074 } 2075 2076 int MacroAssembler::pop(unsigned int bitset, Register stack) { 2077 int words_pushed = 0; 2078 2079 // Scan bitset to accumulate register pairs 2080 unsigned char regs[32]; 2081 int count = 0; 2082 for (int reg = 0; reg <= 30; reg++) { 2083 if (1 & bitset) 2084 regs[count++] = reg; 2085 bitset >>= 1; 2086 } 2087 regs[count++] = zr->encoding_nocheck(); 2088 count &= ~1; 2089 2090 for (int i = 2; i < count; i += 2) { 2091 ldp(as_Register(regs[i]), as_Register(regs[i+1]), 2092 Address(stack, i * wordSize)); 2093 words_pushed += 2; 2094 } 2095 if (count) { 2096 ldp(as_Register(regs[0]), as_Register(regs[1]), 2097 Address(post(stack, count * wordSize))); 2098 words_pushed += 2; 2099 } 2100 2101 assert(words_pushed == count, "oops, pushed != count"); 2102 2103 return count; 2104 } 2105 #ifdef ASSERT 2106 void MacroAssembler::verify_heapbase(const char* msg) { 2107 #if 0 2108 assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed"); 2109 assert (Universe::heap() != NULL, "java heap should be initialized"); 2110 if (CheckCompressedOops) { 2111 Label ok; 2112 push(1 << 
rscratch1->encoding(), sp); // cmpptr trashes rscratch1 2113 cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr())); 2114 br(Assembler::EQ, ok); 2115 stop(msg); 2116 bind(ok); 2117 pop(1 << rscratch1->encoding(), sp); 2118 } 2119 #endif 2120 } 2121 #endif 2122 2123 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) { 2124 Label done, not_weak; 2125 cbz(value, done); // Use NULL as-is. 2126 2127 STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u); 2128 tbz(r0, 0, not_weak); // Test for jweak tag. 2129 2130 // Resolve jweak. 2131 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value, 2132 Address(value, -JNIHandles::weak_tag_value), tmp, thread); 2133 verify_oop(value); 2134 b(done); 2135 2136 bind(not_weak); 2137 // Resolve (untagged) jobject. 2138 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread); 2139 verify_oop(value); 2140 bind(done); 2141 } 2142 2143 void MacroAssembler::stop(const char* msg) { 2144 address ip = pc(); 2145 pusha(); 2146 mov(c_rarg0, (address)msg); 2147 mov(c_rarg1, (address)ip); 2148 mov(c_rarg2, sp); 2149 mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 2150 // call(c_rarg3); 2151 blrt(c_rarg3, 3, 0, 1); 2152 hlt(0); 2153 } 2154 2155 void MacroAssembler::unimplemented(const char* what) { 2156 const char* buf = NULL; 2157 { 2158 ResourceMark rm; 2159 stringStream ss; 2160 ss.print("unimplemented: %s", what); 2161 buf = code_string(ss.as_string()); 2162 } 2163 stop(buf); 2164 } 2165 2166 // If a constant does not fit in an immediate field, generate some 2167 // number of MOV instructions and then perform the operation. 
2168 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm, 2169 add_sub_imm_insn insn1, 2170 add_sub_reg_insn insn2) { 2171 assert(Rd != zr, "Rd = zr and not setting flags?"); 2172 if (operand_valid_for_add_sub_immediate((int)imm)) { 2173 (this->*insn1)(Rd, Rn, imm); 2174 } else { 2175 if (uabs(imm) < (1 << 24)) { 2176 (this->*insn1)(Rd, Rn, imm & -(1 << 12)); 2177 (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1)); 2178 } else { 2179 assert_different_registers(Rd, Rn); 2180 mov(Rd, (uint64_t)imm); 2181 (this->*insn2)(Rd, Rn, Rd, LSL, 0); 2182 } 2183 } 2184 } 2185 2186 // Seperate vsn which sets the flags. Optimisations are more restricted 2187 // because we must set the flags correctly. 2188 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm, 2189 add_sub_imm_insn insn1, 2190 add_sub_reg_insn insn2) { 2191 if (operand_valid_for_add_sub_immediate((int)imm)) { 2192 (this->*insn1)(Rd, Rn, imm); 2193 } else { 2194 assert_different_registers(Rd, Rn); 2195 assert(Rd != zr, "overflow in immediate operand"); 2196 mov(Rd, (uint64_t)imm); 2197 (this->*insn2)(Rd, Rn, Rd, LSL, 0); 2198 } 2199 } 2200 2201 2202 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) { 2203 if (increment.is_register()) { 2204 add(Rd, Rn, increment.as_register()); 2205 } else { 2206 add(Rd, Rn, increment.as_constant()); 2207 } 2208 } 2209 2210 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) { 2211 if (increment.is_register()) { 2212 addw(Rd, Rn, increment.as_register()); 2213 } else { 2214 addw(Rd, Rn, increment.as_constant()); 2215 } 2216 } 2217 2218 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) { 2219 if (decrement.is_register()) { 2220 sub(Rd, Rn, decrement.as_register()); 2221 } else { 2222 sub(Rd, Rn, decrement.as_constant()); 2223 } 2224 } 2225 2226 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) { 2227 if 
(decrement.is_register()) { 2228 subw(Rd, Rn, decrement.as_register()); 2229 } else { 2230 subw(Rd, Rn, decrement.as_constant()); 2231 } 2232 } 2233 2234 void MacroAssembler::reinit_heapbase() 2235 { 2236 if (UseCompressedOops) { 2237 if (Universe::is_fully_initialized()) { 2238 mov(rheapbase, Universe::narrow_ptrs_base()); 2239 } else { 2240 lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr())); 2241 ldr(rheapbase, Address(rheapbase)); 2242 } 2243 } 2244 } 2245 2246 // this simulates the behaviour of the x86 cmpxchg instruction using a 2247 // load linked/store conditional pair. we use the acquire/release 2248 // versions of these instructions so that we flush pending writes as 2249 // per Java semantics. 2250 2251 // n.b the x86 version assumes the old value to be compared against is 2252 // in rax and updates rax with the value located in memory if the 2253 // cmpxchg fails. we supply a register for the old value explicitly 2254 2255 // the aarch64 load linked/store conditional instructions do not 2256 // accept an offset. so, unlike x86, we must provide a plain register 2257 // to identify the memory word to be compared/exchanged rather than a 2258 // register+offset Address. 

// 64-bit compare-and-exchange of the word at [addr].
// On success control branches to 'succeed'. On failure oldv is updated
// with the value found in memory, a full barrier is emitted, and control
// either branches to *fail or, if fail is NULL, falls through.
// tmp is clobbered.
void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
                                Label &succeed, Label *fail) {
  // oldv holds comparison value
  // newv holds value to write in exchange
  // addr identifies memory word to compare against/update
  if (UseLSE) {
    // casal overwrites oldv with the value it found in memory, so keep a
    // copy of the expected value in tmp to compare against afterwards.
    mov(tmp, oldv);
    casal(Assembler::xword, oldv, newv, addr);
    cmp(tmp, oldv);
    br(Assembler::EQ, succeed);
    membar(AnyAny);
  } else {
    Label retry_load, nope;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    // flush and load exclusive from the memory location
    // and fail if it is not what we expect
    ldaxr(tmp, addr);
    cmp(tmp, oldv);
    br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
    stlxr(tmp, newv, addr);
    cbzw(tmp, succeed);
    // retry so we only ever return after a load fails to compare
    // ensures we don't return a stale value after a failed write.
    b(retry_load);
    // if the memory word differs we return it in oldv and signal a fail
    bind(nope);
    membar(AnyAny);
    mov(oldv, tmp);
  }
  if (fail)
    b(*fail);
}

// Compare-and-exchange on an object's mark word. The object address is
// used directly as the memory address, which is valid because the mark
// word is asserted to live at offset 0.
void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
                                        Label &succeed, Label *fail) {
  assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
  cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
}

// 32-bit variant of cmpxchgptr; same success/failure protocol.
void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
                              Label &succeed, Label *fail) {
  // oldv holds comparison value
  // newv holds value to write in exchange
  // addr identifies memory word to compare against/update
  // tmp is a scratch register: on the LSE path it holds a copy of the
  // expected value; on the LL/SC path it holds the loaded value and then
  // the store-exclusive status (zero when the store succeeded)
  if (UseLSE) {
    // casal overwrites oldv with the value it found in memory, so keep a
    // copy of the expected value in tmp to compare against afterwards.
    mov(tmp, oldv);
    casal(Assembler::word, oldv, newv, addr);
    cmp(tmp, oldv);
    br(Assembler::EQ, succeed);
    membar(AnyAny);
  } else {
    Label retry_load, nope;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    // flush and load exclusive from the memory location
    // and fail if it is not what we expect
    ldaxrw(tmp, addr);
    cmp(tmp, oldv);
    br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
    stlxrw(tmp, newv, addr);
    cbzw(tmp, succeed);
    // retry so we only ever return after a load fails to compare
    // ensures we don't return a stale value after a failed write.
    b(retry_load);
    // if the memory word differs we return it in oldv and signal a fail
    bind(nope);
    membar(AnyAny);
    mov(oldv, tmp);
  }
  if (fail)
    b(*fail);
}

// A generic CAS; success or failure is in the EQ flag.  A weak CAS
// doesn't retry and may fail spuriously.  If the oldval is wanted,
// Pass a register for the result, otherwise pass noreg.
2342 2343 // Clobbers rscratch1 2344 void MacroAssembler::cmpxchg(Register addr, Register expected, 2345 Register new_val, 2346 enum operand_size size, 2347 bool acquire, bool release, 2348 bool weak, 2349 Register result) { 2350 if (result == noreg) result = rscratch1; 2351 if (UseLSE) { 2352 mov(result, expected); 2353 lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true); 2354 cmp(result, expected); 2355 } else { 2356 BLOCK_COMMENT("cmpxchg {"); 2357 Label retry_load, done; 2358 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) 2359 prfm(Address(addr), PSTL1STRM); 2360 bind(retry_load); 2361 load_exclusive(result, addr, size, acquire); 2362 if (size == xword) 2363 cmp(result, expected); 2364 else 2365 cmpw(result, expected); 2366 br(Assembler::NE, done); 2367 store_exclusive(rscratch1, new_val, addr, size, release); 2368 if (weak) { 2369 cmpw(rscratch1, 0u); // If the store fails, return NE to our caller. 2370 } else { 2371 cbnzw(rscratch1, retry_load); 2372 } 2373 bind(done); 2374 BLOCK_COMMENT("} cmpxchg"); 2375 } 2376 } 2377 2378 static bool different(Register a, RegisterOrConstant b, Register c) { 2379 if (b.is_constant()) 2380 return a != c; 2381 else 2382 return a != b.as_register() && a != c && b.as_register() != c; 2383 } 2384 2385 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz) \ 2386 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \ 2387 if (UseLSE) { \ 2388 prev = prev->is_valid() ? prev : zr; \ 2389 if (incr.is_register()) { \ 2390 AOP(sz, incr.as_register(), prev, addr); \ 2391 } else { \ 2392 mov(rscratch2, incr.as_constant()); \ 2393 AOP(sz, rscratch2, prev, addr); \ 2394 } \ 2395 return; \ 2396 } \ 2397 Register result = rscratch2; \ 2398 if (prev->is_valid()) \ 2399 result = different(prev, incr, addr) ? 
prev : rscratch2; \ 2400 \ 2401 Label retry_load; \ 2402 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) \ 2403 prfm(Address(addr), PSTL1STRM); \ 2404 bind(retry_load); \ 2405 LDXR(result, addr); \ 2406 OP(rscratch1, result, incr); \ 2407 STXR(rscratch2, rscratch1, addr); \ 2408 cbnzw(rscratch2, retry_load); \ 2409 if (prev->is_valid() && prev != result) { \ 2410 IOP(prev, rscratch1, incr); \ 2411 } \ 2412 } 2413 2414 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword) 2415 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word) 2416 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword) 2417 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word) 2418 2419 #undef ATOMIC_OP 2420 2421 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz) \ 2422 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \ 2423 if (UseLSE) { \ 2424 prev = prev->is_valid() ? prev : zr; \ 2425 AOP(sz, newv, prev, addr); \ 2426 return; \ 2427 } \ 2428 Register result = rscratch2; \ 2429 if (prev->is_valid()) \ 2430 result = different(prev, newv, addr) ? 
prev : rscratch2; \ 2431 \ 2432 Label retry_load; \ 2433 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) \ 2434 prfm(Address(addr), PSTL1STRM); \ 2435 bind(retry_load); \ 2436 LDXR(result, addr); \ 2437 STXR(rscratch1, newv, addr); \ 2438 cbnzw(rscratch1, retry_load); \ 2439 if (prev->is_valid() && prev != result) \ 2440 mov(prev, result); \ 2441 } 2442 2443 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword) 2444 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word) 2445 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword) 2446 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word) 2447 2448 #undef ATOMIC_XCHG 2449 2450 #ifndef PRODUCT 2451 extern "C" void findpc(intptr_t x); 2452 #endif 2453 2454 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) 2455 { 2456 // In order to get locks to work, we need to fake a in_VM state 2457 if (ShowMessageBoxOnError ) { 2458 JavaThread* thread = JavaThread::current(); 2459 JavaThreadState saved_state = thread->thread_state(); 2460 thread->set_thread_state(_thread_in_vm); 2461 #ifndef PRODUCT 2462 if (CountBytecodes || TraceBytecodes || StopInterpreterAt) { 2463 ttyLocker ttyl; 2464 BytecodeCounter::print(); 2465 } 2466 #endif 2467 if (os::message_box(msg, "Execution stopped, print registers?")) { 2468 ttyLocker ttyl; 2469 tty->print_cr(" pc = 0x%016lx", pc); 2470 #ifndef PRODUCT 2471 tty->cr(); 2472 findpc(pc); 2473 tty->cr(); 2474 #endif 2475 tty->print_cr(" r0 = 0x%016lx", regs[0]); 2476 tty->print_cr(" r1 = 0x%016lx", regs[1]); 2477 tty->print_cr(" r2 = 0x%016lx", regs[2]); 2478 tty->print_cr(" r3 = 0x%016lx", regs[3]); 2479 tty->print_cr(" r4 = 0x%016lx", regs[4]); 2480 tty->print_cr(" r5 = 0x%016lx", regs[5]); 2481 tty->print_cr(" r6 = 0x%016lx", regs[6]); 2482 tty->print_cr(" r7 = 0x%016lx", regs[7]); 2483 tty->print_cr(" r8 = 0x%016lx", regs[8]); 2484 tty->print_cr(" r9 = 0x%016lx", regs[9]); 2485 tty->print_cr("r10 = 0x%016lx", regs[10]); 2486 tty->print_cr("r11 = 0x%016lx", 
regs[11]); 2487 tty->print_cr("r12 = 0x%016lx", regs[12]); 2488 tty->print_cr("r13 = 0x%016lx", regs[13]); 2489 tty->print_cr("r14 = 0x%016lx", regs[14]); 2490 tty->print_cr("r15 = 0x%016lx", regs[15]); 2491 tty->print_cr("r16 = 0x%016lx", regs[16]); 2492 tty->print_cr("r17 = 0x%016lx", regs[17]); 2493 tty->print_cr("r18 = 0x%016lx", regs[18]); 2494 tty->print_cr("r19 = 0x%016lx", regs[19]); 2495 tty->print_cr("r20 = 0x%016lx", regs[20]); 2496 tty->print_cr("r21 = 0x%016lx", regs[21]); 2497 tty->print_cr("r22 = 0x%016lx", regs[22]); 2498 tty->print_cr("r23 = 0x%016lx", regs[23]); 2499 tty->print_cr("r24 = 0x%016lx", regs[24]); 2500 tty->print_cr("r25 = 0x%016lx", regs[25]); 2501 tty->print_cr("r26 = 0x%016lx", regs[26]); 2502 tty->print_cr("r27 = 0x%016lx", regs[27]); 2503 tty->print_cr("r28 = 0x%016lx", regs[28]); 2504 tty->print_cr("r30 = 0x%016lx", regs[30]); 2505 tty->print_cr("r31 = 0x%016lx", regs[31]); 2506 BREAKPOINT; 2507 } 2508 ThreadStateTransition::transition(thread, _thread_in_vm, saved_state); 2509 } else { 2510 ttyLocker ttyl; 2511 ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", 2512 msg); 2513 assert(false, "DEBUG MESSAGE: %s", msg); 2514 } 2515 } 2516 2517 #ifdef BUILTIN_SIM 2518 // routine to generate an x86 prolog for a stub function which 2519 // bootstraps into the generated ARM code which directly follows the 2520 // stub 2521 // 2522 // the argument encodes the number of general and fp registers 2523 // passed by the caller and the callng convention (currently just 2524 // the number of general registers and assumes C argument passing) 2525 2526 extern "C" { 2527 int aarch64_stub_prolog_size(); 2528 void aarch64_stub_prolog(); 2529 void aarch64_prolog(); 2530 } 2531 2532 void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type, 2533 address *prolog_ptr) 2534 { 2535 int calltype = (((ret_type & 0x3) << 8) | 2536 ((fp_arg_count & 0xf) << 4) | 2537 (gp_arg_count & 0xf)); 2538 2539 // the 
addresses for the x86 to ARM entry code we need to use 2540 address start = pc(); 2541 // printf("start = %lx\n", start); 2542 int byteCount = aarch64_stub_prolog_size(); 2543 // printf("byteCount = %x\n", byteCount); 2544 int instructionCount = (byteCount + 3)/ 4; 2545 // printf("instructionCount = %x\n", instructionCount); 2546 for (int i = 0; i < instructionCount; i++) { 2547 nop(); 2548 } 2549 2550 memcpy(start, (void*)aarch64_stub_prolog, byteCount); 2551 2552 // write the address of the setup routine and the call format at the 2553 // end of into the copied code 2554 u_int64_t *patch_end = (u_int64_t *)(start + byteCount); 2555 if (prolog_ptr) 2556 patch_end[-2] = (u_int64_t)prolog_ptr; 2557 patch_end[-1] = calltype; 2558 } 2559 #endif 2560 2561 void MacroAssembler::push_call_clobbered_registers() { 2562 int step = 4 * wordSize; 2563 push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp); 2564 sub(sp, sp, step); 2565 mov(rscratch1, -step); 2566 // Push v0-v7, v16-v31. 2567 for (int i = 31; i>= 4; i -= 4) { 2568 if (i <= v7->encoding() || i >= v16->encoding()) 2569 st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1), 2570 as_FloatRegister(i), T1D, Address(post(sp, rscratch1))); 2571 } 2572 st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2), 2573 as_FloatRegister(3), T1D, Address(sp)); 2574 } 2575 2576 void MacroAssembler::pop_call_clobbered_registers() { 2577 for (int i = 0; i < 32; i += 4) { 2578 if (i <= v7->encoding() || i >= v16->encoding()) 2579 ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), 2580 as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize))); 2581 } 2582 2583 pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp); 2584 } 2585 2586 void MacroAssembler::push_CPU_state(bool save_vectors) { 2587 int step = (save_vectors ? 
8 : 4) * wordSize; 2588 push(0x3fffffff, sp); // integer registers except lr & sp 2589 mov(rscratch1, -step); 2590 sub(sp, sp, step); 2591 for (int i = 28; i >= 4; i -= 4) { 2592 st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), 2593 as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1))); 2594 } 2595 st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp); 2596 } 2597 2598 void MacroAssembler::pop_CPU_state(bool restore_vectors) { 2599 int step = (restore_vectors ? 8 : 4) * wordSize; 2600 for (int i = 0; i <= 28; i += 4) 2601 ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), 2602 as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step))); 2603 pop(0x3fffffff, sp); // integer registers except lr & sp 2604 } 2605 2606 /** 2607 * Helpers for multiply_to_len(). 2608 */ 2609 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo, 2610 Register src1, Register src2) { 2611 adds(dest_lo, dest_lo, src1); 2612 adc(dest_hi, dest_hi, zr); 2613 adds(dest_lo, dest_lo, src2); 2614 adc(final_dest_hi, dest_hi, zr); 2615 } 2616 2617 // Generate an address from (r + r1 extend offset). "size" is the 2618 // size of the operand. The result may be in rscratch2. 2619 Address MacroAssembler::offsetted_address(Register r, Register r1, 2620 Address::extend ext, int offset, int size) { 2621 if (offset || (ext.shift() % size != 0)) { 2622 lea(rscratch2, Address(r, r1, ext)); 2623 return Address(rscratch2, offset); 2624 } else { 2625 return Address(r, r1, ext); 2626 } 2627 } 2628 2629 Address MacroAssembler::spill_address(int size, int offset, Register tmp) 2630 { 2631 assert(offset >= 0, "spill to negative address?"); 2632 // Offset reachable ? 
2633 // Not aligned - 9 bits signed offset 2634 // Aligned - 12 bits unsigned offset shifted 2635 Register base = sp; 2636 if ((offset & (size-1)) && offset >= (1<<8)) { 2637 add(tmp, base, offset & ((1<<12)-1)); 2638 base = tmp; 2639 offset &= -1<<12; 2640 } 2641 2642 if (offset >= (1<<12) * size) { 2643 add(tmp, base, offset & (((1<<12)-1)<<12)); 2644 base = tmp; 2645 offset &= ~(((1<<12)-1)<<12); 2646 } 2647 2648 return Address(base, offset); 2649 } 2650 2651 // Checks whether offset is aligned. 2652 // Returns true if it is, else false. 2653 bool MacroAssembler::merge_alignment_check(Register base, 2654 size_t size, 2655 long cur_offset, 2656 long prev_offset) const { 2657 if (AvoidUnalignedAccesses) { 2658 if (base == sp) { 2659 // Checks whether low offset if aligned to pair of registers. 2660 long pair_mask = size * 2 - 1; 2661 long offset = prev_offset > cur_offset ? cur_offset : prev_offset; 2662 return (offset & pair_mask) == 0; 2663 } else { // If base is not sp, we can't guarantee the access is aligned. 2664 return false; 2665 } 2666 } else { 2667 long mask = size - 1; 2668 // Load/store pair instruction only supports element size aligned offset. 2669 return (cur_offset & mask) == 0 && (prev_offset & mask) == 0; 2670 } 2671 } 2672 2673 // Checks whether current and previous loads/stores can be merged. 2674 // Returns true if it can be merged, else false. 
2675 bool MacroAssembler::ldst_can_merge(Register rt, 2676 const Address &adr, 2677 size_t cur_size_in_bytes, 2678 bool is_store) const { 2679 address prev = pc() - NativeInstruction::instruction_size; 2680 address last = code()->last_insn(); 2681 2682 if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) { 2683 return false; 2684 } 2685 2686 if (adr.getMode() != Address::base_plus_offset || prev != last) { 2687 return false; 2688 } 2689 2690 NativeLdSt* prev_ldst = NativeLdSt_at(prev); 2691 size_t prev_size_in_bytes = prev_ldst->size_in_bytes(); 2692 2693 assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging."); 2694 assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging."); 2695 2696 if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) { 2697 return false; 2698 } 2699 2700 long max_offset = 63 * prev_size_in_bytes; 2701 long min_offset = -64 * prev_size_in_bytes; 2702 2703 assert(prev_ldst->is_not_pre_post_index(), "pre-index or post-index is not supported to be merged."); 2704 2705 // Only same base can be merged. 2706 if (adr.base() != prev_ldst->base()) { 2707 return false; 2708 } 2709 2710 long cur_offset = adr.offset(); 2711 long prev_offset = prev_ldst->offset(); 2712 size_t diff = abs(cur_offset - prev_offset); 2713 if (diff != prev_size_in_bytes) { 2714 return false; 2715 } 2716 2717 // Following cases can not be merged: 2718 // ldr x2, [x2, #8] 2719 // ldr x3, [x2, #16] 2720 // or: 2721 // ldr x2, [x3, #8] 2722 // ldr x2, [x3, #16] 2723 // If t1 and t2 is the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL. 2724 if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) { 2725 return false; 2726 } 2727 2728 long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset; 2729 // Offset range must be in ldp/stp instruction's range. 
2730 if (low_offset > max_offset || low_offset < min_offset) { 2731 return false; 2732 } 2733 2734 if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) { 2735 return true; 2736 } 2737 2738 return false; 2739 } 2740 2741 // Merge current load/store with previous load/store into ldp/stp. 2742 void MacroAssembler::merge_ldst(Register rt, 2743 const Address &adr, 2744 size_t cur_size_in_bytes, 2745 bool is_store) { 2746 2747 assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store) == true, "cur and prev must be able to be merged."); 2748 2749 Register rt_low, rt_high; 2750 address prev = pc() - NativeInstruction::instruction_size; 2751 NativeLdSt* prev_ldst = NativeLdSt_at(prev); 2752 2753 long offset; 2754 2755 if (adr.offset() < prev_ldst->offset()) { 2756 offset = adr.offset(); 2757 rt_low = rt; 2758 rt_high = prev_ldst->target(); 2759 } else { 2760 offset = prev_ldst->offset(); 2761 rt_low = prev_ldst->target(); 2762 rt_high = rt; 2763 } 2764 2765 Address adr_p = Address(prev_ldst->base(), offset); 2766 // Overwrite previous generated binary. 2767 code_section()->set_end(prev); 2768 2769 const int sz = prev_ldst->size_in_bytes(); 2770 assert(sz == 8 || sz == 4, "only supports 64/32bit merging."); 2771 if (!is_store) { 2772 BLOCK_COMMENT("merged ldr pair"); 2773 if (sz == 8) { 2774 ldp(rt_low, rt_high, adr_p); 2775 } else { 2776 ldpw(rt_low, rt_high, adr_p); 2777 } 2778 } else { 2779 BLOCK_COMMENT("merged str pair"); 2780 if (sz == 8) { 2781 stp(rt_low, rt_high, adr_p); 2782 } else { 2783 stpw(rt_low, rt_high, adr_p); 2784 } 2785 } 2786 } 2787 2788 /** 2789 * Multiply 64 bit by 64 bit first loop. 
*/
// Emits the first pass of the multiplication: every element of y is
// multiplied by the top element(s) of x and the running 128-bit product is
// written to z. Operands are arrays of 32-bit ints; two ints are processed
// per iteration as one 64-bit value, with single-int fallbacks (L_one_x,
// L_one_y) when only one int remains. Uses rscratch1 as a temporary and
// modifies the condition flags.
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
                                           Register y, Register y_idx, Register z,
                                           Register carry, Register product,
                                           Register idx, Register kdx) {
  //
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;
  //

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  // xstart went negative: only a single 32-bit int of x is left.
  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_one_x);

  lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
  ldr(x_xstart, Address(rscratch1));
  ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian

  bind(L_first_loop);
  // idx is decremented twice per iteration: the first check detects that y
  // is exhausted, the second that only one 32-bit int of y remains.
  subsw(idx, idx, 1);
  br(Assembler::MI, L_first_loop_exit);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_one_y);
  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(y_idx, Address(rscratch1));
  ror(y_idx, y_idx, 32); // convert big-endian to little-endian
  bind(L_multiply);

  // AArch64 has a multiply-accumulate instruction that we can't use
  // here because it has no way to process carries, so we have to use
  // separate add and adc instructions.  Bah.
  umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
  mul(product, x_xstart, y_idx);
  adds(product, product, carry);
  adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product

  subw(kdx, kdx, 2);
  ror(product, product, 32); // back to big-endian
  str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));

  b(L_first_loop);

  bind(L_one_y);
  ldrw(y_idx, Address(y,  0));
  b(L_multiply);

  bind(L_one_x);
  ldrw(x_xstart, Address(x,  0));
  b(L_first_loop);

  bind(L_first_loop_exit);
}

/**
 * Multiply 128 bit by 128. Unrolled inner loop.
 *
 * Emits the inner ("third") loop: z[...] += y[...] * product_hi + carry,
 * handling four ints (two 64-bit values) per iteration, followed by
 * tail code for a remaining pair of ints and a remaining single int.
 * Uses rscratch1 and rscratch2 as temporaries and modifies the condition
 * flags.
 */
void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
                                             Register carry, Register carry2,
                                             Register idx, Register jdx,
                                             Register yz_idx1, Register yz_idx2,
                                             Register tmp, Register tmp3, Register tmp4,
                                             Register tmp6, Register product_hi) {

  //   jlong carry, x[], y[], z[];
  //   int kdx = ystart+1;
  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
  //     jlong carry2  = (jlong)(tmp3 >>> 64);
  //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
  //     carry  = (jlong)(tmp4 >>> 64);
  //     z[kdx+idx+1] = (jlong)tmp3;
  //     z[kdx+idx] = (jlong)tmp4;
  //   }
  //   idx += 2;
  //   if (idx > 0) {
  //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
  //     z[kdx+idx] = (jlong)yz_idx1;
  //     carry  = (jlong)(yz_idx1 >>> 64);
  //   }
  //

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;

  // jdx = number of full four-int iterations.
  lsrw(jdx, idx, 2);

  bind(L_third_loop);

  subsw(jdx, jdx, 1);
  br(Assembler::MI, L_third_loop_exit);
  subw(idx, idx, 4);

  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));

  ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));

  lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
  ror(yz_idx2, yz_idx2, 32);

  ldp(rscratch2, rscratch1, Address(tmp6, 0));

  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
  umulh(tmp4, product_hi, yz_idx1);

  ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
  ror(rscratch2, rscratch2, 32);

  mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
  umulh(carry2, product_hi, yz_idx2);

  // propagate sum of both multiplications into carry:tmp4:tmp3
  adds(tmp3, tmp3, carry);
  adc(tmp4, tmp4, zr);
  adds(tmp3, tmp3, rscratch1);
  adcs(tmp4, tmp4, tmp);
  adc(carry, carry2, zr);
  adds(tmp4, tmp4, rscratch2);
  adc(carry, carry, zr);

  ror(tmp3, tmp3, 32); // convert little-endian to big-endian
  ror(tmp4, tmp4, 32);
  stp(tmp4, tmp3, Address(tmp6, 0));

  b(L_third_loop);
  bind (L_third_loop_exit);

  // 0..3 ints are left over after the unrolled loop.
  andw (idx, idx, 0x3);
  cbz(idx, L_post_third_loop_done);

  // Tail: one remaining pair of ints, handled as a single 64-bit value.
  Label L_check_1;
  subsw(idx, idx, 2);
  br(Assembler::MI, L_check_1);

  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(yz_idx1, Address(rscratch1, 0));
  ror(yz_idx1, yz_idx1, 32);
  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
  umulh(tmp4, product_hi, yz_idx1);
  lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
  ldr(yz_idx2, Address(rscratch1, 0));
  ror(yz_idx2, yz_idx2, 32);

  add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);

  ror(tmp3, tmp3, 32);
  str(tmp3, Address(rscratch1, 0));

  bind (L_check_1);

  // Tail: one remaining single int.
  andw (idx, idx, 0x1);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_post_third_loop_done);
  ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
  umulh(carry2, tmp4, product_hi);
  ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  add2_with_carry(carry2, tmp3, tmp4, carry);

  strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
  extr(carry, carry2, tmp3, 32);

  bind(L_post_third_loop_done);
}

/**
 * Code for BigInteger::multiplyToLen() intrinsic.
 *
 * r0: x
 * r1: xlen
 * r2: y
 * r3: ylen
 * r4: z
 * r5: zlen
 * r10: tmp1
 * r11: tmp2
 * r12: tmp3
 * r13: tmp4
 * r14: tmp5
 * r15: tmp6
 * r16: tmp7
 *
 * Note that xlen and zlen are reused as temporaries (product and x_xstart)
 * once their values have been copied, and that z is temporarily modified
 * and restored via the stack around the inner loop.
 */
void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6, Register product_hi) {

  assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product  = xlen;
  const Register x_xstart = zlen;  // reuse register

  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;
  //

  movw(idx, ylen);      // idx = ylen;
  movw(kdx, zlen);      // kdx = xlen+ylen;
  mov(carry, zr);       // carry = 0;

  Label L_done;

  movw(xstart, xlen);
  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_done);

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);

  // Store the carry left over from the first loop into the one or two
  // remaining int slots of z below kdx.
  Label L_second_loop;
  cbzw(kdx, L_second_loop);

  Label L_carry;
  subw(kdx, kdx, 1);
  cbzw(kdx, L_carry);

  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
  lsr(carry, carry, 32);
  subw(kdx, kdx, 1);

  bind(L_carry);
  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));

  // Second and third (nested) loops.
  //
  // for (int i = xstart-1; i >= 0; i--) { // Second loop
  //   carry = 0;
  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                    (z[k] & LONG_MASK) + carry;
  //     z[k] = (int)product;
  //     carry = product >>> 32;
  //   }
  //   z[i] = (int)carry;
  // }
  //
  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi

  const Register jdx = tmp1;

  bind(L_second_loop);
  mov(carry, zr);                // carry = 0;
  movw(jdx, ylen);               // j = ystart+1

  subsw(xstart, xstart, 1);      // i = xstart-1;
  br(Assembler::MI, L_done);

  // Reserve four stack words and save z in the first; ylen, x and xstart
  // are stored into the remaining three at L_third_loop_prologue.
  str(z, Address(pre(sp, -4 * wordSize)));

  Label L_last_x;
  lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
  subsw(xstart, xstart, 1);       // i = xstart-1;
  br(Assembler::MI, L_last_x);

  lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
  ldr(product_hi, Address(rscratch1));
  ror(product_hi, product_hi, 32);  // convert big-endian to little-endian

  Label L_third_loop_prologue;
  bind(L_third_loop_prologue);

  str(ylen, Address(sp, wordSize));
  stp(x, xstart, Address(sp, 2 * wordSize));
  // x and ylen are saved above, so they can be handed to the inner loop as
  // its carry2 and jdx temporaries.
  multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
                          tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
  ldp(z, ylen, Address(post(sp, 2 * wordSize)));
  ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen

  addw(tmp3, xlen, 1);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  subsw(tmp3, tmp3, 1);
  br(Assembler::MI, L_done);

  lsr(carry, carry, 32);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  b(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  ldrw(product_hi, Address(x, 0));
  b(L_third_loop_prologue);

  bind(L_done);
}

// Code for BigInteger::mulAdd intrinsic
// out     = r0
// in      = r1
// offset  = r2  (already out.length-offset)
// len     = r3
// k       = r4
//
// pseudo code from java implementation:
// carry = 0;
// offset = out.length-offset - 1;
// for (int j=len-1; j >= 0; j--) {
//     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
//     out[offset--] = (int)product;
//     carry = product >>> 32;
// }
// return (int)carry;
//
// On exit 'out' holds the carry (zero when len is zero); 'in', 'offset' and
// 'len' are modified, and rscratch1/rscratch2 are used as temporaries.
void MacroAssembler::mul_add(Register out, Register in, Register offset,
      Register len, Register k) {
    Label LOOP, END;
    // pre-loop
    cmp(len, zr); // cmp, not cbz/cbnz: to use condition twice => less branches
    csel(out, zr, out, Assembler::EQ);
    br(Assembler::EQ, END);
    add(in, in, len, LSL, 2); // in[j+1] address
    add(offset, out, offset, LSL, 2); // out[offset + 1] address
    mov(out, zr); // used to keep carry now
    BIND(LOOP);
    ldrw(rscratch1, Address(pre(in, -4)));
    madd(rscratch1, rscratch1, k, out);
    ldrw(rscratch2, Address(pre(offset, -4)));
    add(rscratch1, rscratch1, rscratch2);
    strw(rscratch1, Address(offset));
    lsr(out, rscratch1, 32);
    subs(len, len, 1);
    br(Assembler::NE, LOOP);
    BIND(END);
}

/**
 * Emits code to update CRC-32 with a byte value according to constants in table
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]val       Register containing the byte to fold into the CRC.
 * @param [in]table     Register containing the table of crc constants.
3144 * 3145 * uint32_t crc; 3146 * val = crc_table[(val ^ crc) & 0xFF]; 3147 * crc = val ^ (crc >> 8); 3148 * 3149 */ 3150 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 3151 eor(val, val, crc); 3152 andr(val, val, 0xff); 3153 ldrw(val, Address(table, val, Address::lsl(2))); 3154 eor(crc, val, crc, Assembler::LSR, 8); 3155 } 3156 3157 /** 3158 * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3 3159 * 3160 * @param [in,out]crc Register containing the crc. 3161 * @param [in]v Register containing the 32-bit to fold into the CRC. 3162 * @param [in]table0 Register containing table 0 of crc constants. 3163 * @param [in]table1 Register containing table 1 of crc constants. 3164 * @param [in]table2 Register containing table 2 of crc constants. 3165 * @param [in]table3 Register containing table 3 of crc constants. 3166 * 3167 * uint32_t crc; 3168 * v = crc ^ v 3169 * crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24] 3170 * 3171 */ 3172 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp, 3173 Register table0, Register table1, Register table2, Register table3, 3174 bool upper) { 3175 eor(v, crc, v, upper ? LSR:LSL, upper ? 
32:0); 3176 uxtb(tmp, v); 3177 ldrw(crc, Address(table3, tmp, Address::lsl(2))); 3178 ubfx(tmp, v, 8, 8); 3179 ldrw(tmp, Address(table2, tmp, Address::lsl(2))); 3180 eor(crc, crc, tmp); 3181 ubfx(tmp, v, 16, 8); 3182 ldrw(tmp, Address(table1, tmp, Address::lsl(2))); 3183 eor(crc, crc, tmp); 3184 ubfx(tmp, v, 24, 8); 3185 ldrw(tmp, Address(table0, tmp, Address::lsl(2))); 3186 eor(crc, crc, tmp); 3187 } 3188 3189 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf, 3190 Register len, Register tmp0, Register tmp1, Register tmp2, 3191 Register tmp3) { 3192 Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit; 3193 assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3); 3194 3195 mvnw(crc, crc); 3196 3197 subs(len, len, 128); 3198 br(Assembler::GE, CRC_by64_pre); 3199 BIND(CRC_less64); 3200 adds(len, len, 128-32); 3201 br(Assembler::GE, CRC_by32_loop); 3202 BIND(CRC_less32); 3203 adds(len, len, 32-4); 3204 br(Assembler::GE, CRC_by4_loop); 3205 adds(len, len, 4); 3206 br(Assembler::GT, CRC_by1_loop); 3207 b(L_exit); 3208 3209 BIND(CRC_by32_loop); 3210 ldp(tmp0, tmp1, Address(post(buf, 16))); 3211 subs(len, len, 32); 3212 crc32x(crc, crc, tmp0); 3213 ldr(tmp2, Address(post(buf, 8))); 3214 crc32x(crc, crc, tmp1); 3215 ldr(tmp3, Address(post(buf, 8))); 3216 crc32x(crc, crc, tmp2); 3217 crc32x(crc, crc, tmp3); 3218 br(Assembler::GE, CRC_by32_loop); 3219 cmn(len, 32); 3220 br(Assembler::NE, CRC_less32); 3221 b(L_exit); 3222 3223 BIND(CRC_by4_loop); 3224 ldrw(tmp0, Address(post(buf, 4))); 3225 subs(len, len, 4); 3226 crc32w(crc, crc, tmp0); 3227 br(Assembler::GE, CRC_by4_loop); 3228 adds(len, len, 4); 3229 br(Assembler::LE, L_exit); 3230 BIND(CRC_by1_loop); 3231 ldrb(tmp0, Address(post(buf, 1))); 3232 subs(len, len, 1); 3233 crc32b(crc, crc, tmp0); 3234 br(Assembler::GT, CRC_by1_loop); 3235 b(L_exit); 3236 3237 BIND(CRC_by64_pre); 3238 sub(buf, buf, 8); 3239 ldp(tmp0, tmp1, Address(buf, 
8)); 3240 crc32x(crc, crc, tmp0); 3241 ldr(tmp2, Address(buf, 24)); 3242 crc32x(crc, crc, tmp1); 3243 ldr(tmp3, Address(buf, 32)); 3244 crc32x(crc, crc, tmp2); 3245 ldr(tmp0, Address(buf, 40)); 3246 crc32x(crc, crc, tmp3); 3247 ldr(tmp1, Address(buf, 48)); 3248 crc32x(crc, crc, tmp0); 3249 ldr(tmp2, Address(buf, 56)); 3250 crc32x(crc, crc, tmp1); 3251 ldr(tmp3, Address(pre(buf, 64))); 3252 3253 b(CRC_by64_loop); 3254 3255 align(CodeEntryAlignment); 3256 BIND(CRC_by64_loop); 3257 subs(len, len, 64); 3258 crc32x(crc, crc, tmp2); 3259 ldr(tmp0, Address(buf, 8)); 3260 crc32x(crc, crc, tmp3); 3261 ldr(tmp1, Address(buf, 16)); 3262 crc32x(crc, crc, tmp0); 3263 ldr(tmp2, Address(buf, 24)); 3264 crc32x(crc, crc, tmp1); 3265 ldr(tmp3, Address(buf, 32)); 3266 crc32x(crc, crc, tmp2); 3267 ldr(tmp0, Address(buf, 40)); 3268 crc32x(crc, crc, tmp3); 3269 ldr(tmp1, Address(buf, 48)); 3270 crc32x(crc, crc, tmp0); 3271 ldr(tmp2, Address(buf, 56)); 3272 crc32x(crc, crc, tmp1); 3273 ldr(tmp3, Address(pre(buf, 64))); 3274 br(Assembler::GE, CRC_by64_loop); 3275 3276 // post-loop 3277 crc32x(crc, crc, tmp2); 3278 crc32x(crc, crc, tmp3); 3279 3280 sub(len, len, 64); 3281 add(buf, buf, 8); 3282 cmn(len, 128); 3283 br(Assembler::NE, CRC_less64); 3284 BIND(L_exit); 3285 mvnw(crc, crc); 3286 } 3287 3288 /** 3289 * @param crc register containing existing CRC (32-bit) 3290 * @param buf register pointing to input byte buffer (byte*) 3291 * @param len register containing number of bytes 3292 * @param table register that will contain address of CRC table 3293 * @param tmp scratch register 3294 */ 3295 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, 3296 Register table0, Register table1, Register table2, Register table3, 3297 Register tmp, Register tmp2, Register tmp3) { 3298 Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit; 3299 unsigned long offset; 3300 3301 if (UseCRC32) { 3302 kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, 
table3); 3303 return; 3304 } 3305 3306 mvnw(crc, crc); 3307 3308 adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset); 3309 if (offset) add(table0, table0, offset); 3310 add(table1, table0, 1*256*sizeof(juint)); 3311 add(table2, table0, 2*256*sizeof(juint)); 3312 add(table3, table0, 3*256*sizeof(juint)); 3313 3314 if (UseNeon) { 3315 cmp(len, 64); 3316 br(Assembler::LT, L_by16); 3317 eor(v16, T16B, v16, v16); 3318 3319 Label L_fold; 3320 3321 add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants 3322 3323 ld1(v0, v1, T2D, post(buf, 32)); 3324 ld1r(v4, T2D, post(tmp, 8)); 3325 ld1r(v5, T2D, post(tmp, 8)); 3326 ld1r(v6, T2D, post(tmp, 8)); 3327 ld1r(v7, T2D, post(tmp, 8)); 3328 mov(v16, T4S, 0, crc); 3329 3330 eor(v0, T16B, v0, v16); 3331 sub(len, len, 64); 3332 3333 BIND(L_fold); 3334 pmull(v22, T8H, v0, v5, T8B); 3335 pmull(v20, T8H, v0, v7, T8B); 3336 pmull(v23, T8H, v0, v4, T8B); 3337 pmull(v21, T8H, v0, v6, T8B); 3338 3339 pmull2(v18, T8H, v0, v5, T16B); 3340 pmull2(v16, T8H, v0, v7, T16B); 3341 pmull2(v19, T8H, v0, v4, T16B); 3342 pmull2(v17, T8H, v0, v6, T16B); 3343 3344 uzp1(v24, T8H, v20, v22); 3345 uzp2(v25, T8H, v20, v22); 3346 eor(v20, T16B, v24, v25); 3347 3348 uzp1(v26, T8H, v16, v18); 3349 uzp2(v27, T8H, v16, v18); 3350 eor(v16, T16B, v26, v27); 3351 3352 ushll2(v22, T4S, v20, T8H, 8); 3353 ushll(v20, T4S, v20, T4H, 8); 3354 3355 ushll2(v18, T4S, v16, T8H, 8); 3356 ushll(v16, T4S, v16, T4H, 8); 3357 3358 eor(v22, T16B, v23, v22); 3359 eor(v18, T16B, v19, v18); 3360 eor(v20, T16B, v21, v20); 3361 eor(v16, T16B, v17, v16); 3362 3363 uzp1(v17, T2D, v16, v20); 3364 uzp2(v21, T2D, v16, v20); 3365 eor(v17, T16B, v17, v21); 3366 3367 ushll2(v20, T2D, v17, T4S, 16); 3368 ushll(v16, T2D, v17, T2S, 16); 3369 3370 eor(v20, T16B, v20, v22); 3371 eor(v16, T16B, v16, v18); 3372 3373 uzp1(v17, T2D, v20, v16); 3374 uzp2(v21, T2D, v20, v16); 3375 eor(v28, T16B, v17, v21); 3376 3377 pmull(v22, T8H, v1, v5, T8B); 3378 pmull(v20, T8H, v1, 
v7, T8B); 3379 pmull(v23, T8H, v1, v4, T8B); 3380 pmull(v21, T8H, v1, v6, T8B); 3381 3382 pmull2(v18, T8H, v1, v5, T16B); 3383 pmull2(v16, T8H, v1, v7, T16B); 3384 pmull2(v19, T8H, v1, v4, T16B); 3385 pmull2(v17, T8H, v1, v6, T16B); 3386 3387 ld1(v0, v1, T2D, post(buf, 32)); 3388 3389 uzp1(v24, T8H, v20, v22); 3390 uzp2(v25, T8H, v20, v22); 3391 eor(v20, T16B, v24, v25); 3392 3393 uzp1(v26, T8H, v16, v18); 3394 uzp2(v27, T8H, v16, v18); 3395 eor(v16, T16B, v26, v27); 3396 3397 ushll2(v22, T4S, v20, T8H, 8); 3398 ushll(v20, T4S, v20, T4H, 8); 3399 3400 ushll2(v18, T4S, v16, T8H, 8); 3401 ushll(v16, T4S, v16, T4H, 8); 3402 3403 eor(v22, T16B, v23, v22); 3404 eor(v18, T16B, v19, v18); 3405 eor(v20, T16B, v21, v20); 3406 eor(v16, T16B, v17, v16); 3407 3408 uzp1(v17, T2D, v16, v20); 3409 uzp2(v21, T2D, v16, v20); 3410 eor(v16, T16B, v17, v21); 3411 3412 ushll2(v20, T2D, v16, T4S, 16); 3413 ushll(v16, T2D, v16, T2S, 16); 3414 3415 eor(v20, T16B, v22, v20); 3416 eor(v16, T16B, v16, v18); 3417 3418 uzp1(v17, T2D, v20, v16); 3419 uzp2(v21, T2D, v20, v16); 3420 eor(v20, T16B, v17, v21); 3421 3422 shl(v16, T2D, v28, 1); 3423 shl(v17, T2D, v20, 1); 3424 3425 eor(v0, T16B, v0, v16); 3426 eor(v1, T16B, v1, v17); 3427 3428 subs(len, len, 32); 3429 br(Assembler::GE, L_fold); 3430 3431 mov(crc, 0); 3432 mov(tmp, v0, T1D, 0); 3433 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3434 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3435 mov(tmp, v0, T1D, 1); 3436 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3437 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3438 mov(tmp, v1, T1D, 0); 3439 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3440 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3441 mov(tmp, v1, T1D, 1); 3442 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3443 update_word_crc32(crc, tmp, tmp2, 
table0, table1, table2, table3, true); 3444 3445 add(len, len, 32); 3446 } 3447 3448 BIND(L_by16); 3449 subs(len, len, 16); 3450 br(Assembler::GE, L_by16_loop); 3451 adds(len, len, 16-4); 3452 br(Assembler::GE, L_by4_loop); 3453 adds(len, len, 4); 3454 br(Assembler::GT, L_by1_loop); 3455 b(L_exit); 3456 3457 BIND(L_by4_loop); 3458 ldrw(tmp, Address(post(buf, 4))); 3459 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3); 3460 subs(len, len, 4); 3461 br(Assembler::GE, L_by4_loop); 3462 adds(len, len, 4); 3463 br(Assembler::LE, L_exit); 3464 BIND(L_by1_loop); 3465 subs(len, len, 1); 3466 ldrb(tmp, Address(post(buf, 1))); 3467 update_byte_crc32(crc, tmp, table0); 3468 br(Assembler::GT, L_by1_loop); 3469 b(L_exit); 3470 3471 align(CodeEntryAlignment); 3472 BIND(L_by16_loop); 3473 subs(len, len, 16); 3474 ldp(tmp, tmp3, Address(post(buf, 16))); 3475 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3476 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3477 update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false); 3478 update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true); 3479 br(Assembler::GE, L_by16_loop); 3480 adds(len, len, 16-4); 3481 br(Assembler::GE, L_by4_loop); 3482 adds(len, len, 4); 3483 br(Assembler::GT, L_by1_loop); 3484 BIND(L_exit); 3485 mvnw(crc, crc); 3486 } 3487 3488 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf, 3489 Register len, Register tmp0, Register tmp1, Register tmp2, 3490 Register tmp3) { 3491 Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit; 3492 assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3); 3493 3494 subs(len, len, 128); 3495 br(Assembler::GE, CRC_by64_pre); 3496 BIND(CRC_less64); 3497 adds(len, len, 128-32); 3498 br(Assembler::GE, CRC_by32_loop); 3499 BIND(CRC_less32); 3500 adds(len, len, 32-4); 3501 br(Assembler::GE, CRC_by4_loop); 
3502 adds(len, len, 4); 3503 br(Assembler::GT, CRC_by1_loop); 3504 b(L_exit); 3505 3506 BIND(CRC_by32_loop); 3507 ldp(tmp0, tmp1, Address(post(buf, 16))); 3508 subs(len, len, 32); 3509 crc32cx(crc, crc, tmp0); 3510 ldr(tmp2, Address(post(buf, 8))); 3511 crc32cx(crc, crc, tmp1); 3512 ldr(tmp3, Address(post(buf, 8))); 3513 crc32cx(crc, crc, tmp2); 3514 crc32cx(crc, crc, tmp3); 3515 br(Assembler::GE, CRC_by32_loop); 3516 cmn(len, 32); 3517 br(Assembler::NE, CRC_less32); 3518 b(L_exit); 3519 3520 BIND(CRC_by4_loop); 3521 ldrw(tmp0, Address(post(buf, 4))); 3522 subs(len, len, 4); 3523 crc32cw(crc, crc, tmp0); 3524 br(Assembler::GE, CRC_by4_loop); 3525 adds(len, len, 4); 3526 br(Assembler::LE, L_exit); 3527 BIND(CRC_by1_loop); 3528 ldrb(tmp0, Address(post(buf, 1))); 3529 subs(len, len, 1); 3530 crc32cb(crc, crc, tmp0); 3531 br(Assembler::GT, CRC_by1_loop); 3532 b(L_exit); 3533 3534 BIND(CRC_by64_pre); 3535 sub(buf, buf, 8); 3536 ldp(tmp0, tmp1, Address(buf, 8)); 3537 crc32cx(crc, crc, tmp0); 3538 ldr(tmp2, Address(buf, 24)); 3539 crc32cx(crc, crc, tmp1); 3540 ldr(tmp3, Address(buf, 32)); 3541 crc32cx(crc, crc, tmp2); 3542 ldr(tmp0, Address(buf, 40)); 3543 crc32cx(crc, crc, tmp3); 3544 ldr(tmp1, Address(buf, 48)); 3545 crc32cx(crc, crc, tmp0); 3546 ldr(tmp2, Address(buf, 56)); 3547 crc32cx(crc, crc, tmp1); 3548 ldr(tmp3, Address(pre(buf, 64))); 3549 3550 b(CRC_by64_loop); 3551 3552 align(CodeEntryAlignment); 3553 BIND(CRC_by64_loop); 3554 subs(len, len, 64); 3555 crc32cx(crc, crc, tmp2); 3556 ldr(tmp0, Address(buf, 8)); 3557 crc32cx(crc, crc, tmp3); 3558 ldr(tmp1, Address(buf, 16)); 3559 crc32cx(crc, crc, tmp0); 3560 ldr(tmp2, Address(buf, 24)); 3561 crc32cx(crc, crc, tmp1); 3562 ldr(tmp3, Address(buf, 32)); 3563 crc32cx(crc, crc, tmp2); 3564 ldr(tmp0, Address(buf, 40)); 3565 crc32cx(crc, crc, tmp3); 3566 ldr(tmp1, Address(buf, 48)); 3567 crc32cx(crc, crc, tmp0); 3568 ldr(tmp2, Address(buf, 56)); 3569 crc32cx(crc, crc, tmp1); 3570 ldr(tmp3, Address(pre(buf, 64))); 3571 
br(Assembler::GE, CRC_by64_loop); 3572 3573 // post-loop 3574 crc32cx(crc, crc, tmp2); 3575 crc32cx(crc, crc, tmp3); 3576 3577 sub(len, len, 64); 3578 add(buf, buf, 8); 3579 cmn(len, 128); 3580 br(Assembler::NE, CRC_less64); 3581 BIND(L_exit); 3582 } 3583 3584 /** 3585 * @param crc register containing existing CRC (32-bit) 3586 * @param buf register pointing to input byte buffer (byte*) 3587 * @param len register containing number of bytes 3588 * @param table register that will contain address of CRC table 3589 * @param tmp scratch register 3590 */ 3591 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len, 3592 Register table0, Register table1, Register table2, Register table3, 3593 Register tmp, Register tmp2, Register tmp3) { 3594 kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3); 3595 } 3596 3597 3598 SkipIfEqual::SkipIfEqual( 3599 MacroAssembler* masm, const bool* flag_addr, bool value) { 3600 _masm = masm; 3601 unsigned long offset; 3602 _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset); 3603 _masm->ldrb(rscratch1, Address(rscratch1, offset)); 3604 _masm->cbzw(rscratch1, _label); 3605 } 3606 3607 SkipIfEqual::~SkipIfEqual() { 3608 _masm->bind(_label); 3609 } 3610 3611 void MacroAssembler::addptr(const Address &dst, int32_t src) { 3612 Address adr; 3613 switch(dst.getMode()) { 3614 case Address::base_plus_offset: 3615 // This is the expected mode, although we allow all the other 3616 // forms below. 
3617 adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord); 3618 break; 3619 default: 3620 lea(rscratch2, dst); 3621 adr = Address(rscratch2); 3622 break; 3623 } 3624 ldr(rscratch1, adr); 3625 add(rscratch1, rscratch1, src); 3626 str(rscratch1, adr); 3627 } 3628 3629 void MacroAssembler::cmpptr(Register src1, Address src2) { 3630 unsigned long offset; 3631 adrp(rscratch1, src2, offset); 3632 ldr(rscratch1, Address(rscratch1, offset)); 3633 cmp(src1, rscratch1); 3634 } 3635 3636 void MacroAssembler::cmpoop(Register obj1, Register obj2) { 3637 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3638 bs->obj_equals(this, obj1, obj2); 3639 } 3640 3641 void MacroAssembler::load_klass(Register dst, Register src) { 3642 if (UseCompressedClassPointers) { 3643 ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes())); 3644 decode_klass_not_null(dst); 3645 } else { 3646 ldr(dst, Address(src, oopDesc::klass_offset_in_bytes())); 3647 } 3648 } 3649 3650 // ((OopHandle)result).resolve(); 3651 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) { 3652 // OopHandle::resolve is an indirection. 
3653 access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg); 3654 } 3655 3656 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) { 3657 const int mirror_offset = in_bytes(Klass::java_mirror_offset()); 3658 ldr(dst, Address(rmethod, Method::const_offset())); 3659 ldr(dst, Address(dst, ConstMethod::constants_offset())); 3660 ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes())); 3661 ldr(dst, Address(dst, mirror_offset)); 3662 resolve_oop_handle(dst, tmp); 3663 } 3664 3665 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) { 3666 if (UseCompressedClassPointers) { 3667 ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes())); 3668 if (Universe::narrow_klass_base() == NULL) { 3669 cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift()); 3670 return; 3671 } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0 3672 && Universe::narrow_klass_shift() == 0) { 3673 // Only the bottom 32 bits matter 3674 cmpw(trial_klass, tmp); 3675 return; 3676 } 3677 decode_klass_not_null(tmp); 3678 } else { 3679 ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes())); 3680 } 3681 cmp(trial_klass, tmp); 3682 } 3683 3684 void MacroAssembler::load_prototype_header(Register dst, Register src) { 3685 load_klass(dst, src); 3686 ldr(dst, Address(dst, Klass::prototype_header_offset())); 3687 } 3688 3689 void MacroAssembler::store_klass(Register dst, Register src) { 3690 // FIXME: Should this be a store release? concurrent gcs assumes 3691 // klass length is valid if klass field is not null. 
// With compressed class pointers src is encoded in place and stored as a
// 32-bit value; otherwise the full-width pointer is stored.
  if (UseCompressedClassPointers) {
    encode_klass_not_null(src);
    strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
  } else {
    str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
  }
}

// Store the 32-bit value in src into the klass gap of the object at dst.
// Emits nothing unless UseCompressedClassPointers is set.
void MacroAssembler::store_klass_gap(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    // Store to klass gap in destination
    strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
  }
}

// Algorithm must match CompressedOops::encode.
// Encode the (possibly null) oop in s into d.
void MacroAssembler::encode_heap_oop(Register d, Register s) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
#endif
  verify_oop(s, "broken oop in encode_heap_oop");
  if (Universe::narrow_oop_base() == NULL) {
    if (Universe::narrow_oop_shift() != 0) {
      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
      lsr(d, s, LogMinObjAlignmentInBytes);
    } else {
      mov(d, s);
    }
  } else {
    // d = s - rheapbase; if the subtraction borrowed (s is below rheapbase,
    // unsigned) the csel replaces the result with zero, so that case encodes
    // to 0 without a branch.
    subs(d, s, rheapbase);
    csel(d, d, zr, Assembler::HS);
    lsr(d, d, LogMinObjAlignmentInBytes);

    /*  Old algorithm: is this any worse?
    Label nonnull;
    cbnz(r, nonnull);
    sub(r, r, rheapbase);
    bind(nonnull);
    lsr(r, r, LogMinObjAlignmentInBytes);
    */
  }
}

// Encode the oop in r in place. No null handling is emitted; debug builds
// with CheckCompressedOops stop on a null input.
void MacroAssembler::encode_heap_oop_not_null(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    cbnz(r, ok);
    stop("null oop passed to encode_heap_oop_not_null");
    bind(ok);
  }
#endif
  verify_oop(r, "broken oop in encode_heap_oop_not_null");
  if (Universe::narrow_oop_base() != NULL) {
    sub(r, r, rheapbase);
  }
  if (Universe::narrow_oop_shift() != 0) {
    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    lsr(r, r, LogMinObjAlignmentInBytes);
  }
}

// Two-register form of encode_heap_oop_not_null: reads src, writes dst.
void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    cbnz(src, ok);
    stop("null oop passed to encode_heap_oop_not_null2");
    bind(ok);
  }
#endif
  verify_oop(src, "broken oop in encode_heap_oop_not_null2");

  // 'data' tracks which register currently holds the value being encoded.
  Register data = src;
  if (Universe::narrow_oop_base() != NULL) {
    sub(dst, src, rheapbase);
    data = dst;
  }
  if (Universe::narrow_oop_shift() != 0) {
    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    lsr(dst, data, LogMinObjAlignmentInBytes);
    data = dst;
  }
  // Neither step above was emitted, so a plain move is still required.
  if (data == src)
    mov(dst, src);
}

// Decode the (possibly zero) narrow oop in s into d.
void  MacroAssembler::decode_heap_oop(Register d, Register s) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
#endif
  if (Universe::narrow_oop_base() == NULL) {
    if (Universe::narrow_oop_shift() != 0 || d != s) {
      lsl(d, s, Universe::narrow_oop_shift());
    }
  } else {
    // A zero narrow oop is passed through unchanged (the add is skipped);
    // otherwise d = rheapbase + (s << LogMinObjAlignmentInBytes).
    Label done;
    if (d != s)
      mov(d, s);
    cbz(s, done);
    add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
    bind(done);
  }
  verify_oop(d, "broken oop in decode_heap_oop");
}

// Decode the narrow oop in r in place; no zero check is emitted.
void  MacroAssembler::decode_heap_oop_not_null(Register r) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (Universe::narrow_oop_shift() != 0) {
    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    if (Universe::narrow_oop_base() != NULL) {
      add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
    } else {
      add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
    }
  } else {
    assert (Universe::narrow_oop_base() == NULL, "sanity");
  }
}

// Two-register form of decode_heap_oop_not_null: reads src, writes dst.
void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (Universe::narrow_oop_shift() != 0) {
    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    if (Universe::narrow_oop_base() != NULL) {
      add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
    } else {
      add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
    }
  } else {
    assert (Universe::narrow_oop_base() == NULL, "sanity");
    if (dst != src) {
      mov(dst, src);
    }
  }
}

// Encode the klass pointer in src into dst. The sequence emitted depends on
// the narrow klass base and shift: shift only, XOR with the base, a 32-bit
// move, or the general subtract-and-shift form.
void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
  if (Universe::narrow_klass_base() == NULL) {
    if (Universe::narrow_klass_shift() != 0) {
      assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
      lsr(dst, src, LogKlassAlignmentInBytes);
    } else {
      if (dst != src) mov(dst, src);
    }
    return;
  }

  if (use_XOR_for_compressed_class_base) {
    if (Universe::narrow_klass_shift() != 0) {
      eor(dst, src, (uint64_t)Universe::narrow_klass_base());
      lsr(dst, dst, LogKlassAlignmentInBytes);
    } else {
      eor(dst, src, (uint64_t)Universe::narrow_klass_base());
    }
    return;
  }

  // Base has no low 32 bits set and there is no shift: the narrow klass is
  // just the low word of the pointer.
  if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
      && Universe::narrow_klass_shift() == 0) {
    movw(dst, src);
    return;
  }

#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
#endif

  // General case. dst doubles as the scratch register for the base unless
  // dst == src, in which case rheapbase is borrowed and restored afterwards
  // with reinit_heapbase().
  Register rbase = dst;
  if (dst == src) rbase = rheapbase;
  mov(rbase, (uint64_t)Universe::narrow_klass_base());
  sub(dst, src, rbase);
  if (Universe::narrow_klass_shift() != 0) {
    assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
    lsr(dst, dst, LogKlassAlignmentInBytes);
  }
  if (dst == src) reinit_heapbase();
}

// In-place form: forwards to the two-register version with dst == src.
void MacroAssembler::encode_klass_not_null(Register r) {
  encode_klass_not_null(r, r);
}

// Decode the narrow klass in src into dst. Mirrors the case analysis of
// encode_klass_not_null(dst, src).
void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
  Register rbase = dst;
  assert (UseCompressedClassPointers, "should only be used for compressed headers");

  if (Universe::narrow_klass_base() == NULL) {
    if (Universe::narrow_klass_shift() != 0) {
      assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
      lsl(dst, src, LogKlassAlignmentInBytes);
    } else {
      if (dst != src) mov(dst, src);
    }
    return;
  }

  if (use_XOR_for_compressed_class_base) {
    if (Universe::narrow_klass_shift() != 0) {
      lsl(dst, src, LogKlassAlignmentInBytes);
      eor(dst, dst, (uint64_t)Universe::narrow_klass_base());
    } else {
      eor(dst, src, (uint64_t)Universe::narrow_klass_base());
    }
    return;
  }

  // Base has no low 32 bits set and there is no shift: keep the low word and
  // insert bits 32 and up of the base with a single movk.
  if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
      && Universe::narrow_klass_shift() == 0) {
    if (dst != src)
      movw(dst, src);
    movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32);
    return;
  }

  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  // rheapbase is borrowed as the base register when dst == src and restored
  // afterwards with reinit_heapbase().
  if (dst == src) rbase = rheapbase;
  mov(rbase, (uint64_t)Universe::narrow_klass_base());
  if (Universe::narrow_klass_shift() != 0) {
    assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
    add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
  } else {
    add(dst, rbase, src);
  }
  if (dst == src) reinit_heapbase();
}

// In-place form: forwards to the two-register version with dst == src.
void  MacroAssembler::decode_klass_not_null(Register r) {
  decode_klass_not_null(r, r);
}

// Emit a movz/movk pair for a narrow oop constant, recording an oop
// relocation at the first instruction.
void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
#ifdef ASSERT
  {
    ThreadInVMfromUnknown tiv;
    assert (UseCompressedOops, "should only be used for compressed oops");
    assert (Universe::heap() != NULL, "java heap should be initialized");
    assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
    assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
  }
#endif
  int oop_index = oop_recorder()->find_index(obj);
  InstructionMark im(this);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  code_section()->relocate(inst_mark(), rspec);
  // 0xDEADBEEF is only a placeholder immediate.
  // NOTE(review): presumably the real narrow oop is patched in later via the
  // relocation recorded above -- confirm against the relocation code.
  movz(dst, 0xDEAD, 16);
  movk(dst, 0xBEEF);
}

// Emit a movz/movk pair that loads the encoded (narrow) form of klass k,
// recording a metadata relocation at the first instruction.
void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int index = oop_recorder()->find_index(k);
  assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");

  InstructionMark im(this);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  code_section()->relocate(inst_mark(), rspec);
  narrowKlass nk = Klass::encode_klass(k);
  // Upper 16 bits first (movz), then the lower 16 bits (movk).
  movz(dst, (nk >> 16), 16);
  movk(dst, nk & 0xffff);
}

// Emit a load of the given type through the current barrier set assembler.
// With AS_RAW set, the BarrierSetAssembler base implementation is called
// explicitly instead of the barrier set's own load_at.
void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
                                    Register dst, Address src,
                                    Register tmp1, Register thread_tmp) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  decorators = AccessInternal::decorator_fixup(decorators);
  bool as_raw = (decorators & AS_RAW) != 0;
  if (as_raw) {
    bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  } else {
    bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  }
}

// Store counterpart of access_load_at, with the same AS_RAW handling.
void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
                                     Address dst, Register src,
                                     Register tmp1, Register thread_tmp) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  decorators = AccessInternal::decorator_fixup(decorators);
  bool as_raw = (decorators & AS_RAW) != 0;
  if (as_raw) {
    bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  } else {
    bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  }
}

// Forward to the barrier set assembler's resolve() for obj.
void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
  // Use stronger ACCESS_WRITE|ACCESS_READ by default.
  if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
    decorators |= ACCESS_READ | ACCESS_WRITE;
  }
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  return bs->resolve(this, decorators, obj);
}

// Load a T_OBJECT value from src with IN_HEAP added to the decorators.
void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
                                   Register thread_tmp, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
}

// As load_heap_oop, additionally passing IS_NOT_NULL.
void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
                                            Register thread_tmp, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
}

// Store a T_OBJECT value to dst with IN_HEAP added to the decorators.
void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
                                    Register thread_tmp, DecoratorSet decorators) {
  access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
}

// Used for storing NULLs.
// noreg is passed as the source and for both temporaries.
void MacroAssembler::store_heap_oop_null(Address dst) {
  access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
}

// Allocate a metadata index for obj and return an Address for it that
// carries the matching metadata relocation.
Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->allocate_metadata_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return Address((address)obj, rspec);
}

// Move an oop into a register.  immediate is true if we want
// immediate instructions, i.e. we are not going to patch this
// instruction while the code is being executed by another thread.  In
// that case we can use move immediates rather than the constant pool.
4033 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) { 4034 int oop_index; 4035 if (obj == NULL) { 4036 oop_index = oop_recorder()->allocate_oop_index(obj); 4037 } else { 4038 #ifdef ASSERT 4039 { 4040 ThreadInVMfromUnknown tiv; 4041 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop"); 4042 } 4043 #endif 4044 oop_index = oop_recorder()->find_index(obj); 4045 } 4046 RelocationHolder rspec = oop_Relocation::spec(oop_index); 4047 if (! immediate) { 4048 address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address 4049 ldr_constant(dst, Address(dummy, rspec)); 4050 } else 4051 mov(dst, Address((address)obj, rspec)); 4052 } 4053 4054 // Move a metadata address into a register. 4055 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) { 4056 int oop_index; 4057 if (obj == NULL) { 4058 oop_index = oop_recorder()->allocate_metadata_index(obj); 4059 } else { 4060 oop_index = oop_recorder()->find_index(obj); 4061 } 4062 RelocationHolder rspec = metadata_Relocation::spec(oop_index); 4063 mov(dst, Address((address)obj, rspec)); 4064 } 4065 4066 Address MacroAssembler::constant_oop_address(jobject obj) { 4067 #ifdef ASSERT 4068 { 4069 ThreadInVMfromUnknown tiv; 4070 assert(oop_recorder() != NULL, "this assembler needs an OopRecorder"); 4071 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop"); 4072 } 4073 #endif 4074 int oop_index = oop_recorder()->find_index(obj); 4075 return Address((address)obj, oop_Relocation::spec(oop_index)); 4076 } 4077 4078 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes. 
void MacroAssembler::tlab_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Register t2,
                                   Label& slow_case) {
  // Delegates entirely to the current barrier set assembler.
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
}

// Defines obj, preserves var_size_in_bytes
void MacroAssembler::eden_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Label& slow_case) {
  // Delegates entirely to the current barrier set assembler.
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
}

// Zero words; len is in bytes
// Destroys all registers except addr
// len must be a nonzero multiple of wordSize
// (the registers written here are len, t1, rscratch1 and rscratch2)
void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
  assert_different_registers(addr, len, t1, rscratch1, rscratch2);

#ifdef ASSERT
  { Label L;
    tst(len, BytesPerWord - 1);
    br(Assembler::EQ, L);
    stop("len is not a multiple of BytesPerWord");
    bind(L);
  }
#endif

#ifndef PRODUCT
  block_comment("zero memory");
#endif

  Label loop;
  Label entry;

//  Algorithm:
//
//    scratch1 = cnt & 7;
//    cnt -= scratch1;
//    p += scratch1;
//    switch (scratch1) {
//      do {
//        cnt -= 8;
//          p[-8] = 0;
//        case 7:
//          p[-7] = 0;
//        case 6:
//          p[-6] = 0;
//          // ...
//        case 1:
//          p[-1] = 0;
//        case 0:
//          p += 8;
//      } while (cnt);
//    }

  const int unroll = 8; // Number of str(zr) instructions we'll unroll

  lsr(len, len, LogBytesPerWord);    // len now counts words, not bytes
  andr(rscratch1, len, unroll - 1);  // rscratch1 = cnt % unroll
  sub(len, len, rscratch1);      // cnt -= cnt % unroll
  // t1 always points to the end of the region we're about to zero
  add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
  // Jump into the unrolled stores: step back from 'entry' by rscratch1
  // instructions (4 bytes each, hence LSL 2) so that exactly that many
  // stores run on the first pass.
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
  br(rscratch2);
  bind(loop);
  sub(len, len, unroll);
  // Must stay exactly one instruction per iteration: the computed branch
  // above depends on it.
  for (int i = -unroll; i < 0; i++)
    Assembler::str(zr, Address(t1, i * wordSize));
  bind(entry);
  add(t1, t1, unroll * wordSize);
  cbnz(len, loop);
}

// Debug-only check (ASSERT builds with UseTLAB && VerifyOops) that the
// current thread's TLAB satisfies start <= top <= end. rscratch1 and
// rscratch2 are saved on the stack and restored around the check.
void MacroAssembler::verify_tlab() {
#ifdef ASSERT
  if (UseTLAB && VerifyOops) {
    Label next, ok;

    stp(rscratch2, rscratch1, Address(pre(sp, -16)));

    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
    cmp(rscratch2, rscratch1);
    br(Assembler::HS, next);
    STOP("assert(top >= start)");
    should_not_reach_here();

    bind(next);
    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
    cmp(rscratch2, rscratch1);
    br(Assembler::HS, ok);
    STOP("assert(top <= end)");
    should_not_reach_here();

    bind(ok);
    ldp(rscratch2, rscratch1, Address(post(sp, 16)));
  }
#endif
}

// Writes to stack successive pages until offset reached to check for
// stack overflow + shadow pages.  This clobbers tmp.
4191 void MacroAssembler::bang_stack_size(Register size, Register tmp) { 4192 assert_different_registers(tmp, size, rscratch1); 4193 mov(tmp, sp); 4194 // Bang stack for total size given plus shadow page size. 4195 // Bang one page at a time because large size can bang beyond yellow and 4196 // red zones. 4197 Label loop; 4198 mov(rscratch1, os::vm_page_size()); 4199 bind(loop); 4200 lea(tmp, Address(tmp, -os::vm_page_size())); 4201 subsw(size, size, rscratch1); 4202 str(size, Address(tmp)); 4203 br(Assembler::GT, loop); 4204 4205 // Bang down shadow pages too. 4206 // At this point, (tmp-0) is the last address touched, so don't 4207 // touch it again. (It was touched as (tmp-pagesize) but then tmp 4208 // was post-decremented.) Skip this address by starting at i=1, and 4209 // touch a few more pages below. N.B. It is important to touch all 4210 // the way down to and including i=StackShadowPages. 4211 for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) { 4212 // this could be any sized move but this is can be a debugging crumb 4213 // so the bigger the better. 4214 lea(tmp, Address(tmp, -os::vm_page_size())); 4215 str(size, Address(tmp)); 4216 } 4217 } 4218 4219 4220 // Move the address of the polling page into dest. 4221 void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) { 4222 if (SafepointMechanism::uses_thread_local_poll()) { 4223 ldr(dest, Address(rthread, Thread::polling_page_offset())); 4224 } else { 4225 unsigned long off; 4226 adrp(dest, Address(page, rtype), off); 4227 assert(off == 0, "polling page must be page aligned"); 4228 } 4229 } 4230 4231 // Move the address of the polling page into r, then read the polling 4232 // page. 4233 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) { 4234 get_polling_page(r, page, rtype); 4235 return read_polling_page(r, rtype); 4236 } 4237 4238 // Read the polling page. 
// The address of the polling page must
// already be in r.
// Emits a 32-bit load from [r] into zr with a relocation of type rtype
// attached, and returns the marked instruction address.
address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
  InstructionMark im(this);
  code_section()->relocate(inst_mark(), rtype);
  ldrw(zr, Address(r, 0));
  return inst_mark();
}

// Load the 4K page address of dest into reg1 and return the offset within
// that page (low 12 bits of the target) through byte_offset.
void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
  relocInfo::relocType rtype = dest.rspec().reloc()->type();
  unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
  unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
  unsigned long dest_page = (unsigned long)dest.target() >> 12;
  long offset_low = dest_page - low_page;
  long offset_high = dest_page - high_page;

  assert(is_valid_AArch64_address(dest.target()), "bad address");
  assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");

  InstructionMark im(this);
  code_section()->relocate(inst_mark(), dest.rspec());
  // 8143067: Ensure that the adrp can reach the dest from anywhere within
  // the code cache so that if it is relocated we know it will still reach
  if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
    _adrp(reg1, dest.target());
  } else {
    // Out of range: adrp to an address that has the target's low 32 bits and
    // the current pc's bits 32..47, then overwrite bits 32..47 with the
    // target's using movk.
    unsigned long target = (unsigned long)dest.target();
    unsigned long adrp_target
      = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);

    _adrp(reg1, (address)adrp_target);
    movk(reg1, target >> 32, 32);
  }
  byte_offset = (unsigned long)dest.target() & 0xfff;
}

// Load the card table's byte_map_base into reg, using adrp (+ add) when the
// value passes is_valid_AArch64_address and a plain mov otherwise.
void MacroAssembler::load_byte_map_base(Register reg) {
  jbyte *byte_map_base =
    ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();

  if (is_valid_AArch64_address((address)byte_map_base)) {
    // Strictly speaking the byte_map_base isn't an address at all,
    // and it might even be negative.
    unsigned long offset;
    adrp(reg, ExternalAddress((address)byte_map_base), offset);
    // We expect offset to be zero with most collectors.
    if (offset != 0) {
      add(reg, reg, offset);
    }
  } else {
    mov(reg, (uint64_t)byte_map_base);
  }
}

// Allocate a frame of framesize bytes and save rfp/lr in its top two words.
// Three sequences are used depending on framesize; remove_frame uses the
// same thresholds.
void MacroAssembler::build_frame(int framesize) {
  assert(framesize > 0, "framesize must be > 0");
  if (framesize < ((1 << 9) + 2 * wordSize)) {
    sub(sp, sp, framesize);
    stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
    if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
  } else {
    stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
    if (PreserveFramePointer) mov(rfp, sp);
    if (framesize < ((1 << 12) + 2 * wordSize))
      sub(sp, sp, framesize - 2 * wordSize);
    else {
      // Materialize the size in rscratch1 for the largest frames.
      mov(rscratch1, framesize - 2 * wordSize);
      sub(sp, sp, rscratch1);
    }
  }
}

// Undo build_frame(framesize): restore rfp/lr and release the frame.
void MacroAssembler::remove_frame(int framesize) {
  assert(framesize > 0, "framesize must be > 0");
  if (framesize < ((1 << 9) + 2 * wordSize)) {
    ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
    add(sp, sp, framesize);
  } else {
    if (framesize < ((1 << 12) + 2 * wordSize))
      add(sp, sp, framesize - 2 * wordSize);
    else {
      mov(rscratch1, framesize - 2 * wordSize);
      add(sp, sp, rscratch1);
    }
    ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
  }
}

// Pointer to a MacroAssembler load instruction emitter; used below to pick
// the load width per string encoding.
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

// Search for str1 in str2 and return index or -1
//
// str2/cnt2 are the source string and its length, str1/cnt1 the pattern.
// ae is a StrIntrinsicNode encoding (LL, UL, LU or UU) and selects 1- or
// 2-byte characters for each string. icnt1 == -1 enables the variable-length
// paths (Boyer-Moore-Horspool, stub call, generic linear scan); icnt1 values
// of 1..4 emit a specialised scan for that pattern length.
// NOTE(review): presumably icnt1 is the pattern length when it is a
// compile-time constant -- confirm against the caller.
void MacroAssembler::string_indexof(Register str2, Register str1,
                                    Register cnt2, Register cnt1,
                                    Register tmp1, Register tmp2,
                                    Register tmp3, Register tmp4,
                                    Register tmp5, Register tmp6,
                                    int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                         (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                         (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, 8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
    cmp(cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

// The Boyer Moore algorithm is based on the description here:-
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with 2 shift rules. The 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:-
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has few java-specific optimizations.
//
// #define ASIZE 256
//
//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
//       int i, j;
//       unsigned c;
//       unsigned char bc[ASIZE];
//
//       /* Preprocessing */
//       for (i = 0; i < ASIZE; ++i)
//          bc[i] = m;
//       for (i = 0; i < m - 1; ) {
//          c = x[i];
//          ++i;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef PATTERN_STRING_IS_LATIN1
//          bc[c] = m - i;
//          #else
//          if (c < ASIZE) bc[c] = m - i;
//          #endif
//       }
//
//       /* Searching */
//       j = 0;
//       while (j <= n - m) {
//          c = y[i+j];
//          if (x[m-1] == c)
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//          if (i < 0) return j;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c< 256) always true. Remove branch
//          j += bc[y[j+m-1]];
//          #endif
//          #ifndef PATTERN_STRING_IS_UTF
//          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += 1
//          #endif
//          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
//          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += m
//          #endif
//       }
//    }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >=8, so, we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
    // UL case. We'll re-read last character in inner pre-loop code to have
    // single outer pre-loop load
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // amount of bytes stored per instruction
    // The bad-character table lives in ASIZE bytes carved out of the stack;
    // every exit from this section adds ASIZE back to sp.
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    // Fill the table with cnt1 (v0 was broadcast from cnt1 above).
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);

      sub(cnt1tmp, cnt1, 1);
      mov(tmp5, str2);       // remember the original str2 for the result
      add(str2end, str2, result_tmp, LSL, str2_chr_shift);
      sub(ch2, cnt1, 1);
      mov(tmp3, str1);
    // Record the skip distance for each pattern character except the last.
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        cmp(ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);

      add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
      if (str1_isL == str2_isL) {
        // load last 8 bytes (8LL/4UU symbols)
        ldr(tmp6, Address(tmp6, -wordSize));
      } else {
        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
        // convert Latin1 to UTF. We'll have to wait until load completed, but
        // it's still faster than per-character loads+checks
        lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
        ubfx(ch1, tmp6, 8, 8); // str1[N-2]
        ubfx(ch2, tmp6, 16, 8); // str1[N-3]
        andr(tmp6, tmp6, 0xFF); // str1[N-4]
        orr(ch2, ch1, ch2, LSL, 16);
        orr(tmp6, tmp6, tmp3, LSL, 48);
        orr(tmp6, tmp6, ch2, LSL, 16);
      }
    BIND(BMLOOPSTR2);
      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
      if (str1_isL == str2_isL) {
        // re-init tmp3. It's for free because it's executed in parallel with
        // load above. Alternative is to initialize it before loop, but it'll
        // affect performance on in-order systems with 2 or more ld/st pipelines
        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
      }
      if (!isL) { // UU/UL case
        lsl(ch2, cnt1tmp, 1); // offset in bytes
      }
      cmp(tmp3, skipch);
      br(NE, BMSKIP);
      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
      mov(ch1, tmp6);
      if (isL) {
        b(BMLOOPSTR1_AFTER_LOAD);
      } else {
        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
        b(BMLOOPSTR1_CMP);
      }
    BIND(BMLOOPSTR1);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
      cmp(ch1, ch2);
      br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
      if (!isL) {
        // if we've met UTF symbol while searching Latin1 pattern, then we can
        // skip cnt1 symbols
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
        cmp(skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
      add(str2, str2, result_tmp, LSL, str2_chr_shift);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      add(sp, sp, ASIZE);
      b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
      cmp(ch1, ch2);
      br(NE, BMSKIP);
    BIND(BMMATCH);
      // Index = (current str2 - original str2), converted from bytes to
      // characters for a 2-byte source.
      sub(result, str2, tmp5);
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE);
      b(DONE);

    BIND(LINEARSTUB);
    cmp(cnt1, 16); // small patterns still should be handled by simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = NULL;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
    }
    trampoline_call(stub);
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    // The linear scans below point str2 (and str1) past the region to scan
    // and walk a negative byte offset (cnt2_neg / cnt1_neg) up towards zero.
    if (icnt1 == -1)
    {
        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

        cmp(cnt1, str1_isL == str2_isL ? 4 : 2);
        br(LT, DOSHORT);
      BIND(LINEAR_MEDIUM);
        (this->*str1_load_1chr)(first, Address(str1));
        lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
        sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(FIRST_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmp(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        adds(cnt1tmp, cnt1_neg, str1_chr_size);
        add(cnt2tmp, cnt2_neg, str2_chr_size);
        br(GE, MATCH);

      BIND(STR1_NEXT);
        (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        adds(cnt1tmp, cnt1tmp, str1_chr_size);
        add(cnt2tmp, cnt2tmp, str2_chr_size);
        br(LT, STR1_NEXT);
        b(MATCH);

      BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, 2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      Label CH1_LOOP;

        (this->*load_4chr)(ch1, str1);
        sub(result_tmp, cnt2, 4);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(CH1_LOOP);
        (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
      }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      Label CH1_LOOP;

      BIND(DO2);
        (this->*load_2chr)(ch1, str1);
        if (icnt1 == 2) {
          sub(result_tmp, cnt2, 2);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(CH1_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
        (this->*load_2chr)(first, str1);
        (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
        if (icnt1 == 3) {
          sub(result_tmp, cnt2, 3);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(FIRST_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmpw(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
        (this->*str1_load_1chr)(ch1, str1);
        cmp(cnt2, 8);
        br(LT, DO1_SHORT);

        sub(result_tmp, cnt2, 8/str2_chr_size);
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
        mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

        // Replicate the pattern character into every lane of ch1.
        if (str2_isL) {
          orr(ch1, ch1, ch1, LSL, 8);
        }
        orr(ch1, ch1, ch1, LSL, 16);
        orr(ch1, ch1, ch1, LSL, 32);
      BIND(CH1_LOOP);
        // Compare 8 bytes at once: after the eor a matching lane is zero, and
        // (x - ones) & ~(x | 0x7f..) is non-zero iff some lane of x is zero.
        ldr(ch2, Address(str2, cnt2_neg));
        eor(ch2, ch1, ch2);
        sub(tmp1, ch2, tmp3);
        orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
        bics(tmp1, tmp1, tmp2);
        br(NE, HAS_ZERO);
        adds(cnt2_neg, cnt2_neg, 8);
        br(LT, CH1_LOOP);

        // One more pass over the final 8 bytes unless they were just covered.
        cmp(cnt2_neg, 8);
        mov(cnt2_neg, 0);
        br(LT, CH1_LOOP);
        b(NOMATCH);

      BIND(HAS_ZERO);
        // Byte-reverse then count leading zeros to find the lowest flagged
        // lane; >> 3 converts the bit position to a byte offset.
        rev(tmp1, tmp1);
        clz(tmp1, tmp1);
        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
        b(MATCH);

      BIND(DO1_SHORT);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmpw(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // result_tmp holds the scan length in characters, cnt2_neg the (negative)
    // byte offset of the match; ASR converts the offset to characters.
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

// Find the first occurrence of the 16-bit character ch in the cnt1
// characters at str1; result is its index or -1. Strings of 4 or more
// characters are scanned 8 bytes at a time, shorter ones one character at a
// time. ch, cnt1 and str1 are modified.
void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                         Register ch, Register result,
                                         Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cmp(cnt1, 4);
  br(LT, DO1_SHORT);

  // Replicate ch into all four 16-bit lanes.
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
    // After the eor a matching lane is zero; the sub/orr/bics sequence
    // leaves tmp1 non-zero iff some 16-bit lane is zero.
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // One more pass over the final 8 bytes unless they were just covered.
    cmp(cnt1_neg, 8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    // Byte-reverse then count leading zeros to find the lowest flagged lane;
    // >> 3 converts the bit position to a byte offset.
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // Convert the negative byte offset to characters and add it to the scan
    // length held in result_tmp.
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}

// Compare strings.
void MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  const int STUB_THRESHOLD = 64 + 8;
  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ?
wordSize : wordSize/2; 4846 4847 FloatRegister vtmpZ = vtmp1, vtmp = vtmp2; 4848 chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb : 4849 (chr_insn)&MacroAssembler::ldrh; 4850 chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb : 4851 (chr_insn)&MacroAssembler::ldrh; 4852 uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw : 4853 (uxt_insn)&MacroAssembler::uxthw; 4854 4855 BLOCK_COMMENT("string_compare {"); 4856 4857 // Bizzarely, the counts are passed in bytes, regardless of whether they 4858 // are L or U strings, however the result is always in characters. 4859 if (!str1_isL) asrw(cnt1, cnt1, 1); 4860 if (!str2_isL) asrw(cnt2, cnt2, 1); 4861 4862 // Compute the minimum of the string lengths and save the difference. 4863 subsw(result, cnt1, cnt2); 4864 cselw(cnt2, cnt1, cnt2, Assembler::LE); // min 4865 4866 // A very short string 4867 cmpw(cnt2, minCharsInWord); 4868 br(Assembler::LT, SHORT_STRING); 4869 4870 // Compare longwords 4871 // load first parts of strings and finish initialization while loading 4872 { 4873 if (str1_isL == str2_isL) { // LL or UU 4874 ldr(tmp1, Address(str1)); 4875 cmp(str1, str2); 4876 br(Assembler::EQ, DONE); 4877 ldr(tmp2, Address(str2)); 4878 cmp(cnt2, STUB_THRESHOLD); 4879 br(GE, STUB); 4880 subsw(cnt2, cnt2, minCharsInWord); 4881 br(EQ, TAIL_CHECK); 4882 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 4883 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 4884 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 4885 } else if (isLU) { 4886 ldrs(vtmp, Address(str1)); 4887 cmp(str1, str2); 4888 br(Assembler::EQ, DONE); 4889 ldr(tmp2, Address(str2)); 4890 cmp(cnt2, STUB_THRESHOLD); 4891 br(GE, STUB); 4892 subsw(cnt2, cnt2, 4); 4893 br(EQ, TAIL_CHECK); 4894 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 4895 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 4896 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 4897 zip1(vtmp, T8B, vtmp, vtmpZ); 4898 sub(cnt1, zr, 
cnt2, LSL, str1_chr_shift); 4899 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 4900 add(cnt1, cnt1, 4); 4901 fmovd(tmp1, vtmp); 4902 } else { // UL case 4903 ldr(tmp1, Address(str1)); 4904 cmp(str1, str2); 4905 br(Assembler::EQ, DONE); 4906 ldrs(vtmp, Address(str2)); 4907 cmp(cnt2, STUB_THRESHOLD); 4908 br(GE, STUB); 4909 subsw(cnt2, cnt2, 4); 4910 br(EQ, TAIL_CHECK); 4911 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 4912 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 4913 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 4914 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 4915 zip1(vtmp, T8B, vtmp, vtmpZ); 4916 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 4917 add(cnt1, cnt1, 8); 4918 fmovd(tmp2, vtmp); 4919 } 4920 adds(cnt2, cnt2, isUL ? 4 : 8); 4921 br(GE, TAIL); 4922 eor(rscratch2, tmp1, tmp2); 4923 cbnz(rscratch2, DIFFERENCE); 4924 // main loop 4925 bind(NEXT_WORD); 4926 if (str1_isL == str2_isL) { 4927 ldr(tmp1, Address(str1, cnt2)); 4928 ldr(tmp2, Address(str2, cnt2)); 4929 adds(cnt2, cnt2, 8); 4930 } else if (isLU) { 4931 ldrs(vtmp, Address(str1, cnt1)); 4932 ldr(tmp2, Address(str2, cnt2)); 4933 add(cnt1, cnt1, 4); 4934 zip1(vtmp, T8B, vtmp, vtmpZ); 4935 fmovd(tmp1, vtmp); 4936 adds(cnt2, cnt2, 8); 4937 } else { // UL 4938 ldrs(vtmp, Address(str2, cnt2)); 4939 ldr(tmp1, Address(str1, cnt1)); 4940 zip1(vtmp, T8B, vtmp, vtmpZ); 4941 add(cnt1, cnt1, 8); 4942 fmovd(tmp2, vtmp); 4943 adds(cnt2, cnt2, 4); 4944 } 4945 br(GE, TAIL); 4946 4947 eor(rscratch2, tmp1, tmp2); 4948 cbz(rscratch2, NEXT_WORD); 4949 b(DIFFERENCE); 4950 bind(TAIL); 4951 eor(rscratch2, tmp1, tmp2); 4952 cbnz(rscratch2, DIFFERENCE); 4953 // Last longword. In the case where length == 4 we compare the 4954 // same longword twice, but that's still faster than another 4955 // conditional branch. 
4956 if (str1_isL == str2_isL) { 4957 ldr(tmp1, Address(str1)); 4958 ldr(tmp2, Address(str2)); 4959 } else if (isLU) { 4960 ldrs(vtmp, Address(str1)); 4961 ldr(tmp2, Address(str2)); 4962 zip1(vtmp, T8B, vtmp, vtmpZ); 4963 fmovd(tmp1, vtmp); 4964 } else { // UL 4965 ldrs(vtmp, Address(str2)); 4966 ldr(tmp1, Address(str1)); 4967 zip1(vtmp, T8B, vtmp, vtmpZ); 4968 fmovd(tmp2, vtmp); 4969 } 4970 bind(TAIL_CHECK); 4971 eor(rscratch2, tmp1, tmp2); 4972 cbz(rscratch2, DONE); 4973 4974 // Find the first different characters in the longwords and 4975 // compute their difference. 4976 bind(DIFFERENCE); 4977 rev(rscratch2, rscratch2); 4978 clz(rscratch2, rscratch2); 4979 andr(rscratch2, rscratch2, isLL ? -8 : -16); 4980 lsrv(tmp1, tmp1, rscratch2); 4981 (this->*ext_chr)(tmp1, tmp1); 4982 lsrv(tmp2, tmp2, rscratch2); 4983 (this->*ext_chr)(tmp2, tmp2); 4984 subw(result, tmp1, tmp2); 4985 b(DONE); 4986 } 4987 4988 bind(STUB); 4989 RuntimeAddress stub = NULL; 4990 switch(ae) { 4991 case StrIntrinsicNode::LL: 4992 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL()); 4993 break; 4994 case StrIntrinsicNode::UU: 4995 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU()); 4996 break; 4997 case StrIntrinsicNode::LU: 4998 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU()); 4999 break; 5000 case StrIntrinsicNode::UL: 5001 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL()); 5002 break; 5003 default: 5004 ShouldNotReachHere(); 5005 } 5006 assert(stub.target() != NULL, "compare_long_string stub has not been generated"); 5007 trampoline_call(stub); 5008 b(DONE); 5009 5010 bind(SHORT_STRING); 5011 // Is the minimum length zero? 
5012 cbz(cnt2, DONE); 5013 // arrange code to do most branches while loading and loading next characters 5014 // while comparing previous 5015 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 5016 subs(cnt2, cnt2, 1); 5017 br(EQ, SHORT_LAST_INIT); 5018 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 5019 b(SHORT_LOOP_START); 5020 bind(SHORT_LOOP); 5021 subs(cnt2, cnt2, 1); 5022 br(EQ, SHORT_LAST); 5023 bind(SHORT_LOOP_START); 5024 (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size))); 5025 (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size))); 5026 cmp(tmp1, cnt1); 5027 br(NE, SHORT_LOOP_TAIL); 5028 subs(cnt2, cnt2, 1); 5029 br(EQ, SHORT_LAST2); 5030 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 5031 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 5032 cmp(tmp2, rscratch1); 5033 br(EQ, SHORT_LOOP); 5034 sub(result, tmp2, rscratch1); 5035 b(DONE); 5036 bind(SHORT_LOOP_TAIL); 5037 sub(result, tmp1, cnt1); 5038 b(DONE); 5039 bind(SHORT_LAST2); 5040 cmp(tmp2, rscratch1); 5041 br(EQ, DONE); 5042 sub(result, tmp2, rscratch1); 5043 5044 b(DONE); 5045 bind(SHORT_LAST_INIT); 5046 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 5047 bind(SHORT_LAST); 5048 cmp(tmp1, cnt1); 5049 br(EQ, DONE); 5050 sub(result, tmp1, cnt1); 5051 5052 bind(DONE); 5053 5054 BLOCK_COMMENT("} string_compare"); 5055 } 5056 5057 // This method checks if provided byte array contains byte with highest bit set. 5058 void MacroAssembler::has_negatives(Register ary1, Register len, Register result) { 5059 // Simple and most common case of aligned small array which is not at the 5060 // end of memory page is placed here. All other cases are in stub. 
5061 Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE; 5062 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 5063 assert_different_registers(ary1, len, result); 5064 5065 cmpw(len, 0); 5066 br(LE, SET_RESULT); 5067 cmpw(len, 4 * wordSize); 5068 br(GE, STUB_LONG); // size > 32 then go to stub 5069 5070 int shift = 64 - exact_log2(os::vm_page_size()); 5071 lsl(rscratch1, ary1, shift); 5072 mov(rscratch2, (size_t)(4 * wordSize) << shift); 5073 adds(rscratch2, rscratch1, rscratch2); // At end of page? 5074 br(CS, STUB); // at the end of page then go to stub 5075 subs(len, len, wordSize); 5076 br(LT, END); 5077 5078 BIND(LOOP); 5079 ldr(rscratch1, Address(post(ary1, wordSize))); 5080 tst(rscratch1, UPPER_BIT_MASK); 5081 br(NE, SET_RESULT); 5082 subs(len, len, wordSize); 5083 br(GE, LOOP); 5084 cmpw(len, -wordSize); 5085 br(EQ, SET_RESULT); 5086 5087 BIND(END); 5088 ldr(result, Address(ary1)); 5089 sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes 5090 lslv(result, result, len); 5091 tst(result, UPPER_BIT_MASK); 5092 b(SET_RESULT); 5093 5094 BIND(STUB); 5095 RuntimeAddress has_neg = RuntimeAddress(StubRoutines::aarch64::has_negatives()); 5096 assert(has_neg.target() != NULL, "has_negatives stub has not been generated"); 5097 trampoline_call(has_neg); 5098 b(DONE); 5099 5100 BIND(STUB_LONG); 5101 RuntimeAddress has_neg_long = RuntimeAddress( 5102 StubRoutines::aarch64::has_negatives_long()); 5103 assert(has_neg_long.target() != NULL, "has_negatives stub has not been generated"); 5104 trampoline_call(has_neg_long); 5105 b(DONE); 5106 5107 BIND(SET_RESULT); 5108 cset(result, NE); // set true or false 5109 5110 BIND(DONE); 5111 } 5112 5113 void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3, 5114 Register tmp4, Register tmp5, Register result, 5115 Register cnt1, int elem_size) { 5116 Label DONE, SAME; 5117 Register tmp1 = rscratch1; 5118 Register tmp2 = rscratch2; 5119 Register cnt2 = tmp2; // cnt2 only used in array length compare 5120 
int elem_per_word = wordSize/elem_size; 5121 int log_elem_size = exact_log2(elem_size); 5122 int length_offset = arrayOopDesc::length_offset_in_bytes(); 5123 int base_offset 5124 = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE); 5125 int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16); 5126 5127 assert(elem_size == 1 || elem_size == 2, "must be char or byte"); 5128 assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2); 5129 5130 #ifndef PRODUCT 5131 { 5132 const char kind = (elem_size == 2) ? 'U' : 'L'; 5133 char comment[64]; 5134 snprintf(comment, sizeof comment, "array_equals%c{", kind); 5135 BLOCK_COMMENT(comment); 5136 } 5137 #endif 5138 5139 // if (a1 == a2) 5140 // return true; 5141 cmpoop(a1, a2); // May have read barriers for a1 and a2. 5142 br(EQ, SAME); 5143 5144 if (UseSimpleArrayEquals) { 5145 Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL; 5146 // if (a1 == null || a2 == null) 5147 // return false; 5148 // a1 & a2 == 0 means (some-pointer is null) or 5149 // (very-rare-or-even-probably-impossible-pointer-values) 5150 // so, we can save one branch in most cases 5151 tst(a1, a2); 5152 mov(result, false); 5153 br(EQ, A_MIGHT_BE_NULL); 5154 // if (a1.length != a2.length) 5155 // return false; 5156 bind(A_IS_NOT_NULL); 5157 ldrw(cnt1, Address(a1, length_offset)); 5158 ldrw(cnt2, Address(a2, length_offset)); 5159 eorw(tmp5, cnt1, cnt2); 5160 cbnzw(tmp5, DONE); 5161 lea(a1, Address(a1, base_offset)); 5162 lea(a2, Address(a2, base_offset)); 5163 // Check for short strings, i.e. smaller than wordSize. 5164 subs(cnt1, cnt1, elem_per_word); 5165 br(Assembler::LT, SHORT); 5166 // Main 8 byte comparison loop. 5167 bind(NEXT_WORD); { 5168 ldr(tmp1, Address(post(a1, wordSize))); 5169 ldr(tmp2, Address(post(a2, wordSize))); 5170 subs(cnt1, cnt1, elem_per_word); 5171 eor(tmp5, tmp1, tmp2); 5172 cbnz(tmp5, DONE); 5173 } br(GT, NEXT_WORD); 5174 // Last longword. 
In the case where length == 4 we compare the 5175 // same longword twice, but that's still faster than another 5176 // conditional branch. 5177 // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when 5178 // length == 4. 5179 if (log_elem_size > 0) 5180 lsl(cnt1, cnt1, log_elem_size); 5181 ldr(tmp3, Address(a1, cnt1)); 5182 ldr(tmp4, Address(a2, cnt1)); 5183 eor(tmp5, tmp3, tmp4); 5184 cbnz(tmp5, DONE); 5185 b(SAME); 5186 bind(A_MIGHT_BE_NULL); 5187 // in case both a1 and a2 are not-null, proceed with loads 5188 cbz(a1, DONE); 5189 cbz(a2, DONE); 5190 b(A_IS_NOT_NULL); 5191 bind(SHORT); 5192 5193 tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left. 5194 { 5195 ldrw(tmp1, Address(post(a1, 4))); 5196 ldrw(tmp2, Address(post(a2, 4))); 5197 eorw(tmp5, tmp1, tmp2); 5198 cbnzw(tmp5, DONE); 5199 } 5200 bind(TAIL03); 5201 tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left. 5202 { 5203 ldrh(tmp3, Address(post(a1, 2))); 5204 ldrh(tmp4, Address(post(a2, 2))); 5205 eorw(tmp5, tmp3, tmp4); 5206 cbnzw(tmp5, DONE); 5207 } 5208 bind(TAIL01); 5209 if (elem_size == 1) { // Only needed when comparing byte arrays. 5210 tbz(cnt1, 0, SAME); // 0-1 bytes left. 
5211 { 5212 ldrb(tmp1, a1); 5213 ldrb(tmp2, a2); 5214 eorw(tmp5, tmp1, tmp2); 5215 cbnzw(tmp5, DONE); 5216 } 5217 } 5218 } else { 5219 Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT, 5220 CSET_EQ, LAST_CHECK; 5221 mov(result, false); 5222 cbz(a1, DONE); 5223 ldrw(cnt1, Address(a1, length_offset)); 5224 cbz(a2, DONE); 5225 ldrw(cnt2, Address(a2, length_offset)); 5226 // on most CPUs a2 is still "locked"(surprisingly) in ldrw and it's 5227 // faster to perform another branch before comparing a1 and a2 5228 cmp(cnt1, elem_per_word); 5229 br(LE, SHORT); // short or same 5230 ldr(tmp3, Address(pre(a1, base_offset))); 5231 cmp(cnt1, stubBytesThreshold); 5232 br(GE, STUB); 5233 ldr(tmp4, Address(pre(a2, base_offset))); 5234 sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size); 5235 cmp(cnt2, cnt1); 5236 br(NE, DONE); 5237 5238 // Main 16 byte comparison loop with 2 exits 5239 bind(NEXT_DWORD); { 5240 ldr(tmp1, Address(pre(a1, wordSize))); 5241 ldr(tmp2, Address(pre(a2, wordSize))); 5242 subs(cnt1, cnt1, 2 * elem_per_word); 5243 br(LE, TAIL); 5244 eor(tmp4, tmp3, tmp4); 5245 cbnz(tmp4, DONE); 5246 ldr(tmp3, Address(pre(a1, wordSize))); 5247 ldr(tmp4, Address(pre(a2, wordSize))); 5248 cmp(cnt1, elem_per_word); 5249 br(LE, TAIL2); 5250 cmp(tmp1, tmp2); 5251 } br(EQ, NEXT_DWORD); 5252 b(DONE); 5253 5254 bind(TAIL); 5255 eor(tmp4, tmp3, tmp4); 5256 eor(tmp2, tmp1, tmp2); 5257 lslv(tmp2, tmp2, tmp5); 5258 orr(tmp5, tmp4, tmp2); 5259 cmp(tmp5, zr); 5260 b(CSET_EQ); 5261 5262 bind(TAIL2); 5263 eor(tmp2, tmp1, tmp2); 5264 cbnz(tmp2, DONE); 5265 b(LAST_CHECK); 5266 5267 bind(STUB); 5268 ldr(tmp4, Address(pre(a2, base_offset))); 5269 cmp(cnt2, cnt1); 5270 br(NE, DONE); 5271 if (elem_size == 2) { // convert to byte counter 5272 lsl(cnt1, cnt1, 1); 5273 } 5274 eor(tmp5, tmp3, tmp4); 5275 cbnz(tmp5, DONE); 5276 RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals()); 5277 assert(stub.target() != NULL, "array_equals_long stub has not been generated"); 5278 
trampoline_call(stub); 5279 b(DONE); 5280 5281 bind(EARLY_OUT); 5282 // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2) 5283 // so, if a2 == null => return false(0), else return true, so we can return a2 5284 mov(result, a2); 5285 b(DONE); 5286 bind(SHORT); 5287 cmp(cnt2, cnt1); 5288 br(NE, DONE); 5289 cbz(cnt1, SAME); 5290 sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size); 5291 ldr(tmp3, Address(a1, base_offset)); 5292 ldr(tmp4, Address(a2, base_offset)); 5293 bind(LAST_CHECK); 5294 eor(tmp4, tmp3, tmp4); 5295 lslv(tmp5, tmp4, tmp5); 5296 cmp(tmp5, zr); 5297 bind(CSET_EQ); 5298 cset(result, EQ); 5299 b(DONE); 5300 } 5301 5302 bind(SAME); 5303 mov(result, true); 5304 // That's it. 5305 bind(DONE); 5306 5307 BLOCK_COMMENT("} array_equals"); 5308 } 5309 5310 // Compare Strings 5311 5312 // For Strings we're passed the address of the first characters in a1 5313 // and a2 and the length in cnt1. 5314 // elem_size is the element size in bytes: either 1 or 2. 5315 // There are two implementations. For arrays >= 8 bytes, all 5316 // comparisons (including the final one, which may overlap) are 5317 // performed 8 bytes at a time. For strings < 8 bytes, we compare a 5318 // halfword, then a short, and then a byte. 5319 5320 void MacroAssembler::string_equals(Register a1, Register a2, 5321 Register result, Register cnt1, int elem_size) 5322 { 5323 Label SAME, DONE, SHORT, NEXT_WORD; 5324 Register tmp1 = rscratch1; 5325 Register tmp2 = rscratch2; 5326 Register cnt2 = tmp2; // cnt2 only used in array length compare 5327 5328 assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte"); 5329 assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2); 5330 5331 #ifndef PRODUCT 5332 { 5333 const char kind = (elem_size == 2) ? 'U' : 'L'; 5334 char comment[64]; 5335 snprintf(comment, sizeof comment, "{string_equals%c", kind); 5336 BLOCK_COMMENT(comment); 5337 } 5338 #endif 5339 5340 mov(result, false); 5341 5342 // Check for short strings, i.e. 
smaller than wordSize. 5343 subs(cnt1, cnt1, wordSize); 5344 br(Assembler::LT, SHORT); 5345 // Main 8 byte comparison loop. 5346 bind(NEXT_WORD); { 5347 ldr(tmp1, Address(post(a1, wordSize))); 5348 ldr(tmp2, Address(post(a2, wordSize))); 5349 subs(cnt1, cnt1, wordSize); 5350 eor(tmp1, tmp1, tmp2); 5351 cbnz(tmp1, DONE); 5352 } br(GT, NEXT_WORD); 5353 // Last longword. In the case where length == 4 we compare the 5354 // same longword twice, but that's still faster than another 5355 // conditional branch. 5356 // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when 5357 // length == 4. 5358 ldr(tmp1, Address(a1, cnt1)); 5359 ldr(tmp2, Address(a2, cnt1)); 5360 eor(tmp2, tmp1, tmp2); 5361 cbnz(tmp2, DONE); 5362 b(SAME); 5363 5364 bind(SHORT); 5365 Label TAIL03, TAIL01; 5366 5367 tbz(cnt1, 2, TAIL03); // 0-7 bytes left. 5368 { 5369 ldrw(tmp1, Address(post(a1, 4))); 5370 ldrw(tmp2, Address(post(a2, 4))); 5371 eorw(tmp1, tmp1, tmp2); 5372 cbnzw(tmp1, DONE); 5373 } 5374 bind(TAIL03); 5375 tbz(cnt1, 1, TAIL01); // 0-3 bytes left. 5376 { 5377 ldrh(tmp1, Address(post(a1, 2))); 5378 ldrh(tmp2, Address(post(a2, 2))); 5379 eorw(tmp1, tmp1, tmp2); 5380 cbnzw(tmp1, DONE); 5381 } 5382 bind(TAIL01); 5383 if (elem_size == 1) { // Only needed when comparing 1-byte elements 5384 tbz(cnt1, 0, SAME); // 0-1 bytes left. 5385 { 5386 ldrb(tmp1, a1); 5387 ldrb(tmp2, a2); 5388 eorw(tmp1, tmp1, tmp2); 5389 cbnzw(tmp1, DONE); 5390 } 5391 } 5392 // Arrays are equal. 5393 bind(SAME); 5394 mov(result, true); 5395 5396 // That's it. 5397 bind(DONE); 5398 BLOCK_COMMENT("} string_equals"); 5399 } 5400 5401 5402 // The size of the blocks erased by the zero_blocks stub. We must 5403 // handle anything smaller than this ourselves in zero_words(). 5404 const int MacroAssembler::zero_words_block_size = 8; 5405 5406 // zero_words() is used by C2 ClearArray patterns. 
It is as small as 5407 // possible, handling small word counts locally and delegating 5408 // anything larger to the zero_blocks stub. It is expanded many times 5409 // in compiled code, so it is important to keep it short. 5410 5411 // ptr: Address of a buffer to be zeroed. 5412 // cnt: Count in HeapWords. 5413 // 5414 // ptr, cnt, rscratch1, and rscratch2 are clobbered. 5415 void MacroAssembler::zero_words(Register ptr, Register cnt) 5416 { 5417 assert(is_power_of_2(zero_words_block_size), "adjust this"); 5418 assert(ptr == r10 && cnt == r11, "mismatch in register usage"); 5419 5420 BLOCK_COMMENT("zero_words {"); 5421 cmp(cnt, zero_words_block_size); 5422 Label around, done, done16; 5423 br(LO, around); 5424 { 5425 RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks()); 5426 assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated"); 5427 if (StubRoutines::aarch64::complete()) { 5428 trampoline_call(zero_blocks); 5429 } else { 5430 bl(zero_blocks); 5431 } 5432 } 5433 bind(around); 5434 for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) { 5435 Label l; 5436 tbz(cnt, exact_log2(i), l); 5437 for (int j = 0; j < i; j += 2) { 5438 stp(zr, zr, post(ptr, 16)); 5439 } 5440 bind(l); 5441 } 5442 { 5443 Label l; 5444 tbz(cnt, 0, l); 5445 str(zr, Address(ptr)); 5446 bind(l); 5447 } 5448 BLOCK_COMMENT("} zero_words"); 5449 } 5450 5451 // base: Address of a buffer to be zeroed, 8 bytes aligned. 5452 // cnt: Immediate count in HeapWords. 
// Size threshold, in bytes, used by zero_words(Register, u_int64_t) below:
// immediate counts of up to SmallArraySize / BytesPerLong words are zeroed
// with straight-line stores; anything larger gets a loop.
#define SmallArraySize (18 * BytesPerLong)
void MacroAssembler::zero_words(Register base, u_int64_t cnt)
{
  BLOCK_COMMENT("zero_words {");
  // If the count is odd, zero the first word on its own so that everything
  // after it can be done with paired stores.  i is the index (in words) of
  // the next word still to be zeroed.
  int i = cnt & 1;  // store any odd word to start
  if (i) str(zr, Address(base));

  if (cnt <= SmallArraySize / BytesPerLong) {
    // Small case: fully unrolled, one stp per remaining pair of words.
    for (; i < (int)cnt; i += 2)
      stp(zr, zr, Address(base, i * wordSize));
  } else {
    const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
    // Words that do not fill a whole unrolled iteration (2 * unroll words)
    // are zeroed up front with straight-line stores.
    int remainder = cnt % (2 * unroll);
    for (; i < remainder; i += 2)
      stp(zr, zr, Address(base, i * wordSize));

    Label loop;
    Register cnt_reg = rscratch1;
    Register loop_base = rscratch2;
    // What is left is a multiple of 2 * unroll words; cnt_reg counts it down.
    cnt = cnt - remainder;
    mov(cnt_reg, cnt);
    // adjust base and prebias by -2 * wordSize so we can pre-increment
    add(loop_base, base, (remainder - 2) * wordSize);
    bind(loop);
    sub(cnt_reg, cnt_reg, 2 * unroll);
    // unroll - 1 stores at fixed offsets from loop_base ...
    for (i = 1; i < unroll; i++)
      stp(zr, zr, Address(loop_base, 2 * i * wordSize));
    // ... and a final pre-indexed store that also advances loop_base by one
    // full iteration (2 * unroll words).
    stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
    cbnz(cnt_reg, loop);
  }
  BLOCK_COMMENT("} zero_words");
}

// Zero blocks of memory by using DC ZVA.
//
// Aligns the base address first sufficiently for DC ZVA, then uses
// DC ZVA repeatedly for every full block.  cnt is the size to be
// zeroed in HeapWords.  Returns the count of words left to be zeroed
// in cnt.
//
// If base is not 16-byte aligned the generated code branches straight
// to the end: nothing is zeroed and base and cnt are left unchanged.
//
// rscratch1 and rscratch2 are used as temporaries.
//
// NOTE: This is intended to be used in the zero_blocks() stub.  If
// you want to use it elsewhere, note that cnt must be >= 2*zva_length.
void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
  Register tmp = rscratch1;
  Register tmp2 = rscratch2;
  int zva_length = VM_Version::zva_length();
  Label initial_table_end, loop_zva;
  Label fini;

  // Base must be 16 byte aligned. If not just return and let caller handle it
  tst(base, 0x0f);
  br(Assembler::NE, fini);
  // Align base with ZVA length.
  // (-base) & (zva_length - 1) is the distance to the next multiple of
  // zva_length -- assumes zva_length is a power of two; TODO confirm.
  neg(tmp, base);
  andr(tmp, tmp, zva_length - 1);

  // tmp: the number of bytes to be filled to align the base with ZVA length.
  // base is advanced to the aligned address first; the table of stores
  // below then zeroes backwards from it using negative offsets.
  add(base, base, tmp);
  sub(cnt, cnt, tmp, Assembler::ASR, 3);   // bytes -> words
  // Computed branch into the store table: step back tmp / 4 bytes from the
  // end of the table, i.e. one 4-byte stp for every 16 bytes to be filled,
  // so exactly the last tmp / 16 stores are executed.
  adr(tmp2, initial_table_end);
  sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
  br(tmp2);

  // Table of paired stores covering [base - zva_length + 16, base).
  for (int i = -zva_length + 16; i < 0; i += 16)
    stp(zr, zr, Address(base, i));
  bind(initial_table_end);

  // Pre-decrement by one block (zva_length >> 3 words) so the loop can
  // test the sign of cnt after each subs.
  sub(cnt, cnt, zva_length >> 3);
  bind(loop_zva);
  dc(Assembler::ZVA, base);
  subs(cnt, cnt, zva_length >> 3);
  add(base, base, zva_length);
  br(Assembler::GE, loop_zva);
  add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
  bind(fini);
}

// base:   Address of a buffer to be filled, 8 bytes aligned.
// cnt:    Count in 8-byte unit.
// value:  Value to be filled with.
// base will point to the end of the buffer after filling.
// cnt, rscratch1 and rscratch2 are clobbered.
void MacroAssembler::fill_words(Register base, Register cnt, Register value)
{
  //  Algorithm (p is base, counted in 8-byte words; v is value):
  //
  //    if (cnt == 0) return;
  //    if (p & 8) {               // bit 3 of the address set:
  //      *p++ = v;                // store one word so that the
  //      cnt--;                   // paired stores start 16-byte aligned
  //    }
  //
  //    scratch1 = cnt & 14;       // words handled by the partial first pass
  //    cnt -= scratch1;
  //    p += scratch1;
  //    goto entry - (scratch1 / 2) stp instructions;
  //
  //  loop:
  //    p += 16;
  //    p[-16] = v; p[-15] = v;    // stp 1 of 8
  //    p[-14] = v; p[-13] = v;
  //    // ...
  //    p[-2] = v;  p[-1] = v;     // stp 8 of 8
  //  entry:
  //    cnt -= 16;
  //    if (cnt >= 0) goto loop;
  //
  //    if (cnt & 1) *p++ = v;     // odd trailing word

  assert_different_registers(base, cnt, value, rscratch1, rscratch2);

  Label fini, skip, entry, loop;
  const int unroll = 8; // Number of stp instructions we'll unroll

  cbz(cnt, fini);
  tbz(base, 3, skip);
  str(value, Address(post(base, 8)));
  sub(cnt, cnt, 1);
  bind(skip);

  // rscratch1 = cnt & 14: an even number of words, fewer than one full
  // unrolled pass (2 * unroll words).
  andr(rscratch1, cnt, (unroll-1) * 2);
  sub(cnt, cnt, rscratch1);
  add(base, base, rscratch1, Assembler::LSL, 3);
  // Step back 2 * rscratch1 bytes from entry: one 4-byte stp for every two
  // words, so only the last rscratch1 / 2 stores of the unrolled body run.
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
  br(rscratch2);

  bind(loop);
  add(base, base, unroll * 16);
  for (int i = -unroll; i < 0; i++)
    stp(value, value, Address(base, i * 16));
  bind(entry);
  subs(cnt, cnt, unroll * 2);
  br(Assembler::GE, loop);

  // cnt is negative here; its low bit is still the low bit of the
  // original remaining count, i.e. set when one word is left over.
  tbz(cnt, 0, fini);
  str(value, Address(post(base, 8)));
  bind(fini);
}

// Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
// java/lang/StringUTF16.compress.
void MacroAssembler::encode_iso_array(Register src, Register dst,
                      Register len, Register result,
                      FloatRegister Vtmp1, FloatRegister Vtmp2,
                      FloatRegister Vtmp3, FloatRegister Vtmp4)
{
    // Narrows 16-bit chars read from src to bytes written at dst, stopping
    // at the first char whose upper byte is non-zero.
    //
    //   src, dst: advanced as chars are consumed / bytes are produced.
    //   len:      number of chars on entry; on exit the number not yet
    //             processed (0 when every char was converted).
    //   result:   initial len minus the final len, i.e. the index at which
    //             conversion stopped.
    //
    // Besides the four Vtmp arguments the generated code uses rscratch1,
    // rscratch2 and, in the 32-char path, v4 and v5 by name.
    Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
        NEXT_32_START, NEXT_32_PRFM_START;
    Register tmp1 = rscratch1, tmp2 = rscratch2;

      mov(result, len); // Save initial len

#ifndef BUILTIN_SIM
      cmp(len, 8); // handle shortest strings first
      br(LT, LOOP_1);
      cmp(len, 32);
      br(LT, NEXT_8);
      // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
      // to convert chars to bytes
      //
      // Each pass of the 32-char loops loads 64 bytes from src and stores 32
      // bytes to dst *before* checking the high bytes.  src, dst and len are
      // only advanced when the check passes; on failure control goes to
      // LOOP_8, which re-reads the same chars in smaller steps.
      if (SoftwarePrefetchHintDistance >= 0) {
        ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        cmp(len, SoftwarePrefetchHintDistance/2 + 16);
        br(LE, NEXT_32_START);
        b(NEXT_32_PRFM_START);
        // 32-char loop used while len stays at or above
        // SoftwarePrefetchHintDistance/2 + 16.
        BIND(NEXT_32_PRFM);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_PRFM_START);
          prfm(Address(src, SoftwarePrefetchHintDistance));
          orr(v4, T16B, Vtmp1, Vtmp2);
          orr(v5, T16B, Vtmp3, Vtmp4);
          uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
          uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
          stpq(Vtmp1, Vtmp3, dst);
          uzp2(v5, T16B, v4, v5); // high bytes
          // Fold both 64-bit halves of the high-byte vector into tmp1;
          // non-zero means at least one char did not fit in a byte.
          umov(tmp2, v5, D, 1);
          fmovd(tmp1, v5);
          orr(tmp1, tmp1, tmp2);
          cbnz(tmp1, LOOP_8);
          sub(len, len, 32);
          add(dst, dst, 32);
          add(src, src, 64);
          cmp(len, SoftwarePrefetchHintDistance/2 + 16);
          br(GE, NEXT_32_PRFM);
          cmp(len, 32);
          br(LT, LOOP_8);
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_START);
      } else {
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
      }
      // NOTE(review): this prfm sits after the if/else, so it is emitted in
      // both configurations, including when SoftwarePrefetchHintDistance is
      // negative -- confirm that is intended.
      prfm(Address(src, SoftwarePrefetchHintDistance));
      uzp1(v4, T16B, Vtmp1, Vtmp2);
      uzp1(v5, T16B, Vtmp3, Vtmp4);
      stpq(v4, v5, dst);
      orr(Vtmp1, T16B, Vtmp1, Vtmp2);
      orr(Vtmp3, T16B, Vtmp3, Vtmp4);
      uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
      umov(tmp2, Vtmp1, D, 1);
      fmovd(tmp1, Vtmp1);
      orr(tmp1, tmp1, tmp2);
      cbnz(tmp1, LOOP_8);
      sub(len, len, 32);
      add(dst, dst, 32);
      add(src, src, 64);
      cmp(len, 32);
      br(GE, NEXT_32);
      cbz(len, DONE);

    // 8 chars at a time: 16 bytes in, 8 bytes out.  As above, the store
    // precedes the check; a failing group is redone char by char at NEXT_1.
    BIND(LOOP_8);
      cmp(len, 8);
      br(LT, LOOP_1);
    BIND(NEXT_8);
      ld1(Vtmp1, T8H, src);
      uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
      uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
      strd(Vtmp2, dst);
      fmovd(tmp1, Vtmp3);
      cbnz(tmp1, NEXT_1);

      sub(len, len, 8);
      add(dst, dst, 8);
      add(src, src, 16);
      cmp(len, 8);
      br(GE, NEXT_8);

    BIND(LOOP_1);
#endif
    // Scalar loop, one char per pass.  The byte is stored and both pointers
    // are advanced before the test; len is only decremented for chars that
    // pass, so at SET_RESULT it still counts the offending char.
    cbz(len, DONE);
    BIND(NEXT_1);
      ldrh(tmp1, Address(post(src, 2)));
      strb(tmp1, Address(post(dst, 1)));
      tst(tmp1, 0xff00);
      br(NE, SET_RESULT);
      subs(len, len, 1);
      br(GT, NEXT_1);

    BIND(SET_RESULT);
      sub(result, result, len); // Return index where we stopped
                                // Return len == 0 if we processed all
                                // characters
    BIND(DONE);
}


// Inflate byte[] array to char[].
//
//   src, dst: source bytes / destination chars; both are advanced.
//   len:      number of bytes to inflate (used as a 32-bit value).
//   vtmp1:    zeroed here and used as the second zip1 operand throughout.
//   vtmp2, vtmp3, tmp4: temporaries.
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
                                        FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                        Register tmp4) {
  Label big, done, after_init, to_stub;

  assert_different_registers(src, dst, len, tmp4, rscratch1);

  fmovd(vtmp1, zr);
  // tmp4 = len / 8: the number of whole 8-byte groups.
  lsrw(tmp4, len, 3);
  bind(after_init);
  cbnzw(tmp4, big);
  // Short string: less than 8 bytes.
  {
    Label loop, tiny;

    cmpw(len, 4);
    br(LT, tiny);
    // Use SIMD to do 4 bytes.
    ldrs(vtmp2, post(src, 4));
    zip1(vtmp3, T8B, vtmp2, vtmp1);
    subw(len, len, 4);
    strd(vtmp3, post(dst, 8));

    cbzw(len, done);

    // Do the remaining bytes by steam (one at a time).
    bind(loop);
    ldrb(tmp4, post(src, 1));
    strh(tmp4, post(dst, 2));
    subw(len, len, 1);

    bind(tiny);
    cbnz(len, loop);

    b(done);
  }

  // Out-of-line call for long inputs; only emitted (and only branched to)
  // when software prefetch is enabled.  Control resumes at after_init, which
  // re-tests tmp4.
  // NOTE(review): the stub's register contract is not visible here; this
  // presumably relies on it leaving src/dst/tmp4/len consistent for the code
  // at after_init -- verify against the stub.
  if (SoftwarePrefetchHintDistance >= 0) {
    bind(to_stub);
      RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
      assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
      trampoline_call(stub);
      b(after_init);
  }

  // Unpack the bytes 8 at a time.
  bind(big);
  {
    Label loop, around, loop_last, loop_start;

    if (SoftwarePrefetchHintDistance >= 0) {
      // At or above this many 8-byte groups the work is handed to the stub.
      // Note that the first group has already been loaded into vtmp2 and len
      // already reduced to len & 7 when the branch to to_stub is taken.
      const int large_loop_threshold = (64 + 16)/8;
      ldrd(vtmp2, post(src, 8));
      andw(len, len, 7);
      cmp(tmp4, large_loop_threshold);
      br(GE, to_stub);
      b(loop_start);

      // Two groups per pass, with each load issued ahead of the previous
      // group's store; loop_last handles an odd final group.
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      bind(loop_start);
      subs(tmp4, tmp4, 1);
      br(EQ, loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      ldrd(vtmp3, post(src, 8));
      st1(vtmp2, T8H, post(dst, 16));
      subs(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp3, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      br(NE, loop);
      b(around);
      bind(loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      st1(vtmp2, T8H, post(dst, 16));
      bind(around);
      cbz(len, done);
    } else {
      // Simple variant: one group per pass.  There is no len == 0 check
      // here, so the tail code below always runs on this path.
      andw(len, len, 7);
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      sub(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp2, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      cbnz(tmp4, loop);
    }
  }

  // Do the tail of up to 8 bytes.
  // len is now the leftover count (len & 7).  Rather than looping, convert
  // the 8 bytes that end exactly at the end of the input; this overlaps
  // bytes already converted above, which is safe because this point is only
  // reached after at least one full 8-byte group has been processed.
  add(src, src, len);
  ldrd(vtmp3, Address(src, -8));
  add(dst, dst, len, ext::uxtw, 1);
  zip1(vtmp3, T16B, vtmp3, vtmp1);
  strq(vtmp3, Address(dst, -16));

  bind(done);
}

// Compress char[] array to byte[].
5794 void MacroAssembler::char_array_compress(Register src, Register dst, Register len, 5795 FloatRegister tmp1Reg, FloatRegister tmp2Reg, 5796 FloatRegister tmp3Reg, FloatRegister tmp4Reg, 5797 Register result) { 5798 encode_iso_array(src, dst, len, result, 5799 tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg); 5800 cmp(len, zr); 5801 csel(result, result, zr, EQ); 5802 } 5803 5804 // get_thread() can be called anywhere inside generated code so we 5805 // need to save whatever non-callee save context might get clobbered 5806 // by the call to JavaThread::aarch64_get_thread_helper() or, indeed, 5807 // the call setup code. 5808 // 5809 // aarch64_get_thread_helper() clobbers only r0, r1, and flags. 5810 // 5811 void MacroAssembler::get_thread(Register dst) { 5812 RegSet saved_regs = RegSet::range(r0, r1) + lr - dst; 5813 push(saved_regs, sp); 5814 5815 mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper)); 5816 blrt(lr, 1, 0, 1); 5817 if (dst != c_rarg0) { 5818 mov(dst, c_rarg0); 5819 } 5820 5821 pop(saved_regs, sp); 5822 }