1 /* 2 * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 
23 * 24 */ 25 26 #include <sys/types.h> 27 28 #include "precompiled.hpp" 29 #include "jvm.h" 30 #include "asm/assembler.hpp" 31 #include "asm/assembler.inline.hpp" 32 #include "gc/shared/barrierSet.hpp" 33 #include "gc/shared/cardTable.hpp" 34 #include "gc/shared/barrierSetAssembler.hpp" 35 #include "gc/shared/cardTableBarrierSet.hpp" 36 #include "interpreter/interpreter.hpp" 37 #include "compiler/disassembler.hpp" 38 #include "memory/resourceArea.hpp" 39 #include "nativeInst_aarch64.hpp" 40 #include "oops/accessDecorators.hpp" 41 #include "oops/compressedOops.inline.hpp" 42 #include "oops/klass.inline.hpp" 43 #include "oops/oop.hpp" 44 #include "opto/compile.hpp" 45 #include "opto/intrinsicnode.hpp" 46 #include "opto/node.hpp" 47 #include "runtime/biasedLocking.hpp" 48 #include "runtime/icache.hpp" 49 #include "runtime/interfaceSupport.inline.hpp" 50 #include "runtime/jniHandles.inline.hpp" 51 #include "runtime/sharedRuntime.hpp" 52 #include "runtime/thread.hpp" 53 54 #ifdef PRODUCT 55 #define BLOCK_COMMENT(str) /* nothing */ 56 #define STOP(error) stop(error) 57 #else 58 #define BLOCK_COMMENT(str) block_comment(str) 59 #define STOP(error) block_comment(error); stop(error) 60 #endif 61 62 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 63 64 // Patch any kind of instruction; there may be several instructions. 65 // Return the total length (in bytes) of the instructions. 
66 int MacroAssembler::pd_patch_instruction_size(address branch, address target) { 67 int instructions = 1; 68 assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant"); 69 long offset = (target - branch) >> 2; 70 unsigned insn = *(unsigned*)branch; 71 if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) { 72 // Load register (literal) 73 Instruction_aarch64::spatch(branch, 23, 5, offset); 74 } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) { 75 // Unconditional branch (immediate) 76 Instruction_aarch64::spatch(branch, 25, 0, offset); 77 } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) { 78 // Conditional branch (immediate) 79 Instruction_aarch64::spatch(branch, 23, 5, offset); 80 } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) { 81 // Compare & branch (immediate) 82 Instruction_aarch64::spatch(branch, 23, 5, offset); 83 } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) { 84 // Test & branch (immediate) 85 Instruction_aarch64::spatch(branch, 18, 5, offset); 86 } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) { 87 // PC-rel. addressing 88 offset = target-branch; 89 int shift = Instruction_aarch64::extract(insn, 31, 31); 90 if (shift) { 91 u_int64_t dest = (u_int64_t)target; 92 uint64_t pc_page = (uint64_t)branch >> 12; 93 uint64_t adr_page = (uint64_t)target >> 12; 94 unsigned offset_lo = dest & 0xfff; 95 offset = adr_page - pc_page; 96 97 // We handle 4 types of PC relative addressing 98 // 1 - adrp Rx, target_page 99 // ldr/str Ry, [Rx, #offset_in_page] 100 // 2 - adrp Rx, target_page 101 // add Ry, Rx, #offset_in_page 102 // 3 - adrp Rx, target_page (page aligned reloc, offset == 0) 103 // movk Rx, #imm16<<32 104 // 4 - adrp Rx, target_page (page aligned reloc, offset == 0) 105 // In the first 3 cases we must check that Rx is the same in the adrp and the 106 // subsequent ldr/str, add or movk instruction. 
Otherwise we could accidentally end 107 // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened 108 // to be followed by a random unrelated ldr/str, add or movk instruction. 109 // 110 unsigned insn2 = ((unsigned*)branch)[1]; 111 if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 && 112 Instruction_aarch64::extract(insn, 4, 0) == 113 Instruction_aarch64::extract(insn2, 9, 5)) { 114 // Load/store register (unsigned immediate) 115 unsigned size = Instruction_aarch64::extract(insn2, 31, 30); 116 Instruction_aarch64::patch(branch + sizeof (unsigned), 117 21, 10, offset_lo >> size); 118 guarantee(((dest >> size) << size) == dest, "misaligned target"); 119 instructions = 2; 120 } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 && 121 Instruction_aarch64::extract(insn, 4, 0) == 122 Instruction_aarch64::extract(insn2, 4, 0)) { 123 // add (immediate) 124 Instruction_aarch64::patch(branch + sizeof (unsigned), 125 21, 10, offset_lo); 126 instructions = 2; 127 } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 && 128 Instruction_aarch64::extract(insn, 4, 0) == 129 Instruction_aarch64::extract(insn2, 4, 0)) { 130 // movk #imm16<<32 131 Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32); 132 long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L); 133 long pc_page = (long)branch >> 12; 134 long adr_page = (long)dest >> 12; 135 offset = adr_page - pc_page; 136 instructions = 2; 137 } 138 } 139 int offset_lo = offset & 3; 140 offset >>= 2; 141 Instruction_aarch64::spatch(branch, 23, 5, offset); 142 Instruction_aarch64::patch(branch, 30, 29, offset_lo); 143 } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) { 144 u_int64_t dest = (u_int64_t)target; 145 // Move wide constant 146 assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch"); 147 assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch"); 148 
Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff); 149 Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff); 150 Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff); 151 assert(target_addr_for_insn(branch) == target, "should be"); 152 instructions = 3; 153 } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 && 154 Instruction_aarch64::extract(insn, 4, 0) == 0b11111) { 155 // nothing to do 156 assert(target == 0, "did not expect to relocate target for polling page load"); 157 } else { 158 ShouldNotReachHere(); 159 } 160 return instructions * NativeInstruction::instruction_size; 161 } 162 163 int MacroAssembler::patch_oop(address insn_addr, address o) { 164 int instructions; 165 unsigned insn = *(unsigned*)insn_addr; 166 assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch"); 167 168 // OOPs are either narrow (32 bits) or wide (48 bits). We encode 169 // narrow OOPs by setting the upper 16 bits in the first 170 // instruction. 171 if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) { 172 // Move narrow OOP 173 narrowOop n = CompressedOops::encode((oop)o); 174 Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16); 175 Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff); 176 instructions = 2; 177 } else { 178 // Move wide OOP 179 assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch"); 180 uintptr_t dest = (uintptr_t)o; 181 Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff); 182 Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff); 183 Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff); 184 instructions = 3; 185 } 186 return instructions * NativeInstruction::instruction_size; 187 } 188 189 int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) { 190 // Metatdata pointers are either narrow (32 bits) or wide (48 bits). 
191 // We encode narrow ones by setting the upper 16 bits in the first 192 // instruction. 193 NativeInstruction *insn = nativeInstruction_at(insn_addr); 194 assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 && 195 nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch"); 196 197 Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16); 198 Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff); 199 return 2 * NativeInstruction::instruction_size; 200 } 201 202 address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) { 203 long offset = 0; 204 if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) { 205 // Load register (literal) 206 offset = Instruction_aarch64::sextract(insn, 23, 5); 207 return address(((uint64_t)insn_addr + (offset << 2))); 208 } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) { 209 // Unconditional branch (immediate) 210 offset = Instruction_aarch64::sextract(insn, 25, 0); 211 } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) { 212 // Conditional branch (immediate) 213 offset = Instruction_aarch64::sextract(insn, 23, 5); 214 } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) { 215 // Compare & branch (immediate) 216 offset = Instruction_aarch64::sextract(insn, 23, 5); 217 } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) { 218 // Test & branch (immediate) 219 offset = Instruction_aarch64::sextract(insn, 18, 5); 220 } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) { 221 // PC-rel. addressing 222 offset = Instruction_aarch64::extract(insn, 30, 29); 223 offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2; 224 int shift = Instruction_aarch64::extract(insn, 31, 31) ? 
12 : 0; 225 if (shift) { 226 offset <<= shift; 227 uint64_t target_page = ((uint64_t)insn_addr) + offset; 228 target_page &= ((uint64_t)-1) << shift; 229 // Return the target address for the following sequences 230 // 1 - adrp Rx, target_page 231 // ldr/str Ry, [Rx, #offset_in_page] 232 // 2 - adrp Rx, target_page 233 // add Ry, Rx, #offset_in_page 234 // 3 - adrp Rx, target_page (page aligned reloc, offset == 0) 235 // movk Rx, #imm12<<32 236 // 4 - adrp Rx, target_page (page aligned reloc, offset == 0) 237 // 238 // In the first two cases we check that the register is the same and 239 // return the target_page + the offset within the page. 240 // Otherwise we assume it is a page aligned relocation and return 241 // the target page only. 242 // 243 unsigned insn2 = ((unsigned*)insn_addr)[1]; 244 if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 && 245 Instruction_aarch64::extract(insn, 4, 0) == 246 Instruction_aarch64::extract(insn2, 9, 5)) { 247 // Load/store register (unsigned immediate) 248 unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10); 249 unsigned int size = Instruction_aarch64::extract(insn2, 31, 30); 250 return address(target_page + (byte_offset << size)); 251 } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 && 252 Instruction_aarch64::extract(insn, 4, 0) == 253 Instruction_aarch64::extract(insn2, 4, 0)) { 254 // add (immediate) 255 unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10); 256 return address(target_page + byte_offset); 257 } else { 258 if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 && 259 Instruction_aarch64::extract(insn, 4, 0) == 260 Instruction_aarch64::extract(insn2, 4, 0)) { 261 target_page = (target_page & 0xffffffff) | 262 ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32); 263 } 264 return (address)target_page; 265 } 266 } else { 267 ShouldNotReachHere(); 268 } 269 } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) { 
270 u_int32_t *insns = (u_int32_t *)insn_addr; 271 // Move wide constant: movz, movk, movk. See movptr(). 272 assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch"); 273 assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch"); 274 return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5)) 275 + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16) 276 + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32)); 277 } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 && 278 Instruction_aarch64::extract(insn, 4, 0) == 0b11111) { 279 return 0; 280 } else { 281 ShouldNotReachHere(); 282 } 283 return address(((uint64_t)insn_addr + (offset << 2))); 284 } 285 286 void MacroAssembler::serialize_memory(Register thread, Register tmp) { 287 dsb(Assembler::SY); 288 } 289 290 void MacroAssembler::safepoint_poll(Label& slow_path) { 291 if (SafepointMechanism::uses_thread_local_poll()) { 292 ldr(rscratch1, Address(rthread, Thread::polling_page_offset())); 293 tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path); 294 } else { 295 unsigned long offset; 296 adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset); 297 ldrw(rscratch1, Address(rscratch1, offset)); 298 assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code"); 299 cbnz(rscratch1, slow_path); 300 } 301 } 302 303 // Just like safepoint_poll, but use an acquiring load for thread- 304 // local polling. 305 // 306 // We need an acquire here to ensure that any subsequent load of the 307 // global SafepointSynchronize::_state flag is ordered after this load 308 // of the local Thread::_polling page. We don't want this poll to 309 // return false (i.e. not safepointing) and a later poll of the global 310 // SafepointSynchronize::_state spuriously to return true. 
311 // 312 // This is to avoid a race when we're in a native->Java transition 313 // racing the code which wakes up from a safepoint. 314 // 315 void MacroAssembler::safepoint_poll_acquire(Label& slow_path) { 316 if (SafepointMechanism::uses_thread_local_poll()) { 317 lea(rscratch1, Address(rthread, Thread::polling_page_offset())); 318 ldar(rscratch1, rscratch1); 319 tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path); 320 } else { 321 safepoint_poll(slow_path); 322 } 323 } 324 325 void MacroAssembler::reset_last_Java_frame(bool clear_fp) { 326 // we must set sp to zero to clear frame 327 str(zr, Address(rthread, JavaThread::last_Java_sp_offset())); 328 329 // must clear fp, so that compiled frames are not confused; it is 330 // possible that we need it only for debugging 331 if (clear_fp) { 332 str(zr, Address(rthread, JavaThread::last_Java_fp_offset())); 333 } 334 335 // Always clear the pc because it could have been set by make_walkable() 336 str(zr, Address(rthread, JavaThread::last_Java_pc_offset())); 337 } 338 339 // Calls to C land 340 // 341 // When entering C land, the rfp, & resp of the last Java frame have to be recorded 342 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp 343 // has to be reset to 0. This is required to allow proper stack traversal. 
344 void MacroAssembler::set_last_Java_frame(Register last_java_sp, 345 Register last_java_fp, 346 Register last_java_pc, 347 Register scratch) { 348 349 if (last_java_pc->is_valid()) { 350 str(last_java_pc, Address(rthread, 351 JavaThread::frame_anchor_offset() 352 + JavaFrameAnchor::last_Java_pc_offset())); 353 } 354 355 // determine last_java_sp register 356 if (last_java_sp == sp) { 357 mov(scratch, sp); 358 last_java_sp = scratch; 359 } else if (!last_java_sp->is_valid()) { 360 last_java_sp = esp; 361 } 362 363 str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset())); 364 365 // last_java_fp is optional 366 if (last_java_fp->is_valid()) { 367 str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset())); 368 } 369 } 370 371 void MacroAssembler::set_last_Java_frame(Register last_java_sp, 372 Register last_java_fp, 373 address last_java_pc, 374 Register scratch) { 375 if (last_java_pc != NULL) { 376 adr(scratch, last_java_pc); 377 } else { 378 // FIXME: This is almost never correct. We should delete all 379 // cases of set_last_Java_frame with last_java_pc=NULL and use the 380 // correct return address instead. 
381 adr(scratch, pc()); 382 } 383 384 str(scratch, Address(rthread, 385 JavaThread::frame_anchor_offset() 386 + JavaFrameAnchor::last_Java_pc_offset())); 387 388 set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch); 389 } 390 391 void MacroAssembler::set_last_Java_frame(Register last_java_sp, 392 Register last_java_fp, 393 Label &L, 394 Register scratch) { 395 if (L.is_bound()) { 396 set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch); 397 } else { 398 InstructionMark im(this); 399 L.add_patch_at(code(), locator()); 400 set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch); 401 } 402 } 403 404 void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) { 405 assert(ReservedCodeCacheSize < 4*G, "branch out of range"); 406 assert(CodeCache::find_blob(entry.target()) != NULL, 407 "destination of far call not found in code cache"); 408 if (far_branches()) { 409 unsigned long offset; 410 // We can use ADRP here because we know that the total size of 411 // the code cache cannot exceed 2Gb. 412 adrp(tmp, entry, offset); 413 add(tmp, tmp, offset); 414 if (cbuf) cbuf->set_insts_mark(); 415 blr(tmp); 416 } else { 417 if (cbuf) cbuf->set_insts_mark(); 418 bl(entry); 419 } 420 } 421 422 void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) { 423 assert(ReservedCodeCacheSize < 4*G, "branch out of range"); 424 assert(CodeCache::find_blob(entry.target()) != NULL, 425 "destination of far call not found in code cache"); 426 if (far_branches()) { 427 unsigned long offset; 428 // We can use ADRP here because we know that the total size of 429 // the code cache cannot exceed 2Gb. 
430 adrp(tmp, entry, offset); 431 add(tmp, tmp, offset); 432 if (cbuf) cbuf->set_insts_mark(); 433 br(tmp); 434 } else { 435 if (cbuf) cbuf->set_insts_mark(); 436 b(entry); 437 } 438 } 439 440 void MacroAssembler::reserved_stack_check() { 441 // testing if reserved zone needs to be enabled 442 Label no_reserved_zone_enabling; 443 444 ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset())); 445 cmp(sp, rscratch1); 446 br(Assembler::LO, no_reserved_zone_enabling); 447 448 enter(); // LR and FP are live. 449 lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone)); 450 mov(c_rarg0, rthread); 451 blr(rscratch1); 452 leave(); 453 454 // We have already removed our own frame. 455 // throw_delayed_StackOverflowError will think that it's been 456 // called by our caller. 457 lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry())); 458 br(rscratch1); 459 should_not_reach_here(); 460 461 bind(no_reserved_zone_enabling); 462 } 463 464 int MacroAssembler::biased_locking_enter(Register lock_reg, 465 Register obj_reg, 466 Register swap_reg, 467 Register tmp_reg, 468 bool swap_reg_contains_mark, 469 Label& done, 470 Label* slow_case, 471 BiasedLockingCounters* counters) { 472 assert(UseBiasedLocking, "why call this otherwise?"); 473 assert_different_registers(lock_reg, obj_reg, swap_reg); 474 475 if (PrintBiasedLockingStatistics && counters == NULL) 476 counters = BiasedLocking::counters(); 477 478 assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg); 479 assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout"); 480 Address mark_addr (obj_reg, oopDesc::mark_offset_in_bytes()); 481 Address klass_addr (obj_reg, oopDesc::klass_offset_in_bytes()); 482 Address saved_mark_addr(lock_reg, 0); 483 484 // Biased locking 485 // See whether the lock is currently biased toward our 
thread and 486 // whether the epoch is still valid 487 // Note that the runtime guarantees sufficient alignment of JavaThread 488 // pointers to allow age to be placed into low bits 489 // First check to see whether biasing is even enabled for this object 490 Label cas_label; 491 int null_check_offset = -1; 492 if (!swap_reg_contains_mark) { 493 null_check_offset = offset(); 494 ldr(swap_reg, mark_addr); 495 } 496 andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place); 497 cmp(tmp_reg, markOopDesc::biased_lock_pattern); 498 br(Assembler::NE, cas_label); 499 // The bias pattern is present in the object's header. Need to check 500 // whether the bias owner and the epoch are both still current. 501 load_prototype_header(tmp_reg, obj_reg); 502 orr(tmp_reg, tmp_reg, rthread); 503 eor(tmp_reg, swap_reg, tmp_reg); 504 andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place)); 505 if (counters != NULL) { 506 Label around; 507 cbnz(tmp_reg, around); 508 atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2); 509 b(done); 510 bind(around); 511 } else { 512 cbz(tmp_reg, done); 513 } 514 515 Label try_revoke_bias; 516 Label try_rebias; 517 518 // At this point we know that the header has the bias pattern and 519 // that we are not the bias owner in the current epoch. We need to 520 // figure out more details about the state of the header in order to 521 // know what operations can be legally performed on the object's 522 // header. 523 524 // If the low three bits in the xor result aren't clear, that means 525 // the prototype header is no longer biased and we have to revoke 526 // the bias on this object. 527 andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place); 528 cbnz(rscratch1, try_revoke_bias); 529 530 // Biasing is still enabled for this data type. 
See whether the 531 // epoch of the current bias is still valid, meaning that the epoch 532 // bits of the mark word are equal to the epoch bits of the 533 // prototype header. (Note that the prototype header's epoch bits 534 // only change at a safepoint.) If not, attempt to rebias the object 535 // toward the current thread. Note that we must be absolutely sure 536 // that the current epoch is invalid in order to do this because 537 // otherwise the manipulations it performs on the mark word are 538 // illegal. 539 andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place); 540 cbnz(rscratch1, try_rebias); 541 542 // The epoch of the current bias is still valid but we know nothing 543 // about the owner; it might be set or it might be clear. Try to 544 // acquire the bias of the object using an atomic operation. If this 545 // fails we will go in to the runtime to revoke the object's bias. 546 // Note that we first construct the presumed unbiased header so we 547 // don't accidentally blow away another thread's valid bias. 548 { 549 Label here; 550 mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place); 551 andr(swap_reg, swap_reg, rscratch1); 552 orr(tmp_reg, swap_reg, rthread); 553 cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case); 554 // If the biasing toward our thread failed, this means that 555 // another thread succeeded in biasing it toward itself and we 556 // need to revoke that bias. The revocation will occur in the 557 // interpreter runtime in the slow case. 558 bind(here); 559 if (counters != NULL) { 560 atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()), 561 tmp_reg, rscratch1, rscratch2); 562 } 563 } 564 b(done); 565 566 bind(try_rebias); 567 // At this point we know the epoch has expired, meaning that the 568 // current "bias owner", if any, is actually invalid. 
Under these 569 // circumstances _only_, we are allowed to use the current header's 570 // value as the comparison value when doing the cas to acquire the 571 // bias in the current epoch. In other words, we allow transfer of 572 // the bias from one thread to another directly in this situation. 573 // 574 // FIXME: due to a lack of registers we currently blow away the age 575 // bits in this situation. Should attempt to preserve them. 576 { 577 Label here; 578 load_prototype_header(tmp_reg, obj_reg); 579 orr(tmp_reg, rthread, tmp_reg); 580 cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case); 581 // If the biasing toward our thread failed, then another thread 582 // succeeded in biasing it toward itself and we need to revoke that 583 // bias. The revocation will occur in the runtime in the slow case. 584 bind(here); 585 if (counters != NULL) { 586 atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()), 587 tmp_reg, rscratch1, rscratch2); 588 } 589 } 590 b(done); 591 592 bind(try_revoke_bias); 593 // The prototype mark in the klass doesn't have the bias bit set any 594 // more, indicating that objects of this data type are not supposed 595 // to be biased any more. We are going to try to reset the mark of 596 // this object to the prototype value and fall through to the 597 // CAS-based locking scheme. Note that if our CAS fails, it means 598 // that another thread raced us for the privilege of revoking the 599 // bias of this particular object, so it's okay to continue in the 600 // normal locking code. 601 // 602 // FIXME: due to a lack of registers we currently blow away the age 603 // bits in this situation. Should attempt to preserve them. 
604 { 605 Label here, nope; 606 load_prototype_header(tmp_reg, obj_reg); 607 cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope); 608 bind(here); 609 610 // Fall through to the normal CAS-based lock, because no matter what 611 // the result of the above CAS, some thread must have succeeded in 612 // removing the bias bit from the object's header. 613 if (counters != NULL) { 614 atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg, 615 rscratch1, rscratch2); 616 } 617 bind(nope); 618 } 619 620 bind(cas_label); 621 622 return null_check_offset; 623 } 624 625 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) { 626 assert(UseBiasedLocking, "why call this otherwise?"); 627 628 // Check for biased locking unlock case, which is a no-op 629 // Note: we do not have to check the thread ID for two reasons. 630 // First, the interpreter checks for IllegalMonitorStateException at 631 // a higher level. Second, if the bias was revoked while we held the 632 // lock, the object could not be rebiased toward another thread, so 633 // the bias bit would be clear. 
634 ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 635 andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 636 cmp(temp_reg, markOopDesc::biased_lock_pattern); 637 br(Assembler::EQ, done); 638 } 639 640 static void pass_arg0(MacroAssembler* masm, Register arg) { 641 if (c_rarg0 != arg ) { 642 masm->mov(c_rarg0, arg); 643 } 644 } 645 646 static void pass_arg1(MacroAssembler* masm, Register arg) { 647 if (c_rarg1 != arg ) { 648 masm->mov(c_rarg1, arg); 649 } 650 } 651 652 static void pass_arg2(MacroAssembler* masm, Register arg) { 653 if (c_rarg2 != arg ) { 654 masm->mov(c_rarg2, arg); 655 } 656 } 657 658 static void pass_arg3(MacroAssembler* masm, Register arg) { 659 if (c_rarg3 != arg ) { 660 masm->mov(c_rarg3, arg); 661 } 662 } 663 664 void MacroAssembler::call_VM_base(Register oop_result, 665 Register java_thread, 666 Register last_java_sp, 667 address entry_point, 668 int number_of_arguments, 669 bool check_exceptions) { 670 // determine java_thread register 671 if (!java_thread->is_valid()) { 672 java_thread = rthread; 673 } 674 675 // determine last_java_sp register 676 if (!last_java_sp->is_valid()) { 677 last_java_sp = esp; 678 } 679 680 // debugging support 681 assert(number_of_arguments >= 0 , "cannot have negative number of arguments"); 682 assert(java_thread == rthread, "unexpected register"); 683 #ifdef ASSERT 684 // TraceBytecodes does not use r12 but saves it over the call, so don't verify 685 // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?"); 686 #endif // ASSERT 687 688 assert(java_thread != oop_result , "cannot use the same register for java_thread & oop_result"); 689 assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp"); 690 691 // push java thread (becomes first argument of C function) 692 693 mov(c_rarg0, java_thread); 694 695 // set last Java frame before call 696 assert(last_java_sp != 
rfp, "can't use rfp"); 697 698 Label l; 699 set_last_Java_frame(last_java_sp, rfp, l, rscratch1); 700 701 // do the call, remove parameters 702 MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l); 703 704 // reset last Java frame 705 // Only interpreter should have to clear fp 706 reset_last_Java_frame(true); 707 708 // C++ interp handles this in the interpreter 709 check_and_handle_popframe(java_thread); 710 check_and_handle_earlyret(java_thread); 711 712 if (check_exceptions) { 713 // check for pending exceptions (java_thread is set upon return) 714 ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset()))); 715 Label ok; 716 cbz(rscratch1, ok); 717 lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry())); 718 br(rscratch1); 719 bind(ok); 720 } 721 722 // get oop result if there is one and reset the value in the thread 723 if (oop_result->is_valid()) { 724 get_vm_result(oop_result, java_thread); 725 } 726 } 727 728 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) { 729 call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions); 730 } 731 732 // Maybe emit a call via a trampoline. If the code cache is small 733 // trampolines won't be emitted. 
734 735 address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) { 736 assert(JavaThread::current()->is_Compiler_thread(), "just checking"); 737 assert(entry.rspec().type() == relocInfo::runtime_call_type 738 || entry.rspec().type() == relocInfo::opt_virtual_call_type 739 || entry.rspec().type() == relocInfo::static_call_type 740 || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type"); 741 742 unsigned int start_offset = offset(); 743 if (far_branches() && !Compile::current()->in_scratch_emit_size()) { 744 address stub = emit_trampoline_stub(start_offset, entry.target()); 745 if (stub == NULL) { 746 return NULL; // CodeCache is full 747 } 748 } 749 750 if (cbuf) cbuf->set_insts_mark(); 751 relocate(entry.rspec()); 752 if (!far_branches()) { 753 bl(entry.target()); 754 } else { 755 bl(pc()); 756 } 757 // just need to return a non-null address 758 return pc(); 759 } 760 761 762 // Emit a trampoline stub for a call to a target which is too far away. 763 // 764 // code sequences: 765 // 766 // call-site: 767 // branch-and-link to <destination> or <trampoline stub> 768 // 769 // Related trampoline stub for this call site in the stub section: 770 // load the call target from the constant pool 771 // branch (LR still points to the call site above) 772 773 address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset, 774 address dest) { 775 address stub = start_a_stub(Compile::MAX_stubs_size/2); 776 if (stub == NULL) { 777 return NULL; // CodeBuffer::expand failed 778 } 779 780 // Create a trampoline stub relocation which relates this trampoline stub 781 // with the call instruction at insts_call_instruction_offset in the 782 // instructions code-section. 
783 align(wordSize); 784 relocate(trampoline_stub_Relocation::spec(code()->insts()->start() 785 + insts_call_instruction_offset)); 786 const int stub_start_offset = offset(); 787 788 // Now, create the trampoline stub's code: 789 // - load the call 790 // - call 791 Label target; 792 ldr(rscratch1, target); 793 br(rscratch1); 794 bind(target); 795 assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset, 796 "should be"); 797 emit_int64((int64_t)dest); 798 799 const address stub_start_addr = addr_at(stub_start_offset); 800 801 assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline"); 802 803 end_a_stub(); 804 return stub_start_addr; 805 } 806 807 address MacroAssembler::ic_call(address entry, jint method_index) { 808 RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index); 809 // address const_ptr = long_constant((jlong)Universe::non_oop_word()); 810 // unsigned long offset; 811 // ldr_constant(rscratch2, const_ptr); 812 movptr(rscratch2, (uintptr_t)Universe::non_oop_word()); 813 return trampoline_call(Address(entry, rh)); 814 } 815 816 // Implementation of call_VM versions 817 818 void MacroAssembler::call_VM(Register oop_result, 819 address entry_point, 820 bool check_exceptions) { 821 call_VM_helper(oop_result, entry_point, 0, check_exceptions); 822 } 823 824 void MacroAssembler::call_VM(Register oop_result, 825 address entry_point, 826 Register arg_1, 827 bool check_exceptions) { 828 pass_arg1(this, arg_1); 829 call_VM_helper(oop_result, entry_point, 1, check_exceptions); 830 } 831 832 void MacroAssembler::call_VM(Register oop_result, 833 address entry_point, 834 Register arg_1, 835 Register arg_2, 836 bool check_exceptions) { 837 assert(arg_1 != c_rarg2, "smashed arg"); 838 pass_arg2(this, arg_2); 839 pass_arg1(this, arg_1); 840 call_VM_helper(oop_result, entry_point, 2, check_exceptions); 841 } 842 843 void MacroAssembler::call_VM(Register oop_result, 844 address entry_point, 845 Register 
arg_1, 846 Register arg_2, 847 Register arg_3, 848 bool check_exceptions) { 849 assert(arg_1 != c_rarg3, "smashed arg"); 850 assert(arg_2 != c_rarg3, "smashed arg"); 851 pass_arg3(this, arg_3); 852 853 assert(arg_1 != c_rarg2, "smashed arg"); 854 pass_arg2(this, arg_2); 855 856 pass_arg1(this, arg_1); 857 call_VM_helper(oop_result, entry_point, 3, check_exceptions); 858 } 859 860 void MacroAssembler::call_VM(Register oop_result, 861 Register last_java_sp, 862 address entry_point, 863 int number_of_arguments, 864 bool check_exceptions) { 865 call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions); 866 } 867 868 void MacroAssembler::call_VM(Register oop_result, 869 Register last_java_sp, 870 address entry_point, 871 Register arg_1, 872 bool check_exceptions) { 873 pass_arg1(this, arg_1); 874 call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions); 875 } 876 877 void MacroAssembler::call_VM(Register oop_result, 878 Register last_java_sp, 879 address entry_point, 880 Register arg_1, 881 Register arg_2, 882 bool check_exceptions) { 883 884 assert(arg_1 != c_rarg2, "smashed arg"); 885 pass_arg2(this, arg_2); 886 pass_arg1(this, arg_1); 887 call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions); 888 } 889 890 void MacroAssembler::call_VM(Register oop_result, 891 Register last_java_sp, 892 address entry_point, 893 Register arg_1, 894 Register arg_2, 895 Register arg_3, 896 bool check_exceptions) { 897 assert(arg_1 != c_rarg3, "smashed arg"); 898 assert(arg_2 != c_rarg3, "smashed arg"); 899 pass_arg3(this, arg_3); 900 assert(arg_1 != c_rarg2, "smashed arg"); 901 pass_arg2(this, arg_2); 902 pass_arg1(this, arg_1); 903 call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions); 904 } 905 906 907 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) { 908 ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset())); 909 str(zr, Address(java_thread, 
JavaThread::vm_result_offset())); 910 verify_oop(oop_result, "broken oop in call_VM_base"); 911 } 912 913 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) { 914 ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset())); 915 str(zr, Address(java_thread, JavaThread::vm_result_2_offset())); 916 } 917 918 void MacroAssembler::align(int modulus) { 919 while (offset() % modulus != 0) nop(); 920 } 921 922 // these are no-ops overridden by InterpreterMacroAssembler 923 924 void MacroAssembler::check_and_handle_earlyret(Register java_thread) { } 925 926 void MacroAssembler::check_and_handle_popframe(Register java_thread) { } 927 928 929 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr, 930 Register tmp, 931 int offset) { 932 intptr_t value = *delayed_value_addr; 933 if (value != 0) 934 return RegisterOrConstant(value + offset); 935 936 // load indirectly to solve generation ordering problem 937 ldr(tmp, ExternalAddress((address) delayed_value_addr)); 938 939 if (offset != 0) 940 add(tmp, tmp, offset); 941 942 return RegisterOrConstant(tmp); 943 } 944 945 946 void MacroAssembler:: notify(int type) { 947 if (type == bytecode_start) { 948 // set_last_Java_frame(esp, rfp, (address)NULL); 949 Assembler:: notify(type); 950 // reset_last_Java_frame(true); 951 } 952 else 953 Assembler:: notify(type); 954 } 955 956 // Look up the method for a megamorphic invokeinterface call. 957 // The target method is determined by <intf_klass, itable_index>. 958 // The receiver klass is in recv_klass. 959 // On success, the result will be in method_result, and execution falls through. 960 // On failure, execution transfers to the given label. 
961 void MacroAssembler::lookup_interface_method(Register recv_klass, 962 Register intf_klass, 963 RegisterOrConstant itable_index, 964 Register method_result, 965 Register scan_temp, 966 Label& L_no_such_interface, 967 bool return_method) { 968 assert_different_registers(recv_klass, intf_klass, scan_temp); 969 assert_different_registers(method_result, intf_klass, scan_temp); 970 assert(recv_klass != method_result || !return_method, 971 "recv_klass can be destroyed when method isn't needed"); 972 assert(itable_index.is_constant() || itable_index.as_register() == method_result, 973 "caller must use same register for non-constant itable index as for method"); 974 975 // Compute start of first itableOffsetEntry (which is at the end of the vtable) 976 int vtable_base = in_bytes(Klass::vtable_start_offset()); 977 int itentry_off = itableMethodEntry::method_offset_in_bytes(); 978 int scan_step = itableOffsetEntry::size() * wordSize; 979 int vte_size = vtableEntry::size_in_bytes(); 980 assert(vte_size == wordSize, "else adjust times_vte_scale"); 981 982 ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset())); 983 984 // %%% Could store the aligned, prescaled offset in the klassoop. 985 // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base)); 986 lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3))); 987 add(scan_temp, scan_temp, vtable_base); 988 989 if (return_method) { 990 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 
991 assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 992 // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off)); 993 lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3))); 994 if (itentry_off) 995 add(recv_klass, recv_klass, itentry_off); 996 } 997 998 // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) { 999 // if (scan->interface() == intf) { 1000 // result = (klass + scan->offset() + itable_index); 1001 // } 1002 // } 1003 Label search, found_method; 1004 1005 for (int peel = 1; peel >= 0; peel--) { 1006 ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes())); 1007 cmp(intf_klass, method_result); 1008 1009 if (peel) { 1010 br(Assembler::EQ, found_method); 1011 } else { 1012 br(Assembler::NE, search); 1013 // (invert the test to fall through to found_method...) 1014 } 1015 1016 if (!peel) break; 1017 1018 bind(search); 1019 1020 // Check that the previous entry is non-null. A null entry means that 1021 // the receiver class doesn't implement the interface, and wasn't the 1022 // same as when the caller was compiled. 1023 cbz(method_result, L_no_such_interface); 1024 add(scan_temp, scan_temp, scan_step); 1025 } 1026 1027 bind(found_method); 1028 1029 // Got a hit. 
1030 if (return_method) { 1031 ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes())); 1032 ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0))); 1033 } 1034 } 1035 1036 // virtual method calling 1037 void MacroAssembler::lookup_virtual_method(Register recv_klass, 1038 RegisterOrConstant vtable_index, 1039 Register method_result) { 1040 const int base = in_bytes(Klass::vtable_start_offset()); 1041 assert(vtableEntry::size() * wordSize == 8, 1042 "adjust the scaling in the code below"); 1043 int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes(); 1044 1045 if (vtable_index.is_register()) { 1046 lea(method_result, Address(recv_klass, 1047 vtable_index.as_register(), 1048 Address::lsl(LogBytesPerWord))); 1049 ldr(method_result, Address(method_result, vtable_offset_in_bytes)); 1050 } else { 1051 vtable_offset_in_bytes += vtable_index.as_constant() * wordSize; 1052 ldr(method_result, 1053 form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0)); 1054 } 1055 } 1056 1057 void MacroAssembler::check_klass_subtype(Register sub_klass, 1058 Register super_klass, 1059 Register temp_reg, 1060 Label& L_success) { 1061 Label L_failure; 1062 check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL); 1063 check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL); 1064 bind(L_failure); 1065 } 1066 1067 1068 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 1069 Register super_klass, 1070 Register temp_reg, 1071 Label* L_success, 1072 Label* L_failure, 1073 Label* L_slow_path, 1074 RegisterOrConstant super_check_offset) { 1075 assert_different_registers(sub_klass, super_klass, temp_reg); 1076 bool must_load_sco = (super_check_offset.constant_or_zero() == -1); 1077 if (super_check_offset.is_register()) { 1078 assert_different_registers(sub_klass, super_klass, 1079 super_check_offset.as_register()); 1080 } else if (must_load_sco) { 
1081 assert(temp_reg != noreg, "supply either a temp or a register offset"); 1082 } 1083 1084 Label L_fallthrough; 1085 int label_nulls = 0; 1086 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 1087 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 1088 if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; } 1089 assert(label_nulls <= 1, "at most one NULL in the batch"); 1090 1091 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1092 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1093 Address super_check_offset_addr(super_klass, sco_offset); 1094 1095 // Hacked jmp, which may only be used just before L_fallthrough. 1096 #define final_jmp(label) \ 1097 if (&(label) == &L_fallthrough) { /*do nothing*/ } \ 1098 else b(label) /*omit semi*/ 1099 1100 // If the pointers are equal, we are done (e.g., String[] elements). 1101 // This self-check enables sharing of secondary supertype arrays among 1102 // non-primary types such as array-of-interface. Otherwise, each such 1103 // type would need its own customized SSA. 1104 // We move this check to the front of the fast path because many 1105 // type checks are in fact trivially successful in this manner, 1106 // so we get a nicely predicted branch right at the start of the check. 1107 cmp(sub_klass, super_klass); 1108 br(Assembler::EQ, *L_success); 1109 1110 // Check the supertype display: 1111 if (must_load_sco) { 1112 ldrw(temp_reg, super_check_offset_addr); 1113 super_check_offset = RegisterOrConstant(temp_reg); 1114 } 1115 Address super_check_addr(sub_klass, super_check_offset); 1116 ldr(rscratch1, super_check_addr); 1117 cmp(super_klass, rscratch1); // load displayed supertype 1118 1119 // This check has worked decisively for primary supers. 1120 // Secondary supers are sought in the super_cache ('super_cache_addr'). 1121 // (Secondary supers are interfaces and very deeply nested subtypes.) 
1122 // This works in the same check above because of a tricky aliasing 1123 // between the super_cache and the primary super display elements. 1124 // (The 'super_check_addr' can address either, as the case requires.) 1125 // Note that the cache is updated below if it does not help us find 1126 // what we need immediately. 1127 // So if it was a primary super, we can just fail immediately. 1128 // Otherwise, it's the slow path for us (no success at this point). 1129 1130 if (super_check_offset.is_register()) { 1131 br(Assembler::EQ, *L_success); 1132 cmp(super_check_offset.as_register(), sc_offset); 1133 if (L_failure == &L_fallthrough) { 1134 br(Assembler::EQ, *L_slow_path); 1135 } else { 1136 br(Assembler::NE, *L_failure); 1137 final_jmp(*L_slow_path); 1138 } 1139 } else if (super_check_offset.as_constant() == sc_offset) { 1140 // Need a slow path; fast failure is impossible. 1141 if (L_slow_path == &L_fallthrough) { 1142 br(Assembler::EQ, *L_success); 1143 } else { 1144 br(Assembler::NE, *L_slow_path); 1145 final_jmp(*L_success); 1146 } 1147 } else { 1148 // No slow path; it's a fast decision. 
1149 if (L_failure == &L_fallthrough) { 1150 br(Assembler::EQ, *L_success); 1151 } else { 1152 br(Assembler::NE, *L_failure); 1153 final_jmp(*L_success); 1154 } 1155 } 1156 1157 bind(L_fallthrough); 1158 1159 #undef final_jmp 1160 } 1161 1162 // These two are taken from x86, but they look generally useful 1163 1164 // scans count pointer sized words at [addr] for occurence of value, 1165 // generic 1166 void MacroAssembler::repne_scan(Register addr, Register value, Register count, 1167 Register scratch) { 1168 Label Lloop, Lexit; 1169 cbz(count, Lexit); 1170 bind(Lloop); 1171 ldr(scratch, post(addr, wordSize)); 1172 cmp(value, scratch); 1173 br(EQ, Lexit); 1174 sub(count, count, 1); 1175 cbnz(count, Lloop); 1176 bind(Lexit); 1177 } 1178 1179 // scans count 4 byte words at [addr] for occurence of value, 1180 // generic 1181 void MacroAssembler::repne_scanw(Register addr, Register value, Register count, 1182 Register scratch) { 1183 Label Lloop, Lexit; 1184 cbz(count, Lexit); 1185 bind(Lloop); 1186 ldrw(scratch, post(addr, wordSize)); 1187 cmpw(value, scratch); 1188 br(EQ, Lexit); 1189 sub(count, count, 1); 1190 cbnz(count, Lloop); 1191 bind(Lexit); 1192 } 1193 1194 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 1195 Register super_klass, 1196 Register temp_reg, 1197 Register temp2_reg, 1198 Label* L_success, 1199 Label* L_failure, 1200 bool set_cond_codes) { 1201 assert_different_registers(sub_klass, super_klass, temp_reg); 1202 if (temp2_reg != noreg) 1203 assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1); 1204 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg) 1205 1206 Label L_fallthrough; 1207 int label_nulls = 0; 1208 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 1209 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 1210 assert(label_nulls <= 1, "at most one NULL in the batch"); 1211 1212 // a couple of useful fields in sub_klass: 1213 int 
ss_offset = in_bytes(Klass::secondary_supers_offset()); 1214 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1215 Address secondary_supers_addr(sub_klass, ss_offset); 1216 Address super_cache_addr( sub_klass, sc_offset); 1217 1218 BLOCK_COMMENT("check_klass_subtype_slow_path"); 1219 1220 // Do a linear scan of the secondary super-klass chain. 1221 // This code is rarely used, so simplicity is a virtue here. 1222 // The repne_scan instruction uses fixed registers, which we must spill. 1223 // Don't worry too much about pre-existing connections with the input regs. 1224 1225 assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super) 1226 assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter) 1227 1228 RegSet pushed_registers; 1229 if (!IS_A_TEMP(r2)) pushed_registers += r2; 1230 if (!IS_A_TEMP(r5)) pushed_registers += r5; 1231 1232 if (super_klass != r0 || UseCompressedOops) { 1233 if (!IS_A_TEMP(r0)) pushed_registers += r0; 1234 } 1235 1236 push(pushed_registers, sp); 1237 1238 // Get super_klass value into r0 (even if it was in r5 or r2). 1239 if (super_klass != r0) { 1240 mov(r0, super_klass); 1241 } 1242 1243 #ifndef PRODUCT 1244 mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr); 1245 Address pst_counter_addr(rscratch2); 1246 ldr(rscratch1, pst_counter_addr); 1247 add(rscratch1, rscratch1, 1); 1248 str(rscratch1, pst_counter_addr); 1249 #endif //PRODUCT 1250 1251 // We will consult the secondary-super array. 1252 ldr(r5, secondary_supers_addr); 1253 // Load the array length. 1254 ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes())); 1255 // Skip to start of data. 1256 add(r5, r5, Array<Klass*>::base_offset_in_bytes()); 1257 1258 cmp(sp, zr); // Clear Z flag; SP is never zero 1259 // Scan R2 words at [R5] for an occurrence of R0. 1260 // Set NZ/Z based on last compare. 1261 repne_scan(r5, r0, r2, rscratch1); 1262 1263 // Unspill the temp. 
registers: 1264 pop(pushed_registers, sp); 1265 1266 br(Assembler::NE, *L_failure); 1267 1268 // Success. Cache the super we found and proceed in triumph. 1269 str(super_klass, super_cache_addr); 1270 1271 if (L_success != &L_fallthrough) { 1272 b(*L_success); 1273 } 1274 1275 #undef IS_A_TEMP 1276 1277 bind(L_fallthrough); 1278 } 1279 1280 1281 void MacroAssembler::verify_oop(Register reg, const char* s) { 1282 if (!VerifyOops) return; 1283 1284 // Pass register number to verify_oop_subroutine 1285 const char* b = NULL; 1286 { 1287 ResourceMark rm; 1288 stringStream ss; 1289 ss.print("verify_oop: %s: %s", reg->name(), s); 1290 b = code_string(ss.as_string()); 1291 } 1292 BLOCK_COMMENT("verify_oop {"); 1293 1294 stp(r0, rscratch1, Address(pre(sp, -2 * wordSize))); 1295 stp(rscratch2, lr, Address(pre(sp, -2 * wordSize))); 1296 1297 mov(r0, reg); 1298 mov(rscratch1, (address)b); 1299 1300 // call indirectly to solve generation ordering problem 1301 lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address())); 1302 ldr(rscratch2, Address(rscratch2)); 1303 blr(rscratch2); 1304 1305 ldp(rscratch2, lr, Address(post(sp, 2 * wordSize))); 1306 ldp(r0, rscratch1, Address(post(sp, 2 * wordSize))); 1307 1308 BLOCK_COMMENT("} verify_oop"); 1309 } 1310 1311 void MacroAssembler::verify_oop_addr(Address addr, const char* s) { 1312 if (!VerifyOops) return; 1313 1314 const char* b = NULL; 1315 { 1316 ResourceMark rm; 1317 stringStream ss; 1318 ss.print("verify_oop_addr: %s", s); 1319 b = code_string(ss.as_string()); 1320 } 1321 BLOCK_COMMENT("verify_oop_addr {"); 1322 1323 stp(r0, rscratch1, Address(pre(sp, -2 * wordSize))); 1324 stp(rscratch2, lr, Address(pre(sp, -2 * wordSize))); 1325 1326 // addr may contain sp so we will have to adjust it based on the 1327 // pushes that we just did. 
1328 if (addr.uses(sp)) { 1329 lea(r0, addr); 1330 ldr(r0, Address(r0, 4 * wordSize)); 1331 } else { 1332 ldr(r0, addr); 1333 } 1334 mov(rscratch1, (address)b); 1335 1336 // call indirectly to solve generation ordering problem 1337 lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address())); 1338 ldr(rscratch2, Address(rscratch2)); 1339 blr(rscratch2); 1340 1341 ldp(rscratch2, lr, Address(post(sp, 2 * wordSize))); 1342 ldp(r0, rscratch1, Address(post(sp, 2 * wordSize))); 1343 1344 BLOCK_COMMENT("} verify_oop_addr"); 1345 } 1346 1347 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot, 1348 int extra_slot_offset) { 1349 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 1350 int stackElementSize = Interpreter::stackElementSize; 1351 int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0); 1352 #ifdef ASSERT 1353 int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1); 1354 assert(offset1 - offset == stackElementSize, "correct arithmetic"); 1355 #endif 1356 if (arg_slot.is_constant()) { 1357 return Address(esp, arg_slot.as_constant() * stackElementSize 1358 + offset); 1359 } else { 1360 add(rscratch1, esp, arg_slot.as_register(), 1361 ext::uxtx, exact_log2(stackElementSize)); 1362 return Address(rscratch1, offset); 1363 } 1364 } 1365 1366 void MacroAssembler::call_VM_leaf_base(address entry_point, 1367 int number_of_arguments, 1368 Label *retaddr) { 1369 call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr); 1370 } 1371 1372 void MacroAssembler::call_VM_leaf_base1(address entry_point, 1373 int number_of_gp_arguments, 1374 int number_of_fp_arguments, 1375 ret_type type, 1376 Label *retaddr) { 1377 Label E, L; 1378 1379 stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize))); 1380 1381 // We add 1 to number_of_arguments because the thread in arg0 is 1382 // not counted 1383 mov(rscratch1, entry_point); 1384 blrt(rscratch1, number_of_gp_arguments + 1, 
number_of_fp_arguments, type); 1385 if (retaddr) 1386 bind(*retaddr); 1387 1388 ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize))); 1389 maybe_isb(); 1390 } 1391 1392 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) { 1393 call_VM_leaf_base(entry_point, number_of_arguments); 1394 } 1395 1396 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) { 1397 pass_arg0(this, arg_0); 1398 call_VM_leaf_base(entry_point, 1); 1399 } 1400 1401 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) { 1402 pass_arg0(this, arg_0); 1403 pass_arg1(this, arg_1); 1404 call_VM_leaf_base(entry_point, 2); 1405 } 1406 1407 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, 1408 Register arg_1, Register arg_2) { 1409 pass_arg0(this, arg_0); 1410 pass_arg1(this, arg_1); 1411 pass_arg2(this, arg_2); 1412 call_VM_leaf_base(entry_point, 3); 1413 } 1414 1415 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) { 1416 pass_arg0(this, arg_0); 1417 MacroAssembler::call_VM_leaf_base(entry_point, 1); 1418 } 1419 1420 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) { 1421 1422 assert(arg_0 != c_rarg1, "smashed arg"); 1423 pass_arg1(this, arg_1); 1424 pass_arg0(this, arg_0); 1425 MacroAssembler::call_VM_leaf_base(entry_point, 2); 1426 } 1427 1428 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) { 1429 assert(arg_0 != c_rarg2, "smashed arg"); 1430 assert(arg_1 != c_rarg2, "smashed arg"); 1431 pass_arg2(this, arg_2); 1432 assert(arg_0 != c_rarg1, "smashed arg"); 1433 pass_arg1(this, arg_1); 1434 pass_arg0(this, arg_0); 1435 MacroAssembler::call_VM_leaf_base(entry_point, 3); 1436 } 1437 1438 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) { 1439 assert(arg_0 != c_rarg3, "smashed arg"); 1440 
assert(arg_1 != c_rarg3, "smashed arg"); 1441 assert(arg_2 != c_rarg3, "smashed arg"); 1442 pass_arg3(this, arg_3); 1443 assert(arg_0 != c_rarg2, "smashed arg"); 1444 assert(arg_1 != c_rarg2, "smashed arg"); 1445 pass_arg2(this, arg_2); 1446 assert(arg_0 != c_rarg1, "smashed arg"); 1447 pass_arg1(this, arg_1); 1448 pass_arg0(this, arg_0); 1449 MacroAssembler::call_VM_leaf_base(entry_point, 4); 1450 } 1451 1452 void MacroAssembler::null_check(Register reg, int offset) { 1453 if (needs_explicit_null_check(offset)) { 1454 // provoke OS NULL exception if reg = NULL by 1455 // accessing M[reg] w/o changing any registers 1456 // NOTE: this is plenty to provoke a segv 1457 ldr(zr, Address(reg)); 1458 } else { 1459 // nothing to do, (later) access of M[reg + offset] 1460 // will provoke OS NULL exception if reg = NULL 1461 } 1462 } 1463 1464 // MacroAssembler protected routines needed to implement 1465 // public methods 1466 1467 void MacroAssembler::mov(Register r, Address dest) { 1468 code_section()->relocate(pc(), dest.rspec()); 1469 u_int64_t imm64 = (u_int64_t)dest.target(); 1470 movptr(r, imm64); 1471 } 1472 1473 // Move a constant pointer into r. In AArch64 mode the virtual 1474 // address space is 48 bits in size, so we only need three 1475 // instructions to create a patchable instruction sequence that can 1476 // reach anywhere. 1477 void MacroAssembler::movptr(Register r, uintptr_t imm64) { 1478 #ifndef PRODUCT 1479 { 1480 char buffer[64]; 1481 snprintf(buffer, sizeof(buffer), "0x%"PRIX64, imm64); 1482 block_comment(buffer); 1483 } 1484 #endif 1485 assert(imm64 < (1ul << 48), "48-bit overflow in address constant"); 1486 movz(r, imm64 & 0xffff); 1487 imm64 >>= 16; 1488 movk(r, imm64 & 0xffff, 16); 1489 imm64 >>= 16; 1490 movk(r, imm64 & 0xffff, 32); 1491 } 1492 1493 // Macro to mov replicated immediate to vector register. 
1494 // Vd will get the following values for different arrangements in T 1495 // imm32 == hex 000000gh T8B: Vd = ghghghghghghghgh 1496 // imm32 == hex 000000gh T16B: Vd = ghghghghghghghghghghghghghghghgh 1497 // imm32 == hex 0000efgh T4H: Vd = efghefghefghefgh 1498 // imm32 == hex 0000efgh T8H: Vd = efghefghefghefghefghefghefghefgh 1499 // imm32 == hex abcdefgh T2S: Vd = abcdefghabcdefgh 1500 // imm32 == hex abcdefgh T4S: Vd = abcdefghabcdefghabcdefghabcdefgh 1501 // T1D/T2D: invalid 1502 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) { 1503 assert(T != T1D && T != T2D, "invalid arrangement"); 1504 if (T == T8B || T == T16B) { 1505 assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)"); 1506 movi(Vd, T, imm32 & 0xff, 0); 1507 return; 1508 } 1509 u_int32_t nimm32 = ~imm32; 1510 if (T == T4H || T == T8H) { 1511 assert((imm32 & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)"); 1512 imm32 &= 0xffff; 1513 nimm32 &= 0xffff; 1514 } 1515 u_int32_t x = imm32; 1516 int movi_cnt = 0; 1517 int movn_cnt = 0; 1518 while (x) { if (x & 0xff) movi_cnt++; x >>= 8; } 1519 x = nimm32; 1520 while (x) { if (x & 0xff) movn_cnt++; x >>= 8; } 1521 if (movn_cnt < movi_cnt) imm32 = nimm32; 1522 unsigned lsl = 0; 1523 while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; } 1524 if (movn_cnt < movi_cnt) 1525 mvni(Vd, T, imm32 & 0xff, lsl); 1526 else 1527 movi(Vd, T, imm32 & 0xff, lsl); 1528 imm32 >>= 8; lsl += 8; 1529 while (imm32) { 1530 while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; } 1531 if (movn_cnt < movi_cnt) 1532 bici(Vd, T, imm32 & 0xff, lsl); 1533 else 1534 orri(Vd, T, imm32 & 0xff, lsl); 1535 lsl += 8; imm32 >>= 8; 1536 } 1537 } 1538 1539 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64) 1540 { 1541 #ifndef PRODUCT 1542 { 1543 char buffer[64]; 1544 snprintf(buffer, sizeof(buffer), "0x%"PRIX64, imm64); 1545 block_comment(buffer); 1546 } 1547 #endif 1548 if 
(operand_valid_for_logical_immediate(false, imm64)) { 1549 orr(dst, zr, imm64); 1550 } else { 1551 // we can use a combination of MOVZ or MOVN with 1552 // MOVK to build up the constant 1553 u_int64_t imm_h[4]; 1554 int zero_count = 0; 1555 int neg_count = 0; 1556 int i; 1557 for (i = 0; i < 4; i++) { 1558 imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL); 1559 if (imm_h[i] == 0) { 1560 zero_count++; 1561 } else if (imm_h[i] == 0xffffL) { 1562 neg_count++; 1563 } 1564 } 1565 if (zero_count == 4) { 1566 // one MOVZ will do 1567 movz(dst, 0); 1568 } else if (neg_count == 4) { 1569 // one MOVN will do 1570 movn(dst, 0); 1571 } else if (zero_count == 3) { 1572 for (i = 0; i < 4; i++) { 1573 if (imm_h[i] != 0L) { 1574 movz(dst, (u_int32_t)imm_h[i], (i << 4)); 1575 break; 1576 } 1577 } 1578 } else if (neg_count == 3) { 1579 // one MOVN will do 1580 for (int i = 0; i < 4; i++) { 1581 if (imm_h[i] != 0xffffL) { 1582 movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4)); 1583 break; 1584 } 1585 } 1586 } else if (zero_count == 2) { 1587 // one MOVZ and one MOVK will do 1588 for (i = 0; i < 3; i++) { 1589 if (imm_h[i] != 0L) { 1590 movz(dst, (u_int32_t)imm_h[i], (i << 4)); 1591 i++; 1592 break; 1593 } 1594 } 1595 for (;i < 4; i++) { 1596 if (imm_h[i] != 0L) { 1597 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1598 } 1599 } 1600 } else if (neg_count == 2) { 1601 // one MOVN and one MOVK will do 1602 for (i = 0; i < 4; i++) { 1603 if (imm_h[i] != 0xffffL) { 1604 movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4)); 1605 i++; 1606 break; 1607 } 1608 } 1609 for (;i < 4; i++) { 1610 if (imm_h[i] != 0xffffL) { 1611 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1612 } 1613 } 1614 } else if (zero_count == 1) { 1615 // one MOVZ and two MOVKs will do 1616 for (i = 0; i < 4; i++) { 1617 if (imm_h[i] != 0L) { 1618 movz(dst, (u_int32_t)imm_h[i], (i << 4)); 1619 i++; 1620 break; 1621 } 1622 } 1623 for (;i < 4; i++) { 1624 if (imm_h[i] != 0x0L) { 1625 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1626 } 1627 } 
1628 } else if (neg_count == 1) { 1629 // one MOVN and two MOVKs will do 1630 for (i = 0; i < 4; i++) { 1631 if (imm_h[i] != 0xffffL) { 1632 movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4)); 1633 i++; 1634 break; 1635 } 1636 } 1637 for (;i < 4; i++) { 1638 if (imm_h[i] != 0xffffL) { 1639 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1640 } 1641 } 1642 } else { 1643 // use a MOVZ and 3 MOVKs (makes it easier to debug) 1644 movz(dst, (u_int32_t)imm_h[0], 0); 1645 for (i = 1; i < 4; i++) { 1646 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1647 } 1648 } 1649 } 1650 } 1651 1652 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32) 1653 { 1654 #ifndef PRODUCT 1655 { 1656 char buffer[64]; 1657 snprintf(buffer, sizeof(buffer), "0x%"PRIX32, imm32); 1658 block_comment(buffer); 1659 } 1660 #endif 1661 if (operand_valid_for_logical_immediate(true, imm32)) { 1662 orrw(dst, zr, imm32); 1663 } else { 1664 // we can use MOVZ, MOVN or two calls to MOVK to build up the 1665 // constant 1666 u_int32_t imm_h[2]; 1667 imm_h[0] = imm32 & 0xffff; 1668 imm_h[1] = ((imm32 >> 16) & 0xffff); 1669 if (imm_h[0] == 0) { 1670 movzw(dst, imm_h[1], 16); 1671 } else if (imm_h[0] == 0xffff) { 1672 movnw(dst, imm_h[1] ^ 0xffff, 16); 1673 } else if (imm_h[1] == 0) { 1674 movzw(dst, imm_h[0], 0); 1675 } else if (imm_h[1] == 0xffff) { 1676 movnw(dst, imm_h[0] ^ 0xffff, 0); 1677 } else { 1678 // use a MOVZ and MOVK (makes it easier to debug) 1679 movzw(dst, imm_h[0], 0); 1680 movkw(dst, imm_h[1], 16); 1681 } 1682 } 1683 } 1684 1685 // Form an address from base + offset in Rd. Rd may or may 1686 // not actually be used: you must use the Address that is returned. 1687 // It is up to you to ensure that the shift provided matches the size 1688 // of your data. 
1689 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) { 1690 if (Address::offset_ok_for_immed(byte_offset, shift)) 1691 // It fits; no need for any heroics 1692 return Address(base, byte_offset); 1693 1694 // Don't do anything clever with negative or misaligned offsets 1695 unsigned mask = (1 << shift) - 1; 1696 if (byte_offset < 0 || byte_offset & mask) { 1697 mov(Rd, byte_offset); 1698 add(Rd, base, Rd); 1699 return Address(Rd); 1700 } 1701 1702 // See if we can do this with two 12-bit offsets 1703 { 1704 unsigned long word_offset = byte_offset >> shift; 1705 unsigned long masked_offset = word_offset & 0xfff000; 1706 if (Address::offset_ok_for_immed(word_offset - masked_offset) 1707 && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) { 1708 add(Rd, base, masked_offset << shift); 1709 word_offset -= masked_offset; 1710 return Address(Rd, word_offset << shift); 1711 } 1712 } 1713 1714 // Do it the hard way 1715 mov(Rd, byte_offset); 1716 add(Rd, base, Rd); 1717 return Address(Rd); 1718 } 1719 1720 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) { 1721 if (UseLSE) { 1722 mov(tmp, 1); 1723 ldadd(Assembler::word, tmp, zr, counter_addr); 1724 return; 1725 } 1726 Label retry_load; 1727 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) 1728 prfm(Address(counter_addr), PSTL1STRM); 1729 bind(retry_load); 1730 // flush and load exclusive from the memory location 1731 ldxrw(tmp, counter_addr); 1732 addw(tmp, tmp, 1); 1733 // if we store+flush with no intervening write tmp wil be zero 1734 stxrw(tmp2, tmp, counter_addr); 1735 cbnzw(tmp2, retry_load); 1736 } 1737 1738 1739 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb, 1740 bool want_remainder, Register scratch) 1741 { 1742 // Full implementation of Java idiv and irem. 
The function 1743 // returns the (pc) offset of the div instruction - may be needed 1744 // for implicit exceptions. 1745 // 1746 // constraint : ra/rb =/= scratch 1747 // normal case 1748 // 1749 // input : ra: dividend 1750 // rb: divisor 1751 // 1752 // result: either 1753 // quotient (= ra idiv rb) 1754 // remainder (= ra irem rb) 1755 1756 assert(ra != scratch && rb != scratch, "reg cannot be scratch"); 1757 1758 int idivl_offset = offset(); 1759 if (! want_remainder) { 1760 sdivw(result, ra, rb); 1761 } else { 1762 sdivw(scratch, ra, rb); 1763 Assembler::msubw(result, scratch, rb, ra); 1764 } 1765 1766 return idivl_offset; 1767 } 1768 1769 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb, 1770 bool want_remainder, Register scratch) 1771 { 1772 // Full implementation of Java ldiv and lrem. The function 1773 // returns the (pc) offset of the div instruction - may be needed 1774 // for implicit exceptions. 1775 // 1776 // constraint : ra/rb =/= scratch 1777 // normal case 1778 // 1779 // input : ra: dividend 1780 // rb: divisor 1781 // 1782 // result: either 1783 // quotient (= ra idiv rb) 1784 // remainder (= ra irem rb) 1785 1786 assert(ra != scratch && rb != scratch, "reg cannot be scratch"); 1787 1788 int idivq_offset = offset(); 1789 if (! want_remainder) { 1790 sdiv(result, ra, rb); 1791 } else { 1792 sdiv(scratch, ra, rb); 1793 Assembler::msub(result, scratch, rb, ra); 1794 } 1795 1796 return idivq_offset; 1797 } 1798 1799 void MacroAssembler::membar(Membar_mask_bits order_constraint) { 1800 address prev = pc() - NativeMembar::instruction_size; 1801 address last = code()->last_insn(); 1802 if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) { 1803 NativeMembar *bar = NativeMembar_at(prev); 1804 // We are merging two memory barrier instructions. On AArch64 we 1805 // can do this simply by ORing them together. 
1806 bar->set_kind(bar->get_kind() | order_constraint); 1807 BLOCK_COMMENT("merged membar"); 1808 } else { 1809 code()->set_last_insn(pc()); 1810 dmb(Assembler::barrier(order_constraint)); 1811 } 1812 } 1813 1814 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) { 1815 if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) { 1816 merge_ldst(rt, adr, size_in_bytes, is_store); 1817 code()->clear_last_insn(); 1818 return true; 1819 } else { 1820 assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported."); 1821 const unsigned mask = size_in_bytes - 1; 1822 if (adr.getMode() == Address::base_plus_offset && 1823 (adr.offset() & mask) == 0) { // only supports base_plus_offset. 1824 code()->set_last_insn(pc()); 1825 } 1826 return false; 1827 } 1828 } 1829 1830 void MacroAssembler::ldr(Register Rx, const Address &adr) { 1831 // We always try to merge two adjacent loads into one ldp. 1832 if (!try_merge_ldst(Rx, adr, 8, false)) { 1833 Assembler::ldr(Rx, adr); 1834 } 1835 } 1836 1837 void MacroAssembler::ldrw(Register Rw, const Address &adr) { 1838 // We always try to merge two adjacent loads into one ldp. 1839 if (!try_merge_ldst(Rw, adr, 4, false)) { 1840 Assembler::ldrw(Rw, adr); 1841 } 1842 } 1843 1844 void MacroAssembler::str(Register Rx, const Address &adr) { 1845 // We always try to merge two adjacent stores into one stp. 1846 if (!try_merge_ldst(Rx, adr, 8, true)) { 1847 Assembler::str(Rx, adr); 1848 } 1849 } 1850 1851 void MacroAssembler::strw(Register Rw, const Address &adr) { 1852 // We always try to merge two adjacent stores into one stp. 
1853 if (!try_merge_ldst(Rw, adr, 4, true)) { 1854 Assembler::strw(Rw, adr); 1855 } 1856 } 1857 1858 // MacroAssembler routines found actually to be needed 1859 1860 void MacroAssembler::push(Register src) 1861 { 1862 str(src, Address(pre(esp, -1 * wordSize))); 1863 } 1864 1865 void MacroAssembler::pop(Register dst) 1866 { 1867 ldr(dst, Address(post(esp, 1 * wordSize))); 1868 } 1869 1870 // Note: load_unsigned_short used to be called load_unsigned_word. 1871 int MacroAssembler::load_unsigned_short(Register dst, Address src) { 1872 int off = offset(); 1873 ldrh(dst, src); 1874 return off; 1875 } 1876 1877 int MacroAssembler::load_unsigned_byte(Register dst, Address src) { 1878 int off = offset(); 1879 ldrb(dst, src); 1880 return off; 1881 } 1882 1883 int MacroAssembler::load_signed_short(Register dst, Address src) { 1884 int off = offset(); 1885 ldrsh(dst, src); 1886 return off; 1887 } 1888 1889 int MacroAssembler::load_signed_byte(Register dst, Address src) { 1890 int off = offset(); 1891 ldrsb(dst, src); 1892 return off; 1893 } 1894 1895 int MacroAssembler::load_signed_short32(Register dst, Address src) { 1896 int off = offset(); 1897 ldrshw(dst, src); 1898 return off; 1899 } 1900 1901 int MacroAssembler::load_signed_byte32(Register dst, Address src) { 1902 int off = offset(); 1903 ldrsbw(dst, src); 1904 return off; 1905 } 1906 1907 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) { 1908 switch (size_in_bytes) { 1909 case 8: ldr(dst, src); break; 1910 case 4: ldrw(dst, src); break; 1911 case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break; 1912 case 1: is_signed ? 
load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break; 1913 default: ShouldNotReachHere(); 1914 } 1915 } 1916 1917 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) { 1918 switch (size_in_bytes) { 1919 case 8: str(src, dst); break; 1920 case 4: strw(src, dst); break; 1921 case 2: strh(src, dst); break; 1922 case 1: strb(src, dst); break; 1923 default: ShouldNotReachHere(); 1924 } 1925 } 1926 1927 void MacroAssembler::decrementw(Register reg, int value) 1928 { 1929 if (value < 0) { incrementw(reg, -value); return; } 1930 if (value == 0) { return; } 1931 if (value < (1 << 12)) { subw(reg, reg, value); return; } 1932 /* else */ { 1933 guarantee(reg != rscratch2, "invalid dst for register decrement"); 1934 movw(rscratch2, (unsigned)value); 1935 subw(reg, reg, rscratch2); 1936 } 1937 } 1938 1939 void MacroAssembler::decrement(Register reg, int value) 1940 { 1941 if (value < 0) { increment(reg, -value); return; } 1942 if (value == 0) { return; } 1943 if (value < (1 << 12)) { sub(reg, reg, value); return; } 1944 /* else */ { 1945 assert(reg != rscratch2, "invalid dst for register decrement"); 1946 mov(rscratch2, (unsigned long)value); 1947 sub(reg, reg, rscratch2); 1948 } 1949 } 1950 1951 void MacroAssembler::decrementw(Address dst, int value) 1952 { 1953 assert(!dst.uses(rscratch1), "invalid dst for address decrement"); 1954 if (dst.getMode() == Address::literal) { 1955 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 1956 lea(rscratch2, dst); 1957 dst = Address(rscratch2); 1958 } 1959 ldrw(rscratch1, dst); 1960 decrementw(rscratch1, value); 1961 strw(rscratch1, dst); 1962 } 1963 1964 void MacroAssembler::decrement(Address dst, int value) 1965 { 1966 assert(!dst.uses(rscratch1), "invalid address for decrement"); 1967 if (dst.getMode() == Address::literal) { 1968 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 1969 lea(rscratch2, dst); 1970 dst = 
Address(rscratch2); 1971 } 1972 ldr(rscratch1, dst); 1973 decrement(rscratch1, value); 1974 str(rscratch1, dst); 1975 } 1976 1977 void MacroAssembler::incrementw(Register reg, int value) 1978 { 1979 if (value < 0) { decrementw(reg, -value); return; } 1980 if (value == 0) { return; } 1981 if (value < (1 << 12)) { addw(reg, reg, value); return; } 1982 /* else */ { 1983 assert(reg != rscratch2, "invalid dst for register increment"); 1984 movw(rscratch2, (unsigned)value); 1985 addw(reg, reg, rscratch2); 1986 } 1987 } 1988 1989 void MacroAssembler::increment(Register reg, int value) 1990 { 1991 if (value < 0) { decrement(reg, -value); return; } 1992 if (value == 0) { return; } 1993 if (value < (1 << 12)) { add(reg, reg, value); return; } 1994 /* else */ { 1995 assert(reg != rscratch2, "invalid dst for register increment"); 1996 movw(rscratch2, (unsigned)value); 1997 add(reg, reg, rscratch2); 1998 } 1999 } 2000 2001 void MacroAssembler::incrementw(Address dst, int value) 2002 { 2003 assert(!dst.uses(rscratch1), "invalid dst for address increment"); 2004 if (dst.getMode() == Address::literal) { 2005 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 2006 lea(rscratch2, dst); 2007 dst = Address(rscratch2); 2008 } 2009 ldrw(rscratch1, dst); 2010 incrementw(rscratch1, value); 2011 strw(rscratch1, dst); 2012 } 2013 2014 void MacroAssembler::increment(Address dst, int value) 2015 { 2016 assert(!dst.uses(rscratch1), "invalid dst for address increment"); 2017 if (dst.getMode() == Address::literal) { 2018 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 2019 lea(rscratch2, dst); 2020 dst = Address(rscratch2); 2021 } 2022 ldr(rscratch1, dst); 2023 increment(rscratch1, value); 2024 str(rscratch1, dst); 2025 } 2026 2027 2028 void MacroAssembler::pusha() { 2029 push(0x7fffffff, sp); 2030 } 2031 2032 void MacroAssembler::popa() { 2033 pop(0x7fffffff, sp); 2034 } 2035 2036 // Push lots of registers in the bit set supplied. 
Don't push sp. 2037 // Return the number of words pushed 2038 int MacroAssembler::push(unsigned int bitset, Register stack) { 2039 int words_pushed = 0; 2040 2041 // Scan bitset to accumulate register pairs 2042 unsigned char regs[32]; 2043 int count = 0; 2044 for (int reg = 0; reg <= 30; reg++) { 2045 if (1 & bitset) 2046 regs[count++] = reg; 2047 bitset >>= 1; 2048 } 2049 regs[count++] = zr->encoding_nocheck(); 2050 count &= ~1; // Only push an even nuber of regs 2051 2052 if (count) { 2053 stp(as_Register(regs[0]), as_Register(regs[1]), 2054 Address(pre(stack, -count * wordSize))); 2055 words_pushed += 2; 2056 } 2057 for (int i = 2; i < count; i += 2) { 2058 stp(as_Register(regs[i]), as_Register(regs[i+1]), 2059 Address(stack, i * wordSize)); 2060 words_pushed += 2; 2061 } 2062 2063 assert(words_pushed == count, "oops, pushed != count"); 2064 2065 return count; 2066 } 2067 2068 int MacroAssembler::pop(unsigned int bitset, Register stack) { 2069 int words_pushed = 0; 2070 2071 // Scan bitset to accumulate register pairs 2072 unsigned char regs[32]; 2073 int count = 0; 2074 for (int reg = 0; reg <= 30; reg++) { 2075 if (1 & bitset) 2076 regs[count++] = reg; 2077 bitset >>= 1; 2078 } 2079 regs[count++] = zr->encoding_nocheck(); 2080 count &= ~1; 2081 2082 for (int i = 2; i < count; i += 2) { 2083 ldp(as_Register(regs[i]), as_Register(regs[i+1]), 2084 Address(stack, i * wordSize)); 2085 words_pushed += 2; 2086 } 2087 if (count) { 2088 ldp(as_Register(regs[0]), as_Register(regs[1]), 2089 Address(post(stack, count * wordSize))); 2090 words_pushed += 2; 2091 } 2092 2093 assert(words_pushed == count, "oops, pushed != count"); 2094 2095 return count; 2096 } 2097 #ifdef ASSERT 2098 void MacroAssembler::verify_heapbase(const char* msg) { 2099 #if 0 2100 assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed"); 2101 assert (Universe::heap() != NULL, "java heap should be initialized"); 2102 if (CheckCompressedOops) { 2103 Label ok; 2104 push(1 << 
rscratch1->encoding(), sp); // cmpptr trashes rscratch1 2105 cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr())); 2106 br(Assembler::EQ, ok); 2107 stop(msg); 2108 bind(ok); 2109 pop(1 << rscratch1->encoding(), sp); 2110 } 2111 #endif 2112 } 2113 #endif 2114 2115 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) { 2116 Label done, not_weak; 2117 cbz(value, done); // Use NULL as-is. 2118 2119 STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u); 2120 tbz(r0, 0, not_weak); // Test for jweak tag. 2121 2122 // Resolve jweak. 2123 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value, 2124 Address(value, -JNIHandles::weak_tag_value), tmp, thread); 2125 verify_oop(value); 2126 b(done); 2127 2128 bind(not_weak); 2129 // Resolve (untagged) jobject. 2130 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread); 2131 verify_oop(value); 2132 bind(done); 2133 } 2134 2135 void MacroAssembler::stop(const char* msg) { 2136 address ip = pc(); 2137 pusha(); 2138 mov(c_rarg0, (address)msg); 2139 mov(c_rarg1, (address)ip); 2140 mov(c_rarg2, sp); 2141 mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 2142 // call(c_rarg3); 2143 blrt(c_rarg3, 3, 0, 1); 2144 hlt(0); 2145 } 2146 2147 void MacroAssembler::unimplemented(const char* what) { 2148 const char* buf = NULL; 2149 { 2150 ResourceMark rm; 2151 stringStream ss; 2152 ss.print("unimplemented: %s", what); 2153 buf = code_string(ss.as_string()); 2154 } 2155 stop(buf); 2156 } 2157 2158 // If a constant does not fit in an immediate field, generate some 2159 // number of MOV instructions and then perform the operation. 
2160 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm, 2161 add_sub_imm_insn insn1, 2162 add_sub_reg_insn insn2) { 2163 assert(Rd != zr, "Rd = zr and not setting flags?"); 2164 if (operand_valid_for_add_sub_immediate((int)imm)) { 2165 (this->*insn1)(Rd, Rn, imm); 2166 } else { 2167 if (uabs(imm) < (1 << 24)) { 2168 (this->*insn1)(Rd, Rn, imm & -(1 << 12)); 2169 (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1)); 2170 } else { 2171 assert_different_registers(Rd, Rn); 2172 mov(Rd, (uint64_t)imm); 2173 (this->*insn2)(Rd, Rn, Rd, LSL, 0); 2174 } 2175 } 2176 } 2177 2178 // Seperate vsn which sets the flags. Optimisations are more restricted 2179 // because we must set the flags correctly. 2180 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm, 2181 add_sub_imm_insn insn1, 2182 add_sub_reg_insn insn2) { 2183 if (operand_valid_for_add_sub_immediate((int)imm)) { 2184 (this->*insn1)(Rd, Rn, imm); 2185 } else { 2186 assert_different_registers(Rd, Rn); 2187 assert(Rd != zr, "overflow in immediate operand"); 2188 mov(Rd, (uint64_t)imm); 2189 (this->*insn2)(Rd, Rn, Rd, LSL, 0); 2190 } 2191 } 2192 2193 2194 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) { 2195 if (increment.is_register()) { 2196 add(Rd, Rn, increment.as_register()); 2197 } else { 2198 add(Rd, Rn, increment.as_constant()); 2199 } 2200 } 2201 2202 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) { 2203 if (increment.is_register()) { 2204 addw(Rd, Rn, increment.as_register()); 2205 } else { 2206 addw(Rd, Rn, increment.as_constant()); 2207 } 2208 } 2209 2210 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) { 2211 if (decrement.is_register()) { 2212 sub(Rd, Rn, decrement.as_register()); 2213 } else { 2214 sub(Rd, Rn, decrement.as_constant()); 2215 } 2216 } 2217 2218 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) { 2219 if 
(decrement.is_register()) { 2220 subw(Rd, Rn, decrement.as_register()); 2221 } else { 2222 subw(Rd, Rn, decrement.as_constant()); 2223 } 2224 } 2225 2226 void MacroAssembler::reinit_heapbase() 2227 { 2228 if (UseCompressedOops) { 2229 if (Universe::is_fully_initialized()) { 2230 mov(rheapbase, Universe::narrow_ptrs_base()); 2231 } else { 2232 lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr())); 2233 ldr(rheapbase, Address(rheapbase)); 2234 } 2235 } 2236 } 2237 2238 // this simulates the behaviour of the x86 cmpxchg instruction using a 2239 // load linked/store conditional pair. we use the acquire/release 2240 // versions of these instructions so that we flush pending writes as 2241 // per Java semantics. 2242 2243 // n.b the x86 version assumes the old value to be compared against is 2244 // in rax and updates rax with the value located in memory if the 2245 // cmpxchg fails. we supply a register for the old value explicitly 2246 2247 // the aarch64 load linked/store conditional instructions do not 2248 // accept an offset. so, unlike x86, we must provide a plain register 2249 // to identify the memory word to be compared/exchanged rather than a 2250 // register+offset Address. 
2251 2252 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp, 2253 Label &succeed, Label *fail) { 2254 // oldv holds comparison value 2255 // newv holds value to write in exchange 2256 // addr identifies memory word to compare against/update 2257 if (UseLSE) { 2258 mov(tmp, oldv); 2259 casal(Assembler::xword, oldv, newv, addr); 2260 cmp(tmp, oldv); 2261 br(Assembler::EQ, succeed); 2262 membar(AnyAny); 2263 } else { 2264 Label retry_load, nope; 2265 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) 2266 prfm(Address(addr), PSTL1STRM); 2267 bind(retry_load); 2268 // flush and load exclusive from the memory location 2269 // and fail if it is not what we expect 2270 ldaxr(tmp, addr); 2271 cmp(tmp, oldv); 2272 br(Assembler::NE, nope); 2273 // if we store+flush with no intervening write tmp wil be zero 2274 stlxr(tmp, newv, addr); 2275 cbzw(tmp, succeed); 2276 // retry so we only ever return after a load fails to compare 2277 // ensures we don't return a stale value after a failed write. 
2278 b(retry_load); 2279 // if the memory word differs we return it in oldv and signal a fail 2280 bind(nope); 2281 membar(AnyAny); 2282 mov(oldv, tmp); 2283 } 2284 if (fail) 2285 b(*fail); 2286 } 2287 2288 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp, 2289 Label &succeed, Label *fail) { 2290 assert(oopDesc::mark_offset_in_bytes() == 0, "assumption"); 2291 cmpxchgptr(oldv, newv, obj, tmp, succeed, fail); 2292 } 2293 2294 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp, 2295 Label &succeed, Label *fail) { 2296 // oldv holds comparison value 2297 // newv holds value to write in exchange 2298 // addr identifies memory word to compare against/update 2299 // tmp returns 0/1 for success/failure 2300 if (UseLSE) { 2301 mov(tmp, oldv); 2302 casal(Assembler::word, oldv, newv, addr); 2303 cmp(tmp, oldv); 2304 br(Assembler::EQ, succeed); 2305 membar(AnyAny); 2306 } else { 2307 Label retry_load, nope; 2308 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) 2309 prfm(Address(addr), PSTL1STRM); 2310 bind(retry_load); 2311 // flush and load exclusive from the memory location 2312 // and fail if it is not what we expect 2313 ldaxrw(tmp, addr); 2314 cmp(tmp, oldv); 2315 br(Assembler::NE, nope); 2316 // if we store+flush with no intervening write tmp wil be zero 2317 stlxrw(tmp, newv, addr); 2318 cbzw(tmp, succeed); 2319 // retry so we only ever return after a load fails to compare 2320 // ensures we don't return a stale value after a failed write. 2321 b(retry_load); 2322 // if the memory word differs we return it in oldv and signal a fail 2323 bind(nope); 2324 membar(AnyAny); 2325 mov(oldv, tmp); 2326 } 2327 if (fail) 2328 b(*fail); 2329 } 2330 2331 // A generic CAS; success or failure is in the EQ flag. A weak CAS 2332 // doesn't retry and may fail spuriously. If the oldval is wanted, 2333 // Pass a register for the result, otherwise pass noreg. 
2334 2335 // Clobbers rscratch1 2336 void MacroAssembler::cmpxchg(Register addr, Register expected, 2337 Register new_val, 2338 enum operand_size size, 2339 bool acquire, bool release, 2340 bool weak, 2341 Register result) { 2342 if (result == noreg) result = rscratch1; 2343 if (UseLSE) { 2344 mov(result, expected); 2345 lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true); 2346 cmp(result, expected); 2347 } else { 2348 BLOCK_COMMENT("cmpxchg {"); 2349 Label retry_load, done; 2350 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) 2351 prfm(Address(addr), PSTL1STRM); 2352 bind(retry_load); 2353 load_exclusive(result, addr, size, acquire); 2354 if (size == xword) 2355 cmp(result, expected); 2356 else 2357 cmpw(result, expected); 2358 br(Assembler::NE, done); 2359 store_exclusive(rscratch1, new_val, addr, size, release); 2360 if (weak) { 2361 cmpw(rscratch1, 0u); // If the store fails, return NE to our caller. 2362 } else { 2363 cbnzw(rscratch1, retry_load); 2364 } 2365 bind(done); 2366 BLOCK_COMMENT("} cmpxchg"); 2367 } 2368 } 2369 2370 static bool different(Register a, RegisterOrConstant b, Register c) { 2371 if (b.is_constant()) 2372 return a != c; 2373 else 2374 return a != b.as_register() && a != c && b.as_register() != c; 2375 } 2376 2377 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz) \ 2378 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \ 2379 if (UseLSE) { \ 2380 prev = prev->is_valid() ? prev : zr; \ 2381 if (incr.is_register()) { \ 2382 AOP(sz, incr.as_register(), prev, addr); \ 2383 } else { \ 2384 mov(rscratch2, incr.as_constant()); \ 2385 AOP(sz, rscratch2, prev, addr); \ 2386 } \ 2387 return; \ 2388 } \ 2389 Register result = rscratch2; \ 2390 if (prev->is_valid()) \ 2391 result = different(prev, incr, addr) ? 
prev : rscratch2; \ 2392 \ 2393 Label retry_load; \ 2394 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) \ 2395 prfm(Address(addr), PSTL1STRM); \ 2396 bind(retry_load); \ 2397 LDXR(result, addr); \ 2398 OP(rscratch1, result, incr); \ 2399 STXR(rscratch2, rscratch1, addr); \ 2400 cbnzw(rscratch2, retry_load); \ 2401 if (prev->is_valid() && prev != result) { \ 2402 IOP(prev, rscratch1, incr); \ 2403 } \ 2404 } 2405 2406 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword) 2407 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word) 2408 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword) 2409 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word) 2410 2411 #undef ATOMIC_OP 2412 2413 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz) \ 2414 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \ 2415 if (UseLSE) { \ 2416 prev = prev->is_valid() ? prev : zr; \ 2417 AOP(sz, newv, prev, addr); \ 2418 return; \ 2419 } \ 2420 Register result = rscratch2; \ 2421 if (prev->is_valid()) \ 2422 result = different(prev, newv, addr) ? 
prev : rscratch2; \ 2423 \ 2424 Label retry_load; \ 2425 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) \ 2426 prfm(Address(addr), PSTL1STRM); \ 2427 bind(retry_load); \ 2428 LDXR(result, addr); \ 2429 STXR(rscratch1, newv, addr); \ 2430 cbnzw(rscratch1, retry_load); \ 2431 if (prev->is_valid() && prev != result) \ 2432 mov(prev, result); \ 2433 } 2434 2435 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword) 2436 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word) 2437 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword) 2438 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word) 2439 2440 #undef ATOMIC_XCHG 2441 2442 #ifndef PRODUCT 2443 extern "C" void findpc(intptr_t x); 2444 #endif 2445 2446 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) 2447 { 2448 // In order to get locks to work, we need to fake a in_VM state 2449 if (ShowMessageBoxOnError ) { 2450 JavaThread* thread = JavaThread::current(); 2451 JavaThreadState saved_state = thread->thread_state(); 2452 thread->set_thread_state(_thread_in_vm); 2453 #ifndef PRODUCT 2454 if (CountBytecodes || TraceBytecodes || StopInterpreterAt) { 2455 ttyLocker ttyl; 2456 BytecodeCounter::print(); 2457 } 2458 #endif 2459 if (os::message_box(msg, "Execution stopped, print registers?")) { 2460 ttyLocker ttyl; 2461 tty->print_cr(" pc = 0x%016lx", pc); 2462 #ifndef PRODUCT 2463 tty->cr(); 2464 findpc(pc); 2465 tty->cr(); 2466 #endif 2467 tty->print_cr(" r0 = 0x%016lx", regs[0]); 2468 tty->print_cr(" r1 = 0x%016lx", regs[1]); 2469 tty->print_cr(" r2 = 0x%016lx", regs[2]); 2470 tty->print_cr(" r3 = 0x%016lx", regs[3]); 2471 tty->print_cr(" r4 = 0x%016lx", regs[4]); 2472 tty->print_cr(" r5 = 0x%016lx", regs[5]); 2473 tty->print_cr(" r6 = 0x%016lx", regs[6]); 2474 tty->print_cr(" r7 = 0x%016lx", regs[7]); 2475 tty->print_cr(" r8 = 0x%016lx", regs[8]); 2476 tty->print_cr(" r9 = 0x%016lx", regs[9]); 2477 tty->print_cr("r10 = 0x%016lx", regs[10]); 2478 tty->print_cr("r11 = 0x%016lx", 
regs[11]); 2479 tty->print_cr("r12 = 0x%016lx", regs[12]); 2480 tty->print_cr("r13 = 0x%016lx", regs[13]); 2481 tty->print_cr("r14 = 0x%016lx", regs[14]); 2482 tty->print_cr("r15 = 0x%016lx", regs[15]); 2483 tty->print_cr("r16 = 0x%016lx", regs[16]); 2484 tty->print_cr("r17 = 0x%016lx", regs[17]); 2485 tty->print_cr("r18 = 0x%016lx", regs[18]); 2486 tty->print_cr("r19 = 0x%016lx", regs[19]); 2487 tty->print_cr("r20 = 0x%016lx", regs[20]); 2488 tty->print_cr("r21 = 0x%016lx", regs[21]); 2489 tty->print_cr("r22 = 0x%016lx", regs[22]); 2490 tty->print_cr("r23 = 0x%016lx", regs[23]); 2491 tty->print_cr("r24 = 0x%016lx", regs[24]); 2492 tty->print_cr("r25 = 0x%016lx", regs[25]); 2493 tty->print_cr("r26 = 0x%016lx", regs[26]); 2494 tty->print_cr("r27 = 0x%016lx", regs[27]); 2495 tty->print_cr("r28 = 0x%016lx", regs[28]); 2496 tty->print_cr("r30 = 0x%016lx", regs[30]); 2497 tty->print_cr("r31 = 0x%016lx", regs[31]); 2498 BREAKPOINT; 2499 } 2500 ThreadStateTransition::transition(thread, _thread_in_vm, saved_state); 2501 } else { 2502 ttyLocker ttyl; 2503 ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", 2504 msg); 2505 assert(false, "DEBUG MESSAGE: %s", msg); 2506 } 2507 } 2508 2509 #ifdef BUILTIN_SIM 2510 // routine to generate an x86 prolog for a stub function which 2511 // bootstraps into the generated ARM code which directly follows the 2512 // stub 2513 // 2514 // the argument encodes the number of general and fp registers 2515 // passed by the caller and the callng convention (currently just 2516 // the number of general registers and assumes C argument passing) 2517 2518 extern "C" { 2519 int aarch64_stub_prolog_size(); 2520 void aarch64_stub_prolog(); 2521 void aarch64_prolog(); 2522 } 2523 2524 void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type, 2525 address *prolog_ptr) 2526 { 2527 int calltype = (((ret_type & 0x3) << 8) | 2528 ((fp_arg_count & 0xf) << 4) | 2529 (gp_arg_count & 0xf)); 2530 2531 // the 
addresses for the x86 to ARM entry code we need to use 2532 address start = pc(); 2533 // printf("start = %lx\n", start); 2534 int byteCount = aarch64_stub_prolog_size(); 2535 // printf("byteCount = %x\n", byteCount); 2536 int instructionCount = (byteCount + 3)/ 4; 2537 // printf("instructionCount = %x\n", instructionCount); 2538 for (int i = 0; i < instructionCount; i++) { 2539 nop(); 2540 } 2541 2542 memcpy(start, (void*)aarch64_stub_prolog, byteCount); 2543 2544 // write the address of the setup routine and the call format at the 2545 // end of into the copied code 2546 u_int64_t *patch_end = (u_int64_t *)(start + byteCount); 2547 if (prolog_ptr) 2548 patch_end[-2] = (u_int64_t)prolog_ptr; 2549 patch_end[-1] = calltype; 2550 } 2551 #endif 2552 2553 void MacroAssembler::push_call_clobbered_registers() { 2554 int step = 4 * wordSize; 2555 push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp); 2556 sub(sp, sp, step); 2557 mov(rscratch1, -step); 2558 // Push v0-v7, v16-v31. 2559 for (int i = 31; i>= 4; i -= 4) { 2560 if (i <= v7->encoding() || i >= v16->encoding()) 2561 st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1), 2562 as_FloatRegister(i), T1D, Address(post(sp, rscratch1))); 2563 } 2564 st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2), 2565 as_FloatRegister(3), T1D, Address(sp)); 2566 } 2567 2568 void MacroAssembler::pop_call_clobbered_registers() { 2569 for (int i = 0; i < 32; i += 4) { 2570 if (i <= v7->encoding() || i >= v16->encoding()) 2571 ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), 2572 as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize))); 2573 } 2574 2575 pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp); 2576 } 2577 2578 void MacroAssembler::push_CPU_state(bool save_vectors) { 2579 int step = (save_vectors ? 
8 : 4) * wordSize; 2580 push(0x3fffffff, sp); // integer registers except lr & sp 2581 mov(rscratch1, -step); 2582 sub(sp, sp, step); 2583 for (int i = 28; i >= 4; i -= 4) { 2584 st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), 2585 as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1))); 2586 } 2587 st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp); 2588 } 2589 2590 void MacroAssembler::pop_CPU_state(bool restore_vectors) { 2591 int step = (restore_vectors ? 8 : 4) * wordSize; 2592 for (int i = 0; i <= 28; i += 4) 2593 ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), 2594 as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step))); 2595 pop(0x3fffffff, sp); // integer registers except lr & sp 2596 } 2597 2598 /** 2599 * Helpers for multiply_to_len(). 2600 */ 2601 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo, 2602 Register src1, Register src2) { 2603 adds(dest_lo, dest_lo, src1); 2604 adc(dest_hi, dest_hi, zr); 2605 adds(dest_lo, dest_lo, src2); 2606 adc(final_dest_hi, dest_hi, zr); 2607 } 2608 2609 // Generate an address from (r + r1 extend offset). "size" is the 2610 // size of the operand. The result may be in rscratch2. 2611 Address MacroAssembler::offsetted_address(Register r, Register r1, 2612 Address::extend ext, int offset, int size) { 2613 if (offset || (ext.shift() % size != 0)) { 2614 lea(rscratch2, Address(r, r1, ext)); 2615 return Address(rscratch2, offset); 2616 } else { 2617 return Address(r, r1, ext); 2618 } 2619 } 2620 2621 Address MacroAssembler::spill_address(int size, int offset, Register tmp) 2622 { 2623 assert(offset >= 0, "spill to negative address?"); 2624 // Offset reachable ? 
2625 // Not aligned - 9 bits signed offset 2626 // Aligned - 12 bits unsigned offset shifted 2627 Register base = sp; 2628 if ((offset & (size-1)) && offset >= (1<<8)) { 2629 add(tmp, base, offset & ((1<<12)-1)); 2630 base = tmp; 2631 offset &= -1<<12; 2632 } 2633 2634 if (offset >= (1<<12) * size) { 2635 add(tmp, base, offset & (((1<<12)-1)<<12)); 2636 base = tmp; 2637 offset &= ~(((1<<12)-1)<<12); 2638 } 2639 2640 return Address(base, offset); 2641 } 2642 2643 // Checks whether offset is aligned. 2644 // Returns true if it is, else false. 2645 bool MacroAssembler::merge_alignment_check(Register base, 2646 size_t size, 2647 long cur_offset, 2648 long prev_offset) const { 2649 if (AvoidUnalignedAccesses) { 2650 if (base == sp) { 2651 // Checks whether low offset if aligned to pair of registers. 2652 long pair_mask = size * 2 - 1; 2653 long offset = prev_offset > cur_offset ? cur_offset : prev_offset; 2654 return (offset & pair_mask) == 0; 2655 } else { // If base is not sp, we can't guarantee the access is aligned. 2656 return false; 2657 } 2658 } else { 2659 long mask = size - 1; 2660 // Load/store pair instruction only supports element size aligned offset. 2661 return (cur_offset & mask) == 0 && (prev_offset & mask) == 0; 2662 } 2663 } 2664 2665 // Checks whether current and previous loads/stores can be merged. 2666 // Returns true if it can be merged, else false. 
2667 bool MacroAssembler::ldst_can_merge(Register rt, 2668 const Address &adr, 2669 size_t cur_size_in_bytes, 2670 bool is_store) const { 2671 address prev = pc() - NativeInstruction::instruction_size; 2672 address last = code()->last_insn(); 2673 2674 if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) { 2675 return false; 2676 } 2677 2678 if (adr.getMode() != Address::base_plus_offset || prev != last) { 2679 return false; 2680 } 2681 2682 NativeLdSt* prev_ldst = NativeLdSt_at(prev); 2683 size_t prev_size_in_bytes = prev_ldst->size_in_bytes(); 2684 2685 assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging."); 2686 assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging."); 2687 2688 if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) { 2689 return false; 2690 } 2691 2692 long max_offset = 63 * prev_size_in_bytes; 2693 long min_offset = -64 * prev_size_in_bytes; 2694 2695 assert(prev_ldst->is_not_pre_post_index(), "pre-index or post-index is not supported to be merged."); 2696 2697 // Only same base can be merged. 2698 if (adr.base() != prev_ldst->base()) { 2699 return false; 2700 } 2701 2702 long cur_offset = adr.offset(); 2703 long prev_offset = prev_ldst->offset(); 2704 size_t diff = abs(cur_offset - prev_offset); 2705 if (diff != prev_size_in_bytes) { 2706 return false; 2707 } 2708 2709 // Following cases can not be merged: 2710 // ldr x2, [x2, #8] 2711 // ldr x3, [x2, #16] 2712 // or: 2713 // ldr x2, [x3, #8] 2714 // ldr x2, [x3, #16] 2715 // If t1 and t2 is the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL. 2716 if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) { 2717 return false; 2718 } 2719 2720 long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset; 2721 // Offset range must be in ldp/stp instruction's range. 
2722 if (low_offset > max_offset || low_offset < min_offset) { 2723 return false; 2724 } 2725 2726 if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) { 2727 return true; 2728 } 2729 2730 return false; 2731 } 2732 2733 // Merge current load/store with previous load/store into ldp/stp. 2734 void MacroAssembler::merge_ldst(Register rt, 2735 const Address &adr, 2736 size_t cur_size_in_bytes, 2737 bool is_store) { 2738 2739 assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store) == true, "cur and prev must be able to be merged."); 2740 2741 Register rt_low, rt_high; 2742 address prev = pc() - NativeInstruction::instruction_size; 2743 NativeLdSt* prev_ldst = NativeLdSt_at(prev); 2744 2745 long offset; 2746 2747 if (adr.offset() < prev_ldst->offset()) { 2748 offset = adr.offset(); 2749 rt_low = rt; 2750 rt_high = prev_ldst->target(); 2751 } else { 2752 offset = prev_ldst->offset(); 2753 rt_low = prev_ldst->target(); 2754 rt_high = rt; 2755 } 2756 2757 Address adr_p = Address(prev_ldst->base(), offset); 2758 // Overwrite previous generated binary. 2759 code_section()->set_end(prev); 2760 2761 const int sz = prev_ldst->size_in_bytes(); 2762 assert(sz == 8 || sz == 4, "only supports 64/32bit merging."); 2763 if (!is_store) { 2764 BLOCK_COMMENT("merged ldr pair"); 2765 if (sz == 8) { 2766 ldp(rt_low, rt_high, adr_p); 2767 } else { 2768 ldpw(rt_low, rt_high, adr_p); 2769 } 2770 } else { 2771 BLOCK_COMMENT("merged str pair"); 2772 if (sz == 8) { 2773 stp(rt_low, rt_high, adr_p); 2774 } else { 2775 stpw(rt_low, rt_high, adr_p); 2776 } 2777 } 2778 } 2779 2780 /** 2781 * Multiply 64 bit by 64 bit first loop. 
 *
 * Emits the first loop of the big-integer multiply: one 64-bit chunk of x
 * (kept in x_xstart) is multiplied by successive 64-bit chunks of y, the
 * running carry is added, and the low 64 bits of each result are stored to z.
 * The 32-bit rotations swap the two ints of each loaded/stored 64-bit value.
 *
 * Modifies xstart, x_xstart, y_idx, carry, product, idx, kdx and rscratch1.
 */
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
                                           Register y, Register y_idx, Register z,
                                           Register carry, Register product,
                                           Register idx, Register kdx) {
  //
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;
  //

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  // Step xstart back by one more int; if that goes negative only a single
  // int of x is left and it is loaded at L_one_x instead.
  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_one_x);

  lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
  ldr(x_xstart, Address(rscratch1));
  ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian

  bind(L_first_loop);
  // idx is decremented twice per iteration (two ints == one 64-bit chunk).
  // Negative after the first decrement: done. Negative after the second:
  // a single int of y remains and is loaded at L_one_y.
  subsw(idx, idx, 1);
  br(Assembler::MI, L_first_loop_exit);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_one_y);
  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(y_idx, Address(rscratch1));
  ror(y_idx, y_idx, 32); // convert big-endian to little-endian
  bind(L_multiply);

  // AArch64 has a multiply-accumulate instruction that we can't use
  // here because it has no way to process carries, so we have to use
  // separate add and adc instructions.  Bah.
  umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
  mul(product, x_xstart, y_idx);
  adds(product, product, carry);
  adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product

  subw(kdx, kdx, 2);
  ror(product, product, 32); // back to big-endian
  str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));

  b(L_first_loop);

  // Only one int of y is left: load it as a 32-bit value.
  bind(L_one_y);
  ldrw(y_idx, Address(y,  0));
  b(L_multiply);

  // Only one int of x is left: load it as a 32-bit value.
  bind(L_one_x);
  ldrw(x_xstart, Address(x,  0));
  b(L_first_loop);

  bind(L_first_loop_exit);
}

/**
 * Multiply 128 bit by 128. Unrolled inner loop.
 *
 * Emits the inner ("third") loop: each iteration multiplies two 64-bit chunks
 * of y by product_hi, adds the matching two chunks of z and the incoming
 * carry, and stores the two results back to z. The main loop runs idx / 4
 * times; the leftover ints (idx & 3) are handled after it, first as one
 * 64-bit chunk and then as one 32-bit int.
 *
 * Modifies carry, carry2, idx, jdx, yz_idx1, yz_idx2, tmp, tmp3, tmp4, tmp6,
 * rscratch1 and rscratch2.
 */
void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
                                             Register carry, Register carry2,
                                             Register idx, Register jdx,
                                             Register yz_idx1, Register yz_idx2,
                                             Register tmp, Register tmp3, Register tmp4,
                                             Register tmp6, Register product_hi) {

  //   jlong carry, x[], y[], z[];
  //   int kdx = ystart+1;
  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
  //     jlong carry2  = (jlong)(tmp3 >>> 64);
  //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
  //     carry  = (jlong)(tmp4 >>> 64);
  //     z[kdx+idx+1] = (jlong)tmp3;
  //     z[kdx+idx] = (jlong)tmp4;
  //   }
  //   idx += 2;
  //   if (idx > 0) {
  //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
  //     z[kdx+idx] = (jlong)yz_idx1;
  //     carry  = (jlong)(yz_idx1 >>> 64);
  //   }
  //

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;

  // jdx = number of full iterations, each consuming four ints.
  lsrw(jdx, idx, 2);

  bind(L_third_loop);

  subsw(jdx, jdx, 1);
  br(Assembler::MI, L_third_loop_exit);
  subw(idx, idx, 4);

  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));

  ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));

  // tmp6 keeps the address of the z chunks for the store at the end of the
  // iteration.
  lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
  ror(yz_idx2, yz_idx2, 32);

  ldp(rscratch2, rscratch1, Address(tmp6, 0));

  mul(tmp3, product_hi, yz_idx1); //  yz_idx1 * product_hi -> tmp4:tmp3
  umulh(tmp4, product_hi, yz_idx1);

  ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
  ror(rscratch2, rscratch2, 32);

  mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
  umulh(carry2, product_hi, yz_idx2);

  // propagate sum of both multiplications into carry:tmp4:tmp3
  adds(tmp3, tmp3, carry);
  adc(tmp4, tmp4, zr);
  adds(tmp3, tmp3, rscratch1);
  adcs(tmp4, tmp4, tmp);
  adc(carry, carry2, zr);
  adds(tmp4, tmp4, rscratch2);
  adc(carry, carry, zr);

  ror(tmp3, tmp3, 32); // convert little-endian to big-endian
  ror(tmp4, tmp4, 32);
  stp(tmp4, tmp3, Address(tmp6, 0));

  b(L_third_loop);
  bind (L_third_loop_exit);

  // Handle the 0..3 ints left over after the unrolled loop.
  andw (idx, idx, 0x3);
  cbz(idx, L_post_third_loop_done);

  Label L_check_1;
  subsw(idx, idx, 2);
  br(Assembler::MI, L_check_1);

  // At least two ints remain: process one more 64-bit chunk.
  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(yz_idx1, Address(rscratch1, 0));
  ror(yz_idx1, yz_idx1, 32);
  mul(tmp3, product_hi, yz_idx1); //  yz_idx1 * product_hi -> tmp4:tmp3
  umulh(tmp4, product_hi, yz_idx1);
  lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
  ldr(yz_idx2, Address(rscratch1, 0));
  ror(yz_idx2, yz_idx2, 32);

  add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);

  ror(tmp3, tmp3, 32);
  str(tmp3, Address(rscratch1, 0));

  bind (L_check_1);

  // One int may still remain: process it as a 32-bit value.
  andw (idx, idx, 0x1);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_post_third_loop_done);
  ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  mul(tmp3, tmp4, product_hi); //  tmp4 * product_hi -> carry2:tmp3
  umulh(carry2, tmp4, product_hi);
  ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  add2_with_carry(carry2, tmp3, tmp4, carry);

  strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
  extr(carry, carry2, tmp3, 32);

  bind(L_post_third_loop_done);
}

/**
 * Code for BigInteger::multiplyToLen() intrinsic.
 *
 * r0: x
 * r1: xlen
 * r2: y
 * r3: ylen
 * r4: z
 * r5: zlen
 * r10: tmp1
 * r11: tmp2
 * r12: tmp3
 * r13: tmp4
 * r14: tmp5
 * r15: tmp6
 * r16: tmp7
 *
 * Several argument registers are reused as temporaries below: xlen doubles as
 * `product`, zlen as `x_xstart`, and x / ylen are passed to the inner loop as
 * scratch registers after being saved on the stack.
 */
void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6, Register product_hi) {

  assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product  = xlen;
  const Register x_xstart = zlen;  // reuse register

  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;
  //

  movw(idx, ylen);      // idx = ylen;
  movw(kdx, zlen);      // kdx = xlen+ylen;
  mov(carry, zr);       // carry = 0;

  Label L_done;

  movw(xstart, xlen);
  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_done);

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);

  // Store the carry left by the first loop into z: the low 32 bits go at
  // kdx - 1 and, when there is room, the high 32 bits at kdx - 2.
  Label L_second_loop;
  cbzw(kdx, L_second_loop);

  Label L_carry;
  subw(kdx, kdx, 1);
  cbzw(kdx, L_carry);

  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
  lsr(carry, carry, 32);
  subw(kdx, kdx, 1);

  bind(L_carry);
  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));

  // Second and third (nested) loops.
  //
  // for (int i = xstart-1; i >= 0; i--) { // Second loop
  //   carry = 0;
  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                    (z[k] & LONG_MASK) + carry;
  //     z[k] = (int)product;
  //     carry = product >>> 32;
  //   }
  //   z[i] = (int)carry;
  // }
  //
  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi

  const Register jdx = tmp1;

  bind(L_second_loop);
  mov(carry, zr);                // carry = 0;
  movw(jdx, ylen);               // j = ystart+1

  subsw(xstart, xstart, 1);      // i = xstart-1;
  br(Assembler::MI, L_done);

  // Reserve four stack words and save z in the first one; ylen, x and
  // xstart are saved into the remaining three just before the inner loop.
  str(z, Address(pre(sp, -4 * wordSize)));

  Label L_last_x;
  lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
  subsw(xstart, xstart, 1);       // i = xstart-1;
  br(Assembler::MI, L_last_x);

  lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
  ldr(product_hi, Address(rscratch1));
  ror(product_hi, product_hi, 32);  // convert big-endian to little-endian

  Label L_third_loop_prologue;
  bind(L_third_loop_prologue);

  str(ylen, Address(sp, wordSize));
  stp(x, xstart, Address(sp, 2 * wordSize));
  multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
                          tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
  // Restore the saved registers and release the four stack words.
  ldp(z, ylen, Address(post(sp, 2 * wordSize)));
  ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen

  addw(tmp3, xlen, 1);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  subsw(tmp3, tmp3, 1);
  br(Assembler::MI, L_done);

  lsr(carry, carry, 32);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  b(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  ldrw(product_hi, Address(x, 0));
  b(L_third_loop_prologue);

  bind(L_done);
}

// Code for BigInteger::mulAdd intrinsic
// out     = r0
// in      = r1
// offset  = r2  (already out.length-offset)
// len     = r3
// k       = r4
//
// pseudo code from java implementation:
// carry = 0;
// offset = out.length-offset - 1;
// for (int j=len-1; j >= 0; j--) {
//     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
//     out[offset--] = (int)product;
//     carry = product >>> 32;
// }
// return (int)carry;
//
// On exit `out` holds the carry (zero when len is zero). `in`, `offset` and
// `len` are modified, and rscratch1 / rscratch2 are used as temporaries.
void MacroAssembler::mul_add(Register out, Register in, Register offset,
      Register len, Register k) {
    Label LOOP, END;
    // pre-loop
    cmp(len, zr); // cmp, not cbz/cbnz: to use condition twice => less branches
    csel(out, zr, out, Assembler::EQ);
    br(Assembler::EQ, END);
    add(in, in, len, LSL, 2); // in[j+1] address
    add(offset, out, offset, LSL, 2); // out[offset + 1] address
    mov(out, zr); // used to keep carry now
    BIND(LOOP);
    ldrw(rscratch1, Address(pre(in, -4)));
    madd(rscratch1, rscratch1, k, out);
    ldrw(rscratch2, Address(pre(offset, -4)));
    add(rscratch1, rscratch1, rscratch2);
    strw(rscratch1, Address(offset));
    lsr(out, rscratch1, 32); // upper half of the sum is the next carry
    subs(len, len, 1);
    br(Assembler::NE, LOOP);
    BIND(END);
}

/**
 * Emits code to update CRC-32 with a byte value according to constants in table
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]val       Register containing the byte to fold into the CRC.
 * @param [in]table     Register containing the table of crc constants.
3136 * 3137 * uint32_t crc; 3138 * val = crc_table[(val ^ crc) & 0xFF]; 3139 * crc = val ^ (crc >> 8); 3140 * 3141 */ 3142 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 3143 eor(val, val, crc); 3144 andr(val, val, 0xff); 3145 ldrw(val, Address(table, val, Address::lsl(2))); 3146 eor(crc, val, crc, Assembler::LSR, 8); 3147 } 3148 3149 /** 3150 * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3 3151 * 3152 * @param [in,out]crc Register containing the crc. 3153 * @param [in]v Register containing the 32-bit to fold into the CRC. 3154 * @param [in]table0 Register containing table 0 of crc constants. 3155 * @param [in]table1 Register containing table 1 of crc constants. 3156 * @param [in]table2 Register containing table 2 of crc constants. 3157 * @param [in]table3 Register containing table 3 of crc constants. 3158 * 3159 * uint32_t crc; 3160 * v = crc ^ v 3161 * crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24] 3162 * 3163 */ 3164 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp, 3165 Register table0, Register table1, Register table2, Register table3, 3166 bool upper) { 3167 eor(v, crc, v, upper ? LSR:LSL, upper ? 
32:0); 3168 uxtb(tmp, v); 3169 ldrw(crc, Address(table3, tmp, Address::lsl(2))); 3170 ubfx(tmp, v, 8, 8); 3171 ldrw(tmp, Address(table2, tmp, Address::lsl(2))); 3172 eor(crc, crc, tmp); 3173 ubfx(tmp, v, 16, 8); 3174 ldrw(tmp, Address(table1, tmp, Address::lsl(2))); 3175 eor(crc, crc, tmp); 3176 ubfx(tmp, v, 24, 8); 3177 ldrw(tmp, Address(table0, tmp, Address::lsl(2))); 3178 eor(crc, crc, tmp); 3179 } 3180 3181 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf, 3182 Register len, Register tmp0, Register tmp1, Register tmp2, 3183 Register tmp3) { 3184 Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit; 3185 assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3); 3186 3187 mvnw(crc, crc); 3188 3189 subs(len, len, 128); 3190 br(Assembler::GE, CRC_by64_pre); 3191 BIND(CRC_less64); 3192 adds(len, len, 128-32); 3193 br(Assembler::GE, CRC_by32_loop); 3194 BIND(CRC_less32); 3195 adds(len, len, 32-4); 3196 br(Assembler::GE, CRC_by4_loop); 3197 adds(len, len, 4); 3198 br(Assembler::GT, CRC_by1_loop); 3199 b(L_exit); 3200 3201 BIND(CRC_by32_loop); 3202 ldp(tmp0, tmp1, Address(post(buf, 16))); 3203 subs(len, len, 32); 3204 crc32x(crc, crc, tmp0); 3205 ldr(tmp2, Address(post(buf, 8))); 3206 crc32x(crc, crc, tmp1); 3207 ldr(tmp3, Address(post(buf, 8))); 3208 crc32x(crc, crc, tmp2); 3209 crc32x(crc, crc, tmp3); 3210 br(Assembler::GE, CRC_by32_loop); 3211 cmn(len, 32); 3212 br(Assembler::NE, CRC_less32); 3213 b(L_exit); 3214 3215 BIND(CRC_by4_loop); 3216 ldrw(tmp0, Address(post(buf, 4))); 3217 subs(len, len, 4); 3218 crc32w(crc, crc, tmp0); 3219 br(Assembler::GE, CRC_by4_loop); 3220 adds(len, len, 4); 3221 br(Assembler::LE, L_exit); 3222 BIND(CRC_by1_loop); 3223 ldrb(tmp0, Address(post(buf, 1))); 3224 subs(len, len, 1); 3225 crc32b(crc, crc, tmp0); 3226 br(Assembler::GT, CRC_by1_loop); 3227 b(L_exit); 3228 3229 BIND(CRC_by64_pre); 3230 sub(buf, buf, 8); 3231 ldp(tmp0, tmp1, Address(buf, 
8)); 3232 crc32x(crc, crc, tmp0); 3233 ldr(tmp2, Address(buf, 24)); 3234 crc32x(crc, crc, tmp1); 3235 ldr(tmp3, Address(buf, 32)); 3236 crc32x(crc, crc, tmp2); 3237 ldr(tmp0, Address(buf, 40)); 3238 crc32x(crc, crc, tmp3); 3239 ldr(tmp1, Address(buf, 48)); 3240 crc32x(crc, crc, tmp0); 3241 ldr(tmp2, Address(buf, 56)); 3242 crc32x(crc, crc, tmp1); 3243 ldr(tmp3, Address(pre(buf, 64))); 3244 3245 b(CRC_by64_loop); 3246 3247 align(CodeEntryAlignment); 3248 BIND(CRC_by64_loop); 3249 subs(len, len, 64); 3250 crc32x(crc, crc, tmp2); 3251 ldr(tmp0, Address(buf, 8)); 3252 crc32x(crc, crc, tmp3); 3253 ldr(tmp1, Address(buf, 16)); 3254 crc32x(crc, crc, tmp0); 3255 ldr(tmp2, Address(buf, 24)); 3256 crc32x(crc, crc, tmp1); 3257 ldr(tmp3, Address(buf, 32)); 3258 crc32x(crc, crc, tmp2); 3259 ldr(tmp0, Address(buf, 40)); 3260 crc32x(crc, crc, tmp3); 3261 ldr(tmp1, Address(buf, 48)); 3262 crc32x(crc, crc, tmp0); 3263 ldr(tmp2, Address(buf, 56)); 3264 crc32x(crc, crc, tmp1); 3265 ldr(tmp3, Address(pre(buf, 64))); 3266 br(Assembler::GE, CRC_by64_loop); 3267 3268 // post-loop 3269 crc32x(crc, crc, tmp2); 3270 crc32x(crc, crc, tmp3); 3271 3272 sub(len, len, 64); 3273 add(buf, buf, 8); 3274 cmn(len, 128); 3275 br(Assembler::NE, CRC_less64); 3276 BIND(L_exit); 3277 mvnw(crc, crc); 3278 } 3279 3280 /** 3281 * @param crc register containing existing CRC (32-bit) 3282 * @param buf register pointing to input byte buffer (byte*) 3283 * @param len register containing number of bytes 3284 * @param table register that will contain address of CRC table 3285 * @param tmp scratch register 3286 */ 3287 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, 3288 Register table0, Register table1, Register table2, Register table3, 3289 Register tmp, Register tmp2, Register tmp3) { 3290 Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit; 3291 unsigned long offset; 3292 3293 if (UseCRC32) { 3294 kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, 
table3); 3295 return; 3296 } 3297 3298 mvnw(crc, crc); 3299 3300 adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset); 3301 if (offset) add(table0, table0, offset); 3302 add(table1, table0, 1*256*sizeof(juint)); 3303 add(table2, table0, 2*256*sizeof(juint)); 3304 add(table3, table0, 3*256*sizeof(juint)); 3305 3306 if (UseNeon) { 3307 cmp(len, 64); 3308 br(Assembler::LT, L_by16); 3309 eor(v16, T16B, v16, v16); 3310 3311 Label L_fold; 3312 3313 add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants 3314 3315 ld1(v0, v1, T2D, post(buf, 32)); 3316 ld1r(v4, T2D, post(tmp, 8)); 3317 ld1r(v5, T2D, post(tmp, 8)); 3318 ld1r(v6, T2D, post(tmp, 8)); 3319 ld1r(v7, T2D, post(tmp, 8)); 3320 mov(v16, T4S, 0, crc); 3321 3322 eor(v0, T16B, v0, v16); 3323 sub(len, len, 64); 3324 3325 BIND(L_fold); 3326 pmull(v22, T8H, v0, v5, T8B); 3327 pmull(v20, T8H, v0, v7, T8B); 3328 pmull(v23, T8H, v0, v4, T8B); 3329 pmull(v21, T8H, v0, v6, T8B); 3330 3331 pmull2(v18, T8H, v0, v5, T16B); 3332 pmull2(v16, T8H, v0, v7, T16B); 3333 pmull2(v19, T8H, v0, v4, T16B); 3334 pmull2(v17, T8H, v0, v6, T16B); 3335 3336 uzp1(v24, T8H, v20, v22); 3337 uzp2(v25, T8H, v20, v22); 3338 eor(v20, T16B, v24, v25); 3339 3340 uzp1(v26, T8H, v16, v18); 3341 uzp2(v27, T8H, v16, v18); 3342 eor(v16, T16B, v26, v27); 3343 3344 ushll2(v22, T4S, v20, T8H, 8); 3345 ushll(v20, T4S, v20, T4H, 8); 3346 3347 ushll2(v18, T4S, v16, T8H, 8); 3348 ushll(v16, T4S, v16, T4H, 8); 3349 3350 eor(v22, T16B, v23, v22); 3351 eor(v18, T16B, v19, v18); 3352 eor(v20, T16B, v21, v20); 3353 eor(v16, T16B, v17, v16); 3354 3355 uzp1(v17, T2D, v16, v20); 3356 uzp2(v21, T2D, v16, v20); 3357 eor(v17, T16B, v17, v21); 3358 3359 ushll2(v20, T2D, v17, T4S, 16); 3360 ushll(v16, T2D, v17, T2S, 16); 3361 3362 eor(v20, T16B, v20, v22); 3363 eor(v16, T16B, v16, v18); 3364 3365 uzp1(v17, T2D, v20, v16); 3366 uzp2(v21, T2D, v20, v16); 3367 eor(v28, T16B, v17, v21); 3368 3369 pmull(v22, T8H, v1, v5, T8B); 3370 pmull(v20, T8H, v1, 
v7, T8B); 3371 pmull(v23, T8H, v1, v4, T8B); 3372 pmull(v21, T8H, v1, v6, T8B); 3373 3374 pmull2(v18, T8H, v1, v5, T16B); 3375 pmull2(v16, T8H, v1, v7, T16B); 3376 pmull2(v19, T8H, v1, v4, T16B); 3377 pmull2(v17, T8H, v1, v6, T16B); 3378 3379 ld1(v0, v1, T2D, post(buf, 32)); 3380 3381 uzp1(v24, T8H, v20, v22); 3382 uzp2(v25, T8H, v20, v22); 3383 eor(v20, T16B, v24, v25); 3384 3385 uzp1(v26, T8H, v16, v18); 3386 uzp2(v27, T8H, v16, v18); 3387 eor(v16, T16B, v26, v27); 3388 3389 ushll2(v22, T4S, v20, T8H, 8); 3390 ushll(v20, T4S, v20, T4H, 8); 3391 3392 ushll2(v18, T4S, v16, T8H, 8); 3393 ushll(v16, T4S, v16, T4H, 8); 3394 3395 eor(v22, T16B, v23, v22); 3396 eor(v18, T16B, v19, v18); 3397 eor(v20, T16B, v21, v20); 3398 eor(v16, T16B, v17, v16); 3399 3400 uzp1(v17, T2D, v16, v20); 3401 uzp2(v21, T2D, v16, v20); 3402 eor(v16, T16B, v17, v21); 3403 3404 ushll2(v20, T2D, v16, T4S, 16); 3405 ushll(v16, T2D, v16, T2S, 16); 3406 3407 eor(v20, T16B, v22, v20); 3408 eor(v16, T16B, v16, v18); 3409 3410 uzp1(v17, T2D, v20, v16); 3411 uzp2(v21, T2D, v20, v16); 3412 eor(v20, T16B, v17, v21); 3413 3414 shl(v16, T2D, v28, 1); 3415 shl(v17, T2D, v20, 1); 3416 3417 eor(v0, T16B, v0, v16); 3418 eor(v1, T16B, v1, v17); 3419 3420 subs(len, len, 32); 3421 br(Assembler::GE, L_fold); 3422 3423 mov(crc, 0); 3424 mov(tmp, v0, T1D, 0); 3425 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3426 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3427 mov(tmp, v0, T1D, 1); 3428 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3429 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3430 mov(tmp, v1, T1D, 0); 3431 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3432 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3433 mov(tmp, v1, T1D, 1); 3434 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3435 update_word_crc32(crc, tmp, tmp2, 
table0, table1, table2, table3, true); 3436 3437 add(len, len, 32); 3438 } 3439 3440 BIND(L_by16); 3441 subs(len, len, 16); 3442 br(Assembler::GE, L_by16_loop); 3443 adds(len, len, 16-4); 3444 br(Assembler::GE, L_by4_loop); 3445 adds(len, len, 4); 3446 br(Assembler::GT, L_by1_loop); 3447 b(L_exit); 3448 3449 BIND(L_by4_loop); 3450 ldrw(tmp, Address(post(buf, 4))); 3451 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3); 3452 subs(len, len, 4); 3453 br(Assembler::GE, L_by4_loop); 3454 adds(len, len, 4); 3455 br(Assembler::LE, L_exit); 3456 BIND(L_by1_loop); 3457 subs(len, len, 1); 3458 ldrb(tmp, Address(post(buf, 1))); 3459 update_byte_crc32(crc, tmp, table0); 3460 br(Assembler::GT, L_by1_loop); 3461 b(L_exit); 3462 3463 align(CodeEntryAlignment); 3464 BIND(L_by16_loop); 3465 subs(len, len, 16); 3466 ldp(tmp, tmp3, Address(post(buf, 16))); 3467 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3468 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3469 update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false); 3470 update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true); 3471 br(Assembler::GE, L_by16_loop); 3472 adds(len, len, 16-4); 3473 br(Assembler::GE, L_by4_loop); 3474 adds(len, len, 4); 3475 br(Assembler::GT, L_by1_loop); 3476 BIND(L_exit); 3477 mvnw(crc, crc); 3478 } 3479 3480 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf, 3481 Register len, Register tmp0, Register tmp1, Register tmp2, 3482 Register tmp3) { 3483 Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit; 3484 assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3); 3485 3486 subs(len, len, 128); 3487 br(Assembler::GE, CRC_by64_pre); 3488 BIND(CRC_less64); 3489 adds(len, len, 128-32); 3490 br(Assembler::GE, CRC_by32_loop); 3491 BIND(CRC_less32); 3492 adds(len, len, 32-4); 3493 br(Assembler::GE, CRC_by4_loop); 
3494 adds(len, len, 4); 3495 br(Assembler::GT, CRC_by1_loop); 3496 b(L_exit); 3497 3498 BIND(CRC_by32_loop); 3499 ldp(tmp0, tmp1, Address(post(buf, 16))); 3500 subs(len, len, 32); 3501 crc32cx(crc, crc, tmp0); 3502 ldr(tmp2, Address(post(buf, 8))); 3503 crc32cx(crc, crc, tmp1); 3504 ldr(tmp3, Address(post(buf, 8))); 3505 crc32cx(crc, crc, tmp2); 3506 crc32cx(crc, crc, tmp3); 3507 br(Assembler::GE, CRC_by32_loop); 3508 cmn(len, 32); 3509 br(Assembler::NE, CRC_less32); 3510 b(L_exit); 3511 3512 BIND(CRC_by4_loop); 3513 ldrw(tmp0, Address(post(buf, 4))); 3514 subs(len, len, 4); 3515 crc32cw(crc, crc, tmp0); 3516 br(Assembler::GE, CRC_by4_loop); 3517 adds(len, len, 4); 3518 br(Assembler::LE, L_exit); 3519 BIND(CRC_by1_loop); 3520 ldrb(tmp0, Address(post(buf, 1))); 3521 subs(len, len, 1); 3522 crc32cb(crc, crc, tmp0); 3523 br(Assembler::GT, CRC_by1_loop); 3524 b(L_exit); 3525 3526 BIND(CRC_by64_pre); 3527 sub(buf, buf, 8); 3528 ldp(tmp0, tmp1, Address(buf, 8)); 3529 crc32cx(crc, crc, tmp0); 3530 ldr(tmp2, Address(buf, 24)); 3531 crc32cx(crc, crc, tmp1); 3532 ldr(tmp3, Address(buf, 32)); 3533 crc32cx(crc, crc, tmp2); 3534 ldr(tmp0, Address(buf, 40)); 3535 crc32cx(crc, crc, tmp3); 3536 ldr(tmp1, Address(buf, 48)); 3537 crc32cx(crc, crc, tmp0); 3538 ldr(tmp2, Address(buf, 56)); 3539 crc32cx(crc, crc, tmp1); 3540 ldr(tmp3, Address(pre(buf, 64))); 3541 3542 b(CRC_by64_loop); 3543 3544 align(CodeEntryAlignment); 3545 BIND(CRC_by64_loop); 3546 subs(len, len, 64); 3547 crc32cx(crc, crc, tmp2); 3548 ldr(tmp0, Address(buf, 8)); 3549 crc32cx(crc, crc, tmp3); 3550 ldr(tmp1, Address(buf, 16)); 3551 crc32cx(crc, crc, tmp0); 3552 ldr(tmp2, Address(buf, 24)); 3553 crc32cx(crc, crc, tmp1); 3554 ldr(tmp3, Address(buf, 32)); 3555 crc32cx(crc, crc, tmp2); 3556 ldr(tmp0, Address(buf, 40)); 3557 crc32cx(crc, crc, tmp3); 3558 ldr(tmp1, Address(buf, 48)); 3559 crc32cx(crc, crc, tmp0); 3560 ldr(tmp2, Address(buf, 56)); 3561 crc32cx(crc, crc, tmp1); 3562 ldr(tmp3, Address(pre(buf, 64))); 3563 
br(Assembler::GE, CRC_by64_loop); 3564 3565 // post-loop 3566 crc32cx(crc, crc, tmp2); 3567 crc32cx(crc, crc, tmp3); 3568 3569 sub(len, len, 64); 3570 add(buf, buf, 8); 3571 cmn(len, 128); 3572 br(Assembler::NE, CRC_less64); 3573 BIND(L_exit); 3574 } 3575 3576 /** 3577 * @param crc register containing existing CRC (32-bit) 3578 * @param buf register pointing to input byte buffer (byte*) 3579 * @param len register containing number of bytes 3580 * @param table register that will contain address of CRC table 3581 * @param tmp scratch register 3582 */ 3583 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len, 3584 Register table0, Register table1, Register table2, Register table3, 3585 Register tmp, Register tmp2, Register tmp3) { 3586 kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3); 3587 } 3588 3589 3590 SkipIfEqual::SkipIfEqual( 3591 MacroAssembler* masm, const bool* flag_addr, bool value) { 3592 _masm = masm; 3593 unsigned long offset; 3594 _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset); 3595 _masm->ldrb(rscratch1, Address(rscratch1, offset)); 3596 _masm->cbzw(rscratch1, _label); 3597 } 3598 3599 SkipIfEqual::~SkipIfEqual() { 3600 _masm->bind(_label); 3601 } 3602 3603 void MacroAssembler::addptr(const Address &dst, int32_t src) { 3604 Address adr; 3605 switch(dst.getMode()) { 3606 case Address::base_plus_offset: 3607 // This is the expected mode, although we allow all the other 3608 // forms below. 
3609 adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord); 3610 break; 3611 default: 3612 lea(rscratch2, dst); 3613 adr = Address(rscratch2); 3614 break; 3615 } 3616 ldr(rscratch1, adr); 3617 add(rscratch1, rscratch1, src); 3618 str(rscratch1, adr); 3619 } 3620 3621 void MacroAssembler::cmpptr(Register src1, Address src2) { 3622 unsigned long offset; 3623 adrp(rscratch1, src2, offset); 3624 ldr(rscratch1, Address(rscratch1, offset)); 3625 cmp(src1, rscratch1); 3626 } 3627 3628 void MacroAssembler::cmpoop(Register obj1, Register obj2) { 3629 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3630 bs->obj_equals(this, obj1, obj2); 3631 } 3632 3633 void MacroAssembler::load_klass(Register dst, Register src) { 3634 if (UseCompressedClassPointers) { 3635 ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes())); 3636 decode_klass_not_null(dst); 3637 } else { 3638 ldr(dst, Address(src, oopDesc::klass_offset_in_bytes())); 3639 } 3640 } 3641 3642 // ((OopHandle)result).resolve(); 3643 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) { 3644 // OopHandle::resolve is an indirection. 
3645 access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg); 3646 } 3647 3648 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) { 3649 const int mirror_offset = in_bytes(Klass::java_mirror_offset()); 3650 ldr(dst, Address(rmethod, Method::const_offset())); 3651 ldr(dst, Address(dst, ConstMethod::constants_offset())); 3652 ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes())); 3653 ldr(dst, Address(dst, mirror_offset)); 3654 resolve_oop_handle(dst, tmp); 3655 } 3656 3657 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) { 3658 if (UseCompressedClassPointers) { 3659 ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes())); 3660 if (Universe::narrow_klass_base() == NULL) { 3661 cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift()); 3662 return; 3663 } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0 3664 && Universe::narrow_klass_shift() == 0) { 3665 // Only the bottom 32 bits matter 3666 cmpw(trial_klass, tmp); 3667 return; 3668 } 3669 decode_klass_not_null(tmp); 3670 } else { 3671 ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes())); 3672 } 3673 cmp(trial_klass, tmp); 3674 } 3675 3676 void MacroAssembler::load_prototype_header(Register dst, Register src) { 3677 load_klass(dst, src); 3678 ldr(dst, Address(dst, Klass::prototype_header_offset())); 3679 } 3680 3681 void MacroAssembler::store_klass(Register dst, Register src) { 3682 // FIXME: Should this be a store release? concurrent gcs assumes 3683 // klass length is valid if klass field is not null. 
3684 if (UseCompressedClassPointers) { 3685 encode_klass_not_null(src); 3686 strw(src, Address(dst, oopDesc::klass_offset_in_bytes())); 3687 } else { 3688 str(src, Address(dst, oopDesc::klass_offset_in_bytes())); 3689 } 3690 } 3691 3692 void MacroAssembler::store_klass_gap(Register dst, Register src) { 3693 if (UseCompressedClassPointers) { 3694 // Store to klass gap in destination 3695 strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes())); 3696 } 3697 } 3698 3699 // Algorithm must match CompressedOops::encode. 3700 void MacroAssembler::encode_heap_oop(Register d, Register s) { 3701 #ifdef ASSERT 3702 verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?"); 3703 #endif 3704 verify_oop(s, "broken oop in encode_heap_oop"); 3705 if (Universe::narrow_oop_base() == NULL) { 3706 if (Universe::narrow_oop_shift() != 0) { 3707 assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3708 lsr(d, s, LogMinObjAlignmentInBytes); 3709 } else { 3710 mov(d, s); 3711 } 3712 } else { 3713 subs(d, s, rheapbase); 3714 csel(d, d, zr, Assembler::HS); 3715 lsr(d, d, LogMinObjAlignmentInBytes); 3716 3717 /* Old algorithm: is this any worse? 
3718 Label nonnull; 3719 cbnz(r, nonnull); 3720 sub(r, r, rheapbase); 3721 bind(nonnull); 3722 lsr(r, r, LogMinObjAlignmentInBytes); 3723 */ 3724 } 3725 } 3726 3727 void MacroAssembler::encode_heap_oop_not_null(Register r) { 3728 #ifdef ASSERT 3729 verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?"); 3730 if (CheckCompressedOops) { 3731 Label ok; 3732 cbnz(r, ok); 3733 stop("null oop passed to encode_heap_oop_not_null"); 3734 bind(ok); 3735 } 3736 #endif 3737 verify_oop(r, "broken oop in encode_heap_oop_not_null"); 3738 if (Universe::narrow_oop_base() != NULL) { 3739 sub(r, r, rheapbase); 3740 } 3741 if (Universe::narrow_oop_shift() != 0) { 3742 assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3743 lsr(r, r, LogMinObjAlignmentInBytes); 3744 } 3745 } 3746 3747 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) { 3748 #ifdef ASSERT 3749 verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?"); 3750 if (CheckCompressedOops) { 3751 Label ok; 3752 cbnz(src, ok); 3753 stop("null oop passed to encode_heap_oop_not_null2"); 3754 bind(ok); 3755 } 3756 #endif 3757 verify_oop(src, "broken oop in encode_heap_oop_not_null2"); 3758 3759 Register data = src; 3760 if (Universe::narrow_oop_base() != NULL) { 3761 sub(dst, src, rheapbase); 3762 data = dst; 3763 } 3764 if (Universe::narrow_oop_shift() != 0) { 3765 assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3766 lsr(dst, data, LogMinObjAlignmentInBytes); 3767 data = dst; 3768 } 3769 if (data == src) 3770 mov(dst, src); 3771 } 3772 3773 void MacroAssembler::decode_heap_oop(Register d, Register s) { 3774 #ifdef ASSERT 3775 verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?"); 3776 #endif 3777 if (Universe::narrow_oop_base() == NULL) { 3778 if (Universe::narrow_oop_shift() != 0 || d != s) { 3779 lsl(d, s, Universe::narrow_oop_shift()); 3780 } 3781 } else { 
3782 Label done; 3783 if (d != s) 3784 mov(d, s); 3785 cbz(s, done); 3786 add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes); 3787 bind(done); 3788 } 3789 verify_oop(d, "broken oop in decode_heap_oop"); 3790 } 3791 3792 void MacroAssembler::decode_heap_oop_not_null(Register r) { 3793 assert (UseCompressedOops, "should only be used for compressed headers"); 3794 assert (Universe::heap() != NULL, "java heap should be initialized"); 3795 // Cannot assert, unverified entry point counts instructions (see .ad file) 3796 // vtableStubs also counts instructions in pd_code_size_limit. 3797 // Also do not verify_oop as this is called by verify_oop. 3798 if (Universe::narrow_oop_shift() != 0) { 3799 assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3800 if (Universe::narrow_oop_base() != NULL) { 3801 add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes); 3802 } else { 3803 add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes); 3804 } 3805 } else { 3806 assert (Universe::narrow_oop_base() == NULL, "sanity"); 3807 } 3808 } 3809 3810 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) { 3811 assert (UseCompressedOops, "should only be used for compressed headers"); 3812 assert (Universe::heap() != NULL, "java heap should be initialized"); 3813 // Cannot assert, unverified entry point counts instructions (see .ad file) 3814 // vtableStubs also counts instructions in pd_code_size_limit. 3815 // Also do not verify_oop as this is called by verify_oop. 
3816 if (Universe::narrow_oop_shift() != 0) { 3817 assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3818 if (Universe::narrow_oop_base() != NULL) { 3819 add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes); 3820 } else { 3821 add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes); 3822 } 3823 } else { 3824 assert (Universe::narrow_oop_base() == NULL, "sanity"); 3825 if (dst != src) { 3826 mov(dst, src); 3827 } 3828 } 3829 } 3830 3831 void MacroAssembler::encode_klass_not_null(Register dst, Register src) { 3832 if (Universe::narrow_klass_base() == NULL) { 3833 if (Universe::narrow_klass_shift() != 0) { 3834 assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); 3835 lsr(dst, src, LogKlassAlignmentInBytes); 3836 } else { 3837 if (dst != src) mov(dst, src); 3838 } 3839 return; 3840 } 3841 3842 if (use_XOR_for_compressed_class_base) { 3843 if (Universe::narrow_klass_shift() != 0) { 3844 eor(dst, src, (uint64_t)Universe::narrow_klass_base()); 3845 lsr(dst, dst, LogKlassAlignmentInBytes); 3846 } else { 3847 eor(dst, src, (uint64_t)Universe::narrow_klass_base()); 3848 } 3849 return; 3850 } 3851 3852 if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0 3853 && Universe::narrow_klass_shift() == 0) { 3854 movw(dst, src); 3855 return; 3856 } 3857 3858 #ifdef ASSERT 3859 verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?"); 3860 #endif 3861 3862 Register rbase = dst; 3863 if (dst == src) rbase = rheapbase; 3864 mov(rbase, (uint64_t)Universe::narrow_klass_base()); 3865 sub(dst, src, rbase); 3866 if (Universe::narrow_klass_shift() != 0) { 3867 assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); 3868 lsr(dst, dst, LogKlassAlignmentInBytes); 3869 } 3870 if (dst == src) reinit_heapbase(); 3871 } 3872 3873 void MacroAssembler::encode_klass_not_null(Register r) { 3874 encode_klass_not_null(r, r); 3875 } 3876 3877 void 
MacroAssembler::decode_klass_not_null(Register dst, Register src) { 3878 Register rbase = dst; 3879 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 3880 3881 if (Universe::narrow_klass_base() == NULL) { 3882 if (Universe::narrow_klass_shift() != 0) { 3883 assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); 3884 lsl(dst, src, LogKlassAlignmentInBytes); 3885 } else { 3886 if (dst != src) mov(dst, src); 3887 } 3888 return; 3889 } 3890 3891 if (use_XOR_for_compressed_class_base) { 3892 if (Universe::narrow_klass_shift() != 0) { 3893 lsl(dst, src, LogKlassAlignmentInBytes); 3894 eor(dst, dst, (uint64_t)Universe::narrow_klass_base()); 3895 } else { 3896 eor(dst, src, (uint64_t)Universe::narrow_klass_base()); 3897 } 3898 return; 3899 } 3900 3901 if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0 3902 && Universe::narrow_klass_shift() == 0) { 3903 if (dst != src) 3904 movw(dst, src); 3905 movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32); 3906 return; 3907 } 3908 3909 // Cannot assert, unverified entry point counts instructions (see .ad file) 3910 // vtableStubs also counts instructions in pd_code_size_limit. 3911 // Also do not verify_oop as this is called by verify_oop. 
3912 if (dst == src) rbase = rheapbase; 3913 mov(rbase, (uint64_t)Universe::narrow_klass_base()); 3914 if (Universe::narrow_klass_shift() != 0) { 3915 assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); 3916 add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes); 3917 } else { 3918 add(dst, rbase, src); 3919 } 3920 if (dst == src) reinit_heapbase(); 3921 } 3922 3923 void MacroAssembler::decode_klass_not_null(Register r) { 3924 decode_klass_not_null(r, r); 3925 } 3926 3927 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) { 3928 #ifdef ASSERT 3929 { 3930 ThreadInVMfromUnknown tiv; 3931 assert (UseCompressedOops, "should only be used for compressed oops"); 3932 assert (Universe::heap() != NULL, "java heap should be initialized"); 3933 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 3934 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop"); 3935 } 3936 #endif 3937 int oop_index = oop_recorder()->find_index(obj); 3938 InstructionMark im(this); 3939 RelocationHolder rspec = oop_Relocation::spec(oop_index); 3940 code_section()->relocate(inst_mark(), rspec); 3941 movz(dst, 0xDEAD, 16); 3942 movk(dst, 0xBEEF); 3943 } 3944 3945 void MacroAssembler::set_narrow_klass(Register dst, Klass* k) { 3946 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 3947 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 3948 int index = oop_recorder()->find_index(k); 3949 assert(! 
Universe::heap()->is_in_reserved(k), "should not be an oop"); 3950 3951 InstructionMark im(this); 3952 RelocationHolder rspec = metadata_Relocation::spec(index); 3953 code_section()->relocate(inst_mark(), rspec); 3954 narrowKlass nk = Klass::encode_klass(k); 3955 movz(dst, (nk >> 16), 16); 3956 movk(dst, nk & 0xffff); 3957 } 3958 3959 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, 3960 Register dst, Address src, 3961 Register tmp1, Register thread_tmp) { 3962 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3963 decorators = AccessInternal::decorator_fixup(decorators); 3964 bool as_raw = (decorators & AS_RAW) != 0; 3965 if (as_raw) { 3966 bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp); 3967 } else { 3968 bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp); 3969 } 3970 } 3971 3972 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, 3973 Address dst, Register src, 3974 Register tmp1, Register thread_tmp) { 3975 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3976 decorators = AccessInternal::decorator_fixup(decorators); 3977 bool as_raw = (decorators & AS_RAW) != 0; 3978 if (as_raw) { 3979 bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp); 3980 } else { 3981 bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp); 3982 } 3983 } 3984 3985 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) { 3986 if ((decorators & ACCESS_READ) == 0) { 3987 decorators |= ACCESS_WRITE; 3988 } 3989 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3990 return bs->resolve(this, decorators, obj); 3991 } 3992 3993 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1, 3994 Register thread_tmp, DecoratorSet decorators) { 3995 access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp); 3996 } 3997 3998 
void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1, 3999 Register thread_tmp, DecoratorSet decorators) { 4000 access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp); 4001 } 4002 4003 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1, 4004 Register thread_tmp, DecoratorSet decorators) { 4005 access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp); 4006 } 4007 4008 // Used for storing NULLs. 4009 void MacroAssembler::store_heap_oop_null(Address dst) { 4010 access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg); 4011 } 4012 4013 Address MacroAssembler::allocate_metadata_address(Metadata* obj) { 4014 assert(oop_recorder() != NULL, "this assembler needs a Recorder"); 4015 int index = oop_recorder()->allocate_metadata_index(obj); 4016 RelocationHolder rspec = metadata_Relocation::spec(index); 4017 return Address((address)obj, rspec); 4018 } 4019 4020 // Move an oop into a register. immediate is true if we want 4021 // immediate instrcutions, i.e. we are not going to patch this 4022 // instruction while the code is being executed by another thread. In 4023 // that case we can use move immediates rather than the constant pool. 4024 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) { 4025 int oop_index; 4026 if (obj == NULL) { 4027 oop_index = oop_recorder()->allocate_oop_index(obj); 4028 } else { 4029 #ifdef ASSERT 4030 { 4031 ThreadInVMfromUnknown tiv; 4032 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop"); 4033 } 4034 #endif 4035 oop_index = oop_recorder()->find_index(obj); 4036 } 4037 RelocationHolder rspec = oop_Relocation::spec(oop_index); 4038 if (! 
immediate) { 4039 address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address 4040 ldr_constant(dst, Address(dummy, rspec)); 4041 } else 4042 mov(dst, Address((address)obj, rspec)); 4043 } 4044 4045 // Move a metadata address into a register. 4046 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) { 4047 int oop_index; 4048 if (obj == NULL) { 4049 oop_index = oop_recorder()->allocate_metadata_index(obj); 4050 } else { 4051 oop_index = oop_recorder()->find_index(obj); 4052 } 4053 RelocationHolder rspec = metadata_Relocation::spec(oop_index); 4054 mov(dst, Address((address)obj, rspec)); 4055 } 4056 4057 Address MacroAssembler::constant_oop_address(jobject obj) { 4058 #ifdef ASSERT 4059 { 4060 ThreadInVMfromUnknown tiv; 4061 assert(oop_recorder() != NULL, "this assembler needs an OopRecorder"); 4062 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop"); 4063 } 4064 #endif 4065 int oop_index = oop_recorder()->find_index(obj); 4066 return Address((address)obj, oop_Relocation::spec(oop_index)); 4067 } 4068 4069 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes. 
// Thin wrapper: TLAB allocation is delegated to the active barrier set's
// assembler with the arguments passed through unchanged.
void MacroAssembler::tlab_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Register t2,
                                   Label& slow_case) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
}

// Defines obj, preserves var_size_in_bytes
// Thin wrapper: eden allocation is delegated to the active barrier set's
// assembler with the arguments passed through unchanged.
void MacroAssembler::eden_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Label& slow_case) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
}

// Zero words; len is in bytes
// Destroys all registers except addr
// len must be a nonzero multiple of wordSize
//
// The registers written below are len, t1, rscratch1 and rscratch2; addr
// is only read. The multiple-of-word requirement is checked at run time in
// ASSERT builds only.
void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
  assert_different_registers(addr, len, t1, rscratch1, rscratch2);

#ifdef ASSERT
  { Label L;
    tst(len, BytesPerWord - 1);
    br(Assembler::EQ, L);
    stop("len is not a multiple of BytesPerWord");
    bind(L);
  }
#endif

#ifndef PRODUCT
  block_comment("zero memory");
#endif

  Label loop;
  Label entry;

//  Algorithm:
//
//    scratch1 = cnt & 7;
//    cnt -= scratch1;
//    p += scratch1;
//    switch (scratch1) {
//      do {
//        cnt -= 8;
//          p[-8] = 0;
//        case 7:
//          p[-7] = 0;
//        case 6:
//          p[-6] = 0;
//          // ...
//        case 1:
//          p[-1] = 0;
//        case 0:
//          p += 8;
//      } while (cnt);
//    }

  const int unroll = 8; // Number of str(zr) instructions we'll unroll

  lsr(len, len, LogBytesPerWord);    // byte count -> word count (cnt)
  andr(rscratch1, len, unroll - 1);  // rscratch1 = cnt % unroll
  sub(len, len, rscratch1);          // cnt -= rscratch1 (now a multiple of unroll)
  // t1 always points to the end of the region we're about to zero
  add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
  // Computed branch into the unrolled stores: start rscratch1 instructions
  // (LSL 2, i.e. rscratch1 * 4 bytes) before 'entry', so that exactly
  // rscratch1 stores run on the first pass.
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
  br(rscratch2);
  bind(loop);
  sub(len, len, unroll);
  // Assembler::str is called with explicit qualification; the jump above
  // requires that each iteration emit one instruction.
  for (int i = -unroll; i < 0; i++)
    Assembler::str(zr, Address(t1, i * wordSize));
  bind(entry);
  add(t1, t1, unroll * wordSize);
  cbnz(len, loop);
}

// ASSERT-only sanity check of the current thread's TLAB pointers; emits code
// only when both UseTLAB and VerifyOops are set. Checks start <= top and
// top <= end (unsigned), stopping the VM otherwise. rscratch1/rscratch2 are
// saved on the stack around the check.
void MacroAssembler::verify_tlab() {
#ifdef ASSERT
  if (UseTLAB && VerifyOops) {
    Label next, ok;

    // Save the scratch registers (pre-decrement sp by 16).
    stp(rscratch2, rscratch1, Address(pre(sp, -16)));

    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
    cmp(rscratch2, rscratch1);
    br(Assembler::HS, next);
    STOP("assert(top >= start)");
    should_not_reach_here();

    bind(next);
    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
    cmp(rscratch2, rscratch1);
    br(Assembler::HS, ok);
    STOP("assert(top <= end)");
    should_not_reach_here();

    bind(ok);
    // Restore the scratch registers (post-increment sp by 16).
    ldp(rscratch2, rscratch1, Address(post(sp, 16)));
  }
#endif
}

// Writes to stack successive pages until offset reached to check for
// stack overflow + shadow pages.  This clobbers tmp.
4182 void MacroAssembler::bang_stack_size(Register size, Register tmp) { 4183 assert_different_registers(tmp, size, rscratch1); 4184 mov(tmp, sp); 4185 // Bang stack for total size given plus shadow page size. 4186 // Bang one page at a time because large size can bang beyond yellow and 4187 // red zones. 4188 Label loop; 4189 mov(rscratch1, os::vm_page_size()); 4190 bind(loop); 4191 lea(tmp, Address(tmp, -os::vm_page_size())); 4192 subsw(size, size, rscratch1); 4193 str(size, Address(tmp)); 4194 br(Assembler::GT, loop); 4195 4196 // Bang down shadow pages too. 4197 // At this point, (tmp-0) is the last address touched, so don't 4198 // touch it again. (It was touched as (tmp-pagesize) but then tmp 4199 // was post-decremented.) Skip this address by starting at i=1, and 4200 // touch a few more pages below. N.B. It is important to touch all 4201 // the way down to and including i=StackShadowPages. 4202 for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) { 4203 // this could be any sized move but this is can be a debugging crumb 4204 // so the bigger the better. 4205 lea(tmp, Address(tmp, -os::vm_page_size())); 4206 str(size, Address(tmp)); 4207 } 4208 } 4209 4210 4211 // Move the address of the polling page into dest. 4212 void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) { 4213 if (SafepointMechanism::uses_thread_local_poll()) { 4214 ldr(dest, Address(rthread, Thread::polling_page_offset())); 4215 } else { 4216 unsigned long off; 4217 adrp(dest, Address(page, rtype), off); 4218 assert(off == 0, "polling page must be page aligned"); 4219 } 4220 } 4221 4222 // Move the address of the polling page into r, then read the polling 4223 // page. 4224 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) { 4225 get_polling_page(r, page, rtype); 4226 return read_polling_page(r, rtype); 4227 } 4228 4229 // Read the polling page. 
// The address of the polling page must
// already be in r.
// Emits a 32-bit load from [r] into zr, tagged with a relocation of type
// rtype, and returns the instruction mark of that load.
address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
  InstructionMark im(this);
  code_section()->relocate(inst_mark(), rtype);
  ldrw(zr, Address(r, 0));
  return inst_mark();
}

// Load the 4K-page address of dest into reg1 and return the low 12 bits of
// the target in byte_offset. dest must be a literal address.
//
// If the target page is within +/- 2^20 pages of both ends of the code
// cache, a single ADRP is emitted. Otherwise an ADRP to an address built
// from the target's low 32 bits and bits 32..47 of the current pc is emitted
// and followed by a MOVK that installs target >> 32 at bit position 32.
void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
  // NOTE(review): rtype is not used anywhere below.
  relocInfo::relocType rtype = dest.rspec().reloc()->type();
  unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
  unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
  unsigned long dest_page = (unsigned long)dest.target() >> 12;
  // Page distances from the lowest and highest code cache pages to the target.
  long offset_low = dest_page - low_page;
  long offset_high = dest_page - high_page;

  assert(is_valid_AArch64_address(dest.target()), "bad address");
  assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");

  InstructionMark im(this);
  code_section()->relocate(inst_mark(), dest.rspec());
  // 8143067: Ensure that the adrp can reach the dest from anywhere within
  // the code cache so that if it is relocated we know it will still reach
  if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
    _adrp(reg1, dest.target());
  } else {
    unsigned long target = (unsigned long)dest.target();
    unsigned long adrp_target
      = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);

    _adrp(reg1, (address)adrp_target);
    movk(reg1, target >> 32, 32);
  }
  byte_offset = (unsigned long)dest.target() & 0xfff;
}

// Load the card table's byte_map_base into reg.
// The active barrier set is cast to CardTableBarrierSet without a check.
void MacroAssembler::load_byte_map_base(Register reg) {
  jbyte *byte_map_base =
    ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();

  if (is_valid_AArch64_address((address)byte_map_base)) {
    // Strictly speaking the byte_map_base isn't an address at all,
    // and it might even be negative.
    unsigned long offset;
    adrp(reg, ExternalAddress((address)byte_map_base), offset);
    // We expect offset to be zero with most collectors.
    if (offset != 0) {
      add(reg, reg, offset);
    }
  } else {
    mov(reg, (uint64_t)byte_map_base);
  }
}

// Allocate a frame of framesize bytes and save rfp/lr in its top two words.
// Three encodings are chosen by size:
//   - small:  one sub of sp, then stp at an immediate offset from sp;
//   - medium: push rfp/lr with pre-index, then sub the remainder as an immediate;
//   - large:  as medium, but the remainder goes through rscratch1.
// rfp is only updated when PreserveFramePointer is set.
void MacroAssembler::build_frame(int framesize) {
  assert(framesize > 0, "framesize must be > 0");
  if (framesize < ((1 << 9) + 2 * wordSize)) {
    sub(sp, sp, framesize);
    stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
    if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
  } else {
    stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
    if (PreserveFramePointer) mov(rfp, sp);
    if (framesize < ((1 << 12) + 2 * wordSize))
      sub(sp, sp, framesize - 2 * wordSize);
    else {
      mov(rscratch1, framesize - 2 * wordSize);
      sub(sp, sp, rscratch1);
    }
  }
}

// Inverse of build_frame: reload rfp/lr and release framesize bytes, using
// the same three size classes.
void MacroAssembler::remove_frame(int framesize) {
  assert(framesize > 0, "framesize must be > 0");
  if (framesize < ((1 << 9) + 2 * wordSize)) {
    ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
    add(sp, sp, framesize);
  } else {
    if (framesize < ((1 << 12) + 2 * wordSize))
      add(sp, sp, framesize - 2 * wordSize);
    else {
      mov(rscratch1, framesize - 2 * wordSize);
      add(sp, sp, rscratch1);
    }
    ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
  }
}

// Pointer to a MacroAssembler load instruction; used below to select
// byte/halfword/word loads according to the string encodings.
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

// Search for str1 in str2 and return index or -1
//
// str2/cnt2 is the source string, str1/cnt1 the pattern. ae selects the
// encodings: the pattern is Latin1 for LL and UL, the source is Latin1 for
// LL and LU. icnt1 is -1 when the pattern length is only available in cnt1;
// the values 1, 2, 3 and 4 select code specialised for that pattern length.
// The answer is left in result.
void MacroAssembler::string_indexof(Register str2, Register str1,
                                    Register cnt2, Register cnt1,
                                    Register tmp1, Register tmp2,
                                    Register tmp3, Register tmp4,
                                    Register tmp5, Register tmp6,
                                    int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                         (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                         (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    // result_tmp = cnt2 - cnt1: the last source index at which a match can start.
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, 8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
    cmp(cnt1, 256);
    lsr(tmp1, cnt2, 2);
    // If cnt1 < 256, compare cnt1 with cnt2/4; otherwise force flags to 0000
    // (which satisfies GE), so the stub path is taken in both "too long"
    // and "source too short" cases.
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

// The Boyer Moore algorithm is based on the description here:-
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with 2 shift rules. The 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:-
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has few java-specific optimizations.
//
// #define ASIZE 256
//
//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
//       int i, j;
//       unsigned c;
//       unsigned char bc[ASIZE];
//
//       /* Preprocessing */
//       for (i = 0; i < ASIZE; ++i)
//          bc[i] = m;
//       for (i = 0; i < m - 1; ) {
//          c = x[i];
//          ++i;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef PATTERN_STRING_IS_LATIN1
//          bc[c] = m - i;
//          #else
//          if (c < ASIZE) bc[c] = m - i;
//          #endif
//       }
//
//       /* Searching */
//       j = 0;
//       while (j <= n - m) {
//          c = y[i+j];
//          if (x[m-1] == c)
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//          if (i < 0) return j;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c< 256) always true. Remove branch
//          j += bc[y[j+m-1]];
//          #endif
//          #ifndef PATTERN_STRING_IS_UTF
//          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += 1
//          #endif
//          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
//          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += m
//          #endif
//       }
//    }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    // NOTE(review): cnt1end is declared but not referenced below.
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >=8, so, we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
    // UL case. We'll re-read last character in inner pre-loop code to have
    // single outer pre-loop load
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // amount of bytes stored per instruction
    // The bad-character table (bc[] in the pseudo code) lives on the stack;
    // it is released again on both exits below (before NOMATCH and at BMMATCH).
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    // Fill the table with v0, i.e. every byte = cnt1 (see dup above).
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);

      sub(cnt1tmp, cnt1, 1);
      mov(tmp5, str2);   // remember the start of the source for the result
      add(str2end, str2, result_tmp, LSL, str2_chr_shift);
      sub(ch2, cnt1, 1);
      mov(tmp3, str1);
    // Preprocessing: bc[c] = m - i for the first m - 1 pattern characters.
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        cmp(ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);

      add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
      if (str1_isL == str2_isL) {
        // load last 8 bytes (8LL/4UU symbols)
        ldr(tmp6, Address(tmp6, -wordSize));
      } else {
        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
        // convert Latin1 to UTF. We'll have to wait until load completed, but
        // it's still faster than per-character loads+checks
        lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
        ubfx(ch1, tmp6, 8, 8); // str1[N-2]
        ubfx(ch2, tmp6, 16, 8); // str1[N-3]
        andr(tmp6, tmp6, 0xFF); // str1[N-4]
        orr(ch2, ch1, ch2, LSL, 16);
        orr(tmp6, tmp6, tmp3, LSL, 48);
        orr(tmp6, tmp6, ch2, LSL, 16);
      }
    // Outer search loop: one iteration per candidate position in the source.
    BIND(BMLOOPSTR2);
      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
      if (str1_isL == str2_isL) {
        // re-init tmp3. It's for free because it's executed in parallel with
        // load above. Alternative is to initialize it before loop, but it'll
        // affect performance on in-order systems with 2 or more ld/st pipelines
        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
      }
      if (!isL) { // UU/UL case
        lsl(ch2, cnt1tmp, 1); // offset in bytes
      }
      cmp(tmp3, skipch);
      br(NE, BMSKIP);
      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
      mov(ch1, tmp6);
      if (isL) {
        b(BMLOOPSTR1_AFTER_LOAD);
      } else {
        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
        b(BMLOOPSTR1_CMP);
      }
    // Inner loop: compare the remaining characters from the end backwards.
    BIND(BMLOOPSTR1);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
      cmp(ch1, ch2);
      br(EQ, BMLOOPSTR1);
    // Mismatch: compute the skip distance into result_tmp.
    BIND(BMSKIP);
      if (!isL) {
        // if we've met UTF symbol while searching Latin1 pattern, then we can
        // skip cnt1 symbols
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
        cmp(skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
      add(str2, str2, result_tmp, LSL, str2_chr_shift);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      add(sp, sp, ASIZE);
      b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
      cmp(ch1, ch2);
      br(NE, BMSKIP);
    BIND(BMMATCH);
      // result = (current position - original str2), converted from bytes
      // to characters for a UTF-16 source.
      sub(result, str2, tmp5);
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE);
      b(DONE);

    // Pattern/source sizes unsuitable for BM: call the matching linear stub
    // unless the pattern is shorter than 16 characters.
    BIND(LINEARSTUB);
    cmp(cnt1, 16); // small patterns still should be handled by simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = NULL;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
       assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
    }
    trampoline_call(stub);
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    // NOTE(review): str2tmp is declared but not referenced below.
    Register str2tmp = tmp2;
    Register first = tmp3;

    // The linear scans below point str1/str2 at the END of the region to be
    // scanned and walk a negative byte offset (cnt1_neg / cnt2_neg) up
    // towards zero.

    if (icnt1 == -1)
    {
        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

        cmp(cnt1, str1_isL == str2_isL ? 4 : 2);
        br(LT, DOSHORT);
      BIND(LINEAR_MEDIUM);
        (this->*str1_load_1chr)(first, Address(str1));
        lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
        sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      // Scan the source for the first pattern character.
      BIND(FIRST_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmp(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      // First character matched: compare the rest of the pattern.
      BIND(STR1_LOOP);
        adds(cnt1tmp, cnt1_neg, str1_chr_size);
        add(cnt2tmp, cnt2_neg, str2_chr_size);
        br(GE, MATCH);

      BIND(STR1_NEXT);
        (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        adds(cnt1tmp, cnt1tmp, str1_chr_size);
        add(cnt2tmp, cnt2tmp, str2_chr_size);
        br(LT, STR1_NEXT);
        b(MATCH);

      BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        // cnt1 == 2 falls through to DO2.
        cmp(cnt1, 2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      Label CH1_LOOP;

        // The whole 4-character pattern fits in one load.
        (this->*load_4chr)(ch1, str1);
        sub(result_tmp, cnt2, 4);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(CH1_LOOP);
        (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
      }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      Label CH1_LOOP;

      BIND(DO2);
        // The whole 2-character pattern fits in one load.
        (this->*load_2chr)(ch1, str1);
        if (icnt1 == 2) {
          sub(result_tmp, cnt2, 2);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(CH1_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
        // First two characters in 'first', third character in ch1.
        (this->*load_2chr)(first, str1);
        (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
        if (icnt1 == 3) {
          sub(result_tmp, cnt2, 3);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(FIRST_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmpw(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
        (this->*str1_load_1chr)(ch1, str1);
        cmp(cnt2, 8);
        br(LT, DO1_SHORT);

        sub(result_tmp, cnt2, 8/str2_chr_size);
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
        mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

        // Replicate the single pattern character across all 64 bits of ch1.
        if (str2_isL) {
          orr(ch1, ch1, ch1, LSL, 8);
        }
        orr(ch1, ch1, ch1, LSL, 16);
        orr(ch1, ch1, ch1, LSL, 32);
      // Compare 8 bytes of source at a time: after the eor a matching
      // character becomes a zero element, which is detected with
      // (x - 0x01..) & ~(x | 0x7f..).
      BIND(CH1_LOOP);
        ldr(ch2, Address(str2, cnt2_neg));
        eor(ch2, ch1, ch2);
        sub(tmp1, ch2, tmp3);
        orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
        bics(tmp1, tmp1, tmp2);
        br(NE, HAS_ZERO);
        adds(cnt2_neg, cnt2_neg, 8);
        br(LT, CH1_LOOP);

        // Redo the final word at offset 0 unless the previous step had
        // already moved 8 or more bytes past it.
        cmp(cnt2_neg, 8);
        mov(cnt2_neg, 0);
        br(LT, CH1_LOOP);
        b(NOMATCH);

      BIND(HAS_ZERO);
        // Byte-reverse and count leading zeros to get the bit index of the
        // match; LSR 3 converts that bit index into a byte offset.
        rev(tmp1, tmp1);
        clz(tmp1, tmp1);
        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
        b(MATCH);

      // Fewer than 8 in cnt2: compare one character at a time.
      BIND(DO1_SHORT);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmpw(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // result = result_tmp + (negative byte offset converted to characters).
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

// Find the first occurrence of the 16-bit character ch in the cnt1
// halfwords starting at str1; result receives the index, or -1 if absent.
// Strings of four or more characters are scanned 8 bytes at a time with the
// zero-halfword detection trick, shorter ones one halfword at a time.
// Registers written: ch, str1, cnt1, tmp1, tmp2, tmp3, rscratch1, rscratch2.
void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                         Register ch, Register result,
                                         Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cmp(cnt1, 4);
  br(LT, DO1_SHORT);

  // Replicate ch into all four halfwords.
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  // Point str1 at the last 4-character window and walk a negative byte
  // offset up towards zero.
  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);                  // matching halfwords become zero
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);             // non-zero iff some halfword was zero
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // Redo the final window at offset 0 unless the previous step had already
    // moved 8 or more bytes past it.
    cmp(cnt1_neg, 8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    // Byte-reverse and count leading zeros to get the bit index of the
    // match; LSR 3 converts that bit index into a byte offset.
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  // Fewer than four characters: compare one halfword at a time.
  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // result = result_tmp + (negative byte offset / 2).
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}

// Compare strings.
void MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  const int STUB_THRESHOLD = 64 + 8;
  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ?
wordSize : wordSize/2; 4837 4838 FloatRegister vtmpZ = vtmp1, vtmp = vtmp2; 4839 chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb : 4840 (chr_insn)&MacroAssembler::ldrh; 4841 chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb : 4842 (chr_insn)&MacroAssembler::ldrh; 4843 uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw : 4844 (uxt_insn)&MacroAssembler::uxthw; 4845 4846 BLOCK_COMMENT("string_compare {"); 4847 4848 // Bizzarely, the counts are passed in bytes, regardless of whether they 4849 // are L or U strings, however the result is always in characters. 4850 if (!str1_isL) asrw(cnt1, cnt1, 1); 4851 if (!str2_isL) asrw(cnt2, cnt2, 1); 4852 4853 // Compute the minimum of the string lengths and save the difference. 4854 subsw(result, cnt1, cnt2); 4855 cselw(cnt2, cnt1, cnt2, Assembler::LE); // min 4856 4857 // A very short string 4858 cmpw(cnt2, minCharsInWord); 4859 br(Assembler::LT, SHORT_STRING); 4860 4861 // Compare longwords 4862 // load first parts of strings and finish initialization while loading 4863 { 4864 if (str1_isL == str2_isL) { // LL or UU 4865 ldr(tmp1, Address(str1)); 4866 cmp(str1, str2); 4867 br(Assembler::EQ, DONE); 4868 ldr(tmp2, Address(str2)); 4869 cmp(cnt2, STUB_THRESHOLD); 4870 br(GE, STUB); 4871 subsw(cnt2, cnt2, minCharsInWord); 4872 br(EQ, TAIL_CHECK); 4873 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 4874 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 4875 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 4876 } else if (isLU) { 4877 ldrs(vtmp, Address(str1)); 4878 cmp(str1, str2); 4879 br(Assembler::EQ, DONE); 4880 ldr(tmp2, Address(str2)); 4881 cmp(cnt2, STUB_THRESHOLD); 4882 br(GE, STUB); 4883 subsw(cnt2, cnt2, 4); 4884 br(EQ, TAIL_CHECK); 4885 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 4886 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 4887 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 4888 zip1(vtmp, T8B, vtmp, vtmpZ); 4889 sub(cnt1, zr, 
cnt2, LSL, str1_chr_shift); 4890 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 4891 add(cnt1, cnt1, 4); 4892 fmovd(tmp1, vtmp); 4893 } else { // UL case 4894 ldr(tmp1, Address(str1)); 4895 cmp(str1, str2); 4896 br(Assembler::EQ, DONE); 4897 ldrs(vtmp, Address(str2)); 4898 cmp(cnt2, STUB_THRESHOLD); 4899 br(GE, STUB); 4900 subsw(cnt2, cnt2, 4); 4901 br(EQ, TAIL_CHECK); 4902 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 4903 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 4904 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 4905 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 4906 zip1(vtmp, T8B, vtmp, vtmpZ); 4907 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 4908 add(cnt1, cnt1, 8); 4909 fmovd(tmp2, vtmp); 4910 } 4911 adds(cnt2, cnt2, isUL ? 4 : 8); 4912 br(GE, TAIL); 4913 eor(rscratch2, tmp1, tmp2); 4914 cbnz(rscratch2, DIFFERENCE); 4915 // main loop 4916 bind(NEXT_WORD); 4917 if (str1_isL == str2_isL) { 4918 ldr(tmp1, Address(str1, cnt2)); 4919 ldr(tmp2, Address(str2, cnt2)); 4920 adds(cnt2, cnt2, 8); 4921 } else if (isLU) { 4922 ldrs(vtmp, Address(str1, cnt1)); 4923 ldr(tmp2, Address(str2, cnt2)); 4924 add(cnt1, cnt1, 4); 4925 zip1(vtmp, T8B, vtmp, vtmpZ); 4926 fmovd(tmp1, vtmp); 4927 adds(cnt2, cnt2, 8); 4928 } else { // UL 4929 ldrs(vtmp, Address(str2, cnt2)); 4930 ldr(tmp1, Address(str1, cnt1)); 4931 zip1(vtmp, T8B, vtmp, vtmpZ); 4932 add(cnt1, cnt1, 8); 4933 fmovd(tmp2, vtmp); 4934 adds(cnt2, cnt2, 4); 4935 } 4936 br(GE, TAIL); 4937 4938 eor(rscratch2, tmp1, tmp2); 4939 cbz(rscratch2, NEXT_WORD); 4940 b(DIFFERENCE); 4941 bind(TAIL); 4942 eor(rscratch2, tmp1, tmp2); 4943 cbnz(rscratch2, DIFFERENCE); 4944 // Last longword. In the case where length == 4 we compare the 4945 // same longword twice, but that's still faster than another 4946 // conditional branch. 
4947 if (str1_isL == str2_isL) { 4948 ldr(tmp1, Address(str1)); 4949 ldr(tmp2, Address(str2)); 4950 } else if (isLU) { 4951 ldrs(vtmp, Address(str1)); 4952 ldr(tmp2, Address(str2)); 4953 zip1(vtmp, T8B, vtmp, vtmpZ); 4954 fmovd(tmp1, vtmp); 4955 } else { // UL 4956 ldrs(vtmp, Address(str2)); 4957 ldr(tmp1, Address(str1)); 4958 zip1(vtmp, T8B, vtmp, vtmpZ); 4959 fmovd(tmp2, vtmp); 4960 } 4961 bind(TAIL_CHECK); 4962 eor(rscratch2, tmp1, tmp2); 4963 cbz(rscratch2, DONE); 4964 4965 // Find the first different characters in the longwords and 4966 // compute their difference. 4967 bind(DIFFERENCE); 4968 rev(rscratch2, rscratch2); 4969 clz(rscratch2, rscratch2); 4970 andr(rscratch2, rscratch2, isLL ? -8 : -16); 4971 lsrv(tmp1, tmp1, rscratch2); 4972 (this->*ext_chr)(tmp1, tmp1); 4973 lsrv(tmp2, tmp2, rscratch2); 4974 (this->*ext_chr)(tmp2, tmp2); 4975 subw(result, tmp1, tmp2); 4976 b(DONE); 4977 } 4978 4979 bind(STUB); 4980 RuntimeAddress stub = NULL; 4981 switch(ae) { 4982 case StrIntrinsicNode::LL: 4983 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL()); 4984 break; 4985 case StrIntrinsicNode::UU: 4986 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU()); 4987 break; 4988 case StrIntrinsicNode::LU: 4989 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU()); 4990 break; 4991 case StrIntrinsicNode::UL: 4992 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL()); 4993 break; 4994 default: 4995 ShouldNotReachHere(); 4996 } 4997 assert(stub.target() != NULL, "compare_long_string stub has not been generated"); 4998 trampoline_call(stub); 4999 b(DONE); 5000 5001 bind(SHORT_STRING); 5002 // Is the minimum length zero? 
5003 cbz(cnt2, DONE); 5004 // arrange code to do most branches while loading and loading next characters 5005 // while comparing previous 5006 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 5007 subs(cnt2, cnt2, 1); 5008 br(EQ, SHORT_LAST_INIT); 5009 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 5010 b(SHORT_LOOP_START); 5011 bind(SHORT_LOOP); 5012 subs(cnt2, cnt2, 1); 5013 br(EQ, SHORT_LAST); 5014 bind(SHORT_LOOP_START); 5015 (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size))); 5016 (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size))); 5017 cmp(tmp1, cnt1); 5018 br(NE, SHORT_LOOP_TAIL); 5019 subs(cnt2, cnt2, 1); 5020 br(EQ, SHORT_LAST2); 5021 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 5022 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 5023 cmp(tmp2, rscratch1); 5024 br(EQ, SHORT_LOOP); 5025 sub(result, tmp2, rscratch1); 5026 b(DONE); 5027 bind(SHORT_LOOP_TAIL); 5028 sub(result, tmp1, cnt1); 5029 b(DONE); 5030 bind(SHORT_LAST2); 5031 cmp(tmp2, rscratch1); 5032 br(EQ, DONE); 5033 sub(result, tmp2, rscratch1); 5034 5035 b(DONE); 5036 bind(SHORT_LAST_INIT); 5037 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 5038 bind(SHORT_LAST); 5039 cmp(tmp1, cnt1); 5040 br(EQ, DONE); 5041 sub(result, tmp1, cnt1); 5042 5043 bind(DONE); 5044 5045 BLOCK_COMMENT("} string_compare"); 5046 } 5047 5048 // This method checks if provided byte array contains byte with highest bit set. 5049 void MacroAssembler::has_negatives(Register ary1, Register len, Register result) { 5050 // Simple and most common case of aligned small array which is not at the 5051 // end of memory page is placed here. All other cases are in stub. 
5052 Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE; 5053 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 5054 assert_different_registers(ary1, len, result); 5055 5056 cmpw(len, 0); 5057 br(LE, SET_RESULT); 5058 cmpw(len, 4 * wordSize); 5059 br(GE, STUB_LONG); // size > 32 then go to stub 5060 5061 int shift = 64 - exact_log2(os::vm_page_size()); 5062 lsl(rscratch1, ary1, shift); 5063 mov(rscratch2, (size_t)(4 * wordSize) << shift); 5064 adds(rscratch2, rscratch1, rscratch2); // At end of page? 5065 br(CS, STUB); // at the end of page then go to stub 5066 subs(len, len, wordSize); 5067 br(LT, END); 5068 5069 BIND(LOOP); 5070 ldr(rscratch1, Address(post(ary1, wordSize))); 5071 tst(rscratch1, UPPER_BIT_MASK); 5072 br(NE, SET_RESULT); 5073 subs(len, len, wordSize); 5074 br(GE, LOOP); 5075 cmpw(len, -wordSize); 5076 br(EQ, SET_RESULT); 5077 5078 BIND(END); 5079 ldr(result, Address(ary1)); 5080 sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes 5081 lslv(result, result, len); 5082 tst(result, UPPER_BIT_MASK); 5083 b(SET_RESULT); 5084 5085 BIND(STUB); 5086 RuntimeAddress has_neg = RuntimeAddress(StubRoutines::aarch64::has_negatives()); 5087 assert(has_neg.target() != NULL, "has_negatives stub has not been generated"); 5088 trampoline_call(has_neg); 5089 b(DONE); 5090 5091 BIND(STUB_LONG); 5092 RuntimeAddress has_neg_long = RuntimeAddress( 5093 StubRoutines::aarch64::has_negatives_long()); 5094 assert(has_neg_long.target() != NULL, "has_negatives stub has not been generated"); 5095 trampoline_call(has_neg_long); 5096 b(DONE); 5097 5098 BIND(SET_RESULT); 5099 cset(result, NE); // set true or false 5100 5101 BIND(DONE); 5102 } 5103 5104 void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3, 5105 Register tmp4, Register tmp5, Register result, 5106 Register cnt1, int elem_size) { 5107 Label DONE, SAME; 5108 Register tmp1 = rscratch1; 5109 Register tmp2 = rscratch2; 5110 Register cnt2 = tmp2; // cnt2 only used in array length compare 5111 
int elem_per_word = wordSize/elem_size; 5112 int log_elem_size = exact_log2(elem_size); 5113 int length_offset = arrayOopDesc::length_offset_in_bytes(); 5114 int base_offset 5115 = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE); 5116 int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16); 5117 5118 assert(elem_size == 1 || elem_size == 2, "must be char or byte"); 5119 assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2); 5120 5121 #ifndef PRODUCT 5122 { 5123 const char kind = (elem_size == 2) ? 'U' : 'L'; 5124 char comment[64]; 5125 snprintf(comment, sizeof comment, "array_equals%c{", kind); 5126 BLOCK_COMMENT(comment); 5127 } 5128 #endif 5129 5130 // if (a1 == a2) 5131 // return true; 5132 cmpoop(a1, a2); // May have read barriers for a1 and a2. 5133 br(EQ, SAME); 5134 5135 if (UseSimpleArrayEquals) { 5136 Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL; 5137 // if (a1 == null || a2 == null) 5138 // return false; 5139 // a1 & a2 == 0 means (some-pointer is null) or 5140 // (very-rare-or-even-probably-impossible-pointer-values) 5141 // so, we can save one branch in most cases 5142 tst(a1, a2); 5143 mov(result, false); 5144 br(EQ, A_MIGHT_BE_NULL); 5145 // if (a1.length != a2.length) 5146 // return false; 5147 bind(A_IS_NOT_NULL); 5148 ldrw(cnt1, Address(a1, length_offset)); 5149 ldrw(cnt2, Address(a2, length_offset)); 5150 eorw(tmp5, cnt1, cnt2); 5151 cbnzw(tmp5, DONE); 5152 lea(a1, Address(a1, base_offset)); 5153 lea(a2, Address(a2, base_offset)); 5154 // Check for short strings, i.e. smaller than wordSize. 5155 subs(cnt1, cnt1, elem_per_word); 5156 br(Assembler::LT, SHORT); 5157 // Main 8 byte comparison loop. 5158 bind(NEXT_WORD); { 5159 ldr(tmp1, Address(post(a1, wordSize))); 5160 ldr(tmp2, Address(post(a2, wordSize))); 5161 subs(cnt1, cnt1, elem_per_word); 5162 eor(tmp5, tmp1, tmp2); 5163 cbnz(tmp5, DONE); 5164 } br(GT, NEXT_WORD); 5165 // Last longword. 
In the case where length == 4 we compare the 5166 // same longword twice, but that's still faster than another 5167 // conditional branch. 5168 // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when 5169 // length == 4. 5170 if (log_elem_size > 0) 5171 lsl(cnt1, cnt1, log_elem_size); 5172 ldr(tmp3, Address(a1, cnt1)); 5173 ldr(tmp4, Address(a2, cnt1)); 5174 eor(tmp5, tmp3, tmp4); 5175 cbnz(tmp5, DONE); 5176 b(SAME); 5177 bind(A_MIGHT_BE_NULL); 5178 // in case both a1 and a2 are not-null, proceed with loads 5179 cbz(a1, DONE); 5180 cbz(a2, DONE); 5181 b(A_IS_NOT_NULL); 5182 bind(SHORT); 5183 5184 tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left. 5185 { 5186 ldrw(tmp1, Address(post(a1, 4))); 5187 ldrw(tmp2, Address(post(a2, 4))); 5188 eorw(tmp5, tmp1, tmp2); 5189 cbnzw(tmp5, DONE); 5190 } 5191 bind(TAIL03); 5192 tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left. 5193 { 5194 ldrh(tmp3, Address(post(a1, 2))); 5195 ldrh(tmp4, Address(post(a2, 2))); 5196 eorw(tmp5, tmp3, tmp4); 5197 cbnzw(tmp5, DONE); 5198 } 5199 bind(TAIL01); 5200 if (elem_size == 1) { // Only needed when comparing byte arrays. 5201 tbz(cnt1, 0, SAME); // 0-1 bytes left. 
5202 { 5203 ldrb(tmp1, a1); 5204 ldrb(tmp2, a2); 5205 eorw(tmp5, tmp1, tmp2); 5206 cbnzw(tmp5, DONE); 5207 } 5208 } 5209 } else { 5210 Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT, 5211 CSET_EQ, LAST_CHECK; 5212 mov(result, false); 5213 cbz(a1, DONE); 5214 ldrw(cnt1, Address(a1, length_offset)); 5215 cbz(a2, DONE); 5216 ldrw(cnt2, Address(a2, length_offset)); 5217 // on most CPUs a2 is still "locked"(surprisingly) in ldrw and it's 5218 // faster to perform another branch before comparing a1 and a2 5219 cmp(cnt1, elem_per_word); 5220 br(LE, SHORT); // short or same 5221 ldr(tmp3, Address(pre(a1, base_offset))); 5222 cmp(cnt1, stubBytesThreshold); 5223 br(GE, STUB); 5224 ldr(tmp4, Address(pre(a2, base_offset))); 5225 sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size); 5226 cmp(cnt2, cnt1); 5227 br(NE, DONE); 5228 5229 // Main 16 byte comparison loop with 2 exits 5230 bind(NEXT_DWORD); { 5231 ldr(tmp1, Address(pre(a1, wordSize))); 5232 ldr(tmp2, Address(pre(a2, wordSize))); 5233 subs(cnt1, cnt1, 2 * elem_per_word); 5234 br(LE, TAIL); 5235 eor(tmp4, tmp3, tmp4); 5236 cbnz(tmp4, DONE); 5237 ldr(tmp3, Address(pre(a1, wordSize))); 5238 ldr(tmp4, Address(pre(a2, wordSize))); 5239 cmp(cnt1, elem_per_word); 5240 br(LE, TAIL2); 5241 cmp(tmp1, tmp2); 5242 } br(EQ, NEXT_DWORD); 5243 b(DONE); 5244 5245 bind(TAIL); 5246 eor(tmp4, tmp3, tmp4); 5247 eor(tmp2, tmp1, tmp2); 5248 lslv(tmp2, tmp2, tmp5); 5249 orr(tmp5, tmp4, tmp2); 5250 cmp(tmp5, zr); 5251 b(CSET_EQ); 5252 5253 bind(TAIL2); 5254 eor(tmp2, tmp1, tmp2); 5255 cbnz(tmp2, DONE); 5256 b(LAST_CHECK); 5257 5258 bind(STUB); 5259 ldr(tmp4, Address(pre(a2, base_offset))); 5260 cmp(cnt2, cnt1); 5261 br(NE, DONE); 5262 if (elem_size == 2) { // convert to byte counter 5263 lsl(cnt1, cnt1, 1); 5264 } 5265 eor(tmp5, tmp3, tmp4); 5266 cbnz(tmp5, DONE); 5267 RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals()); 5268 assert(stub.target() != NULL, "array_equals_long stub has not been generated"); 5269 
trampoline_call(stub); 5270 b(DONE); 5271 5272 bind(EARLY_OUT); 5273 // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2) 5274 // so, if a2 == null => return false(0), else return true, so we can return a2 5275 mov(result, a2); 5276 b(DONE); 5277 bind(SHORT); 5278 cmp(cnt2, cnt1); 5279 br(NE, DONE); 5280 cbz(cnt1, SAME); 5281 sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size); 5282 ldr(tmp3, Address(a1, base_offset)); 5283 ldr(tmp4, Address(a2, base_offset)); 5284 bind(LAST_CHECK); 5285 eor(tmp4, tmp3, tmp4); 5286 lslv(tmp5, tmp4, tmp5); 5287 cmp(tmp5, zr); 5288 bind(CSET_EQ); 5289 cset(result, EQ); 5290 b(DONE); 5291 } 5292 5293 bind(SAME); 5294 mov(result, true); 5295 // That's it. 5296 bind(DONE); 5297 5298 BLOCK_COMMENT("} array_equals"); 5299 } 5300 5301 // Compare Strings 5302 5303 // For Strings we're passed the address of the first characters in a1 5304 // and a2 and the length in cnt1. 5305 // elem_size is the element size in bytes: either 1 or 2. 5306 // There are two implementations. For arrays >= 8 bytes, all 5307 // comparisons (including the final one, which may overlap) are 5308 // performed 8 bytes at a time. For strings < 8 bytes, we compare a 5309 // halfword, then a short, and then a byte. 5310 5311 void MacroAssembler::string_equals(Register a1, Register a2, 5312 Register result, Register cnt1, int elem_size) 5313 { 5314 Label SAME, DONE, SHORT, NEXT_WORD; 5315 Register tmp1 = rscratch1; 5316 Register tmp2 = rscratch2; 5317 Register cnt2 = tmp2; // cnt2 only used in array length compare 5318 5319 assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte"); 5320 assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2); 5321 5322 #ifndef PRODUCT 5323 { 5324 const char kind = (elem_size == 2) ? 'U' : 'L'; 5325 char comment[64]; 5326 snprintf(comment, sizeof comment, "{string_equals%c", kind); 5327 BLOCK_COMMENT(comment); 5328 } 5329 #endif 5330 5331 mov(result, false); 5332 5333 // Check for short strings, i.e. 
smaller than wordSize. 5334 subs(cnt1, cnt1, wordSize); 5335 br(Assembler::LT, SHORT); 5336 // Main 8 byte comparison loop. 5337 bind(NEXT_WORD); { 5338 ldr(tmp1, Address(post(a1, wordSize))); 5339 ldr(tmp2, Address(post(a2, wordSize))); 5340 subs(cnt1, cnt1, wordSize); 5341 eor(tmp1, tmp1, tmp2); 5342 cbnz(tmp1, DONE); 5343 } br(GT, NEXT_WORD); 5344 // Last longword. In the case where length == 4 we compare the 5345 // same longword twice, but that's still faster than another 5346 // conditional branch. 5347 // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when 5348 // length == 4. 5349 ldr(tmp1, Address(a1, cnt1)); 5350 ldr(tmp2, Address(a2, cnt1)); 5351 eor(tmp2, tmp1, tmp2); 5352 cbnz(tmp2, DONE); 5353 b(SAME); 5354 5355 bind(SHORT); 5356 Label TAIL03, TAIL01; 5357 5358 tbz(cnt1, 2, TAIL03); // 0-7 bytes left. 5359 { 5360 ldrw(tmp1, Address(post(a1, 4))); 5361 ldrw(tmp2, Address(post(a2, 4))); 5362 eorw(tmp1, tmp1, tmp2); 5363 cbnzw(tmp1, DONE); 5364 } 5365 bind(TAIL03); 5366 tbz(cnt1, 1, TAIL01); // 0-3 bytes left. 5367 { 5368 ldrh(tmp1, Address(post(a1, 2))); 5369 ldrh(tmp2, Address(post(a2, 2))); 5370 eorw(tmp1, tmp1, tmp2); 5371 cbnzw(tmp1, DONE); 5372 } 5373 bind(TAIL01); 5374 if (elem_size == 1) { // Only needed when comparing 1-byte elements 5375 tbz(cnt1, 0, SAME); // 0-1 bytes left. 5376 { 5377 ldrb(tmp1, a1); 5378 ldrb(tmp2, a2); 5379 eorw(tmp1, tmp1, tmp2); 5380 cbnzw(tmp1, DONE); 5381 } 5382 } 5383 // Arrays are equal. 5384 bind(SAME); 5385 mov(result, true); 5386 5387 // That's it. 5388 bind(DONE); 5389 BLOCK_COMMENT("} string_equals"); 5390 } 5391 5392 5393 // The size of the blocks erased by the zero_blocks stub. We must 5394 // handle anything smaller than this ourselves in zero_words(). 5395 const int MacroAssembler::zero_words_block_size = 8; 5396 5397 // zero_words() is used by C2 ClearArray patterns. 
It is as small as 5398 // possible, handling small word counts locally and delegating 5399 // anything larger to the zero_blocks stub. It is expanded many times 5400 // in compiled code, so it is important to keep it short. 5401 5402 // ptr: Address of a buffer to be zeroed. 5403 // cnt: Count in HeapWords. 5404 // 5405 // ptr, cnt, rscratch1, and rscratch2 are clobbered. 5406 void MacroAssembler::zero_words(Register ptr, Register cnt) 5407 { 5408 assert(is_power_of_2(zero_words_block_size), "adjust this"); 5409 assert(ptr == r10 && cnt == r11, "mismatch in register usage"); 5410 5411 BLOCK_COMMENT("zero_words {"); 5412 cmp(cnt, zero_words_block_size); 5413 Label around, done, done16; 5414 br(LO, around); 5415 { 5416 RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks()); 5417 assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated"); 5418 if (StubRoutines::aarch64::complete()) { 5419 trampoline_call(zero_blocks); 5420 } else { 5421 bl(zero_blocks); 5422 } 5423 } 5424 bind(around); 5425 for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) { 5426 Label l; 5427 tbz(cnt, exact_log2(i), l); 5428 for (int j = 0; j < i; j += 2) { 5429 stp(zr, zr, post(ptr, 16)); 5430 } 5431 bind(l); 5432 } 5433 { 5434 Label l; 5435 tbz(cnt, 0, l); 5436 str(zr, Address(ptr)); 5437 bind(l); 5438 } 5439 BLOCK_COMMENT("} zero_words"); 5440 } 5441 5442 // base: Address of a buffer to be zeroed, 8 bytes aligned. 5443 // cnt: Immediate count in HeapWords. 
// Emits stores that zero cnt words starting at base, where cnt is known at
// code-generation time.  Counts of up to SmallArraySize / BytesPerLong words
// are fully unrolled; larger counts use an unrolled loop that clobbers
// rscratch1 and rscratch2.
#define SmallArraySize (18 * BytesPerLong)
void MacroAssembler::zero_words(Register base, u_int64_t cnt)
{
  BLOCK_COMMENT("zero_words {");
  int i = cnt & 1;  // store any odd word to start
  if (i) str(zr, Address(base));

  if (cnt <= SmallArraySize / BytesPerLong) {
    // Small case: one stp per remaining pair of words, straight-line.
    for (; i < (int)cnt; i += 2)
      stp(zr, zr, Address(base, i * wordSize));
  } else {
    const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
    // Words that do not fill a whole loop iteration (2 * unroll words) are
    // zeroed first with straight-line stores.
    int remainder = cnt % (2 * unroll);
    for (; i < remainder; i += 2)
      stp(zr, zr, Address(base, i * wordSize));

    Label loop;
    Register cnt_reg = rscratch1;
    Register loop_base = rscratch2;
    cnt = cnt - remainder;
    mov(cnt_reg, cnt);
    // adjust base and prebias by -2 * wordSize so we can pre-increment
    add(loop_base, base, (remainder - 2) * wordSize);
    bind(loop);
    sub(cnt_reg, cnt_reg, 2 * unroll);
    // unroll - 1 stores at fixed offsets from loop_base, then a final
    // pre-indexed store that also advances loop_base by one iteration.
    for (i = 1; i < unroll; i++)
      stp(zr, zr, Address(loop_base, 2 * i * wordSize));
    stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
    cbnz(cnt_reg, loop);
  }
  BLOCK_COMMENT("} zero_words");
}

// Zero blocks of memory by using DC ZVA.
//
// Aligns the base address first sufficiently for DC ZVA, then uses
// DC ZVA repeatedly for every full block.  cnt is the size to be
// zeroed in HeapWords.  Returns the count of words left to be zeroed
// in cnt.
//
// NOTE: This is intended to be used in the zero_blocks() stub.  If
// you want to use it elsewhere, note that cnt must be >= 2*zva_length.
//
// base is advanced past the zeroed region; rscratch1 and rscratch2 are
// clobbered.
void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
  Register tmp = rscratch1;
  Register tmp2 = rscratch2;
  int zva_length = VM_Version::zva_length();
  Label initial_table_end, loop_zva;
  Label fini;

  // Base must be 16 byte aligned. If not just return and let caller handle it
  tst(base, 0x0f);
  br(Assembler::NE, fini);
  // Align base with ZVA length.
  neg(tmp, base);
  andr(tmp, tmp, zva_length - 1);

  // tmp: the number of bytes to be filled to align the base with ZVA length.
  add(base, base, tmp);
  sub(cnt, cnt, tmp, Assembler::ASR, 3);
  // Computed jump into the table of stp instructions emitted below: each
  // 4-byte stp zeroes 16 bytes, so tmp bytes need tmp / 4 bytes of code,
  // counted backwards from initial_table_end.
  adr(tmp2, initial_table_end);
  sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
  br(tmp2);

  // The stores use negative offsets because base has already been advanced
  // to the aligned address.
  for (int i = -zva_length + 16; i < 0; i += 16)
    stp(zr, zr, Address(base, i));
  bind(initial_table_end);

  // Pre-bias cnt by one block (zva_length >> 3 words) so the loop can use
  // the sign of the subs result as its exit test.
  sub(cnt, cnt, zva_length >> 3);
  bind(loop_zva);
  dc(Assembler::ZVA, base);
  subs(cnt, cnt, zva_length >> 3);
  add(base, base, zva_length);
  br(Assembler::GE, loop_zva);
  add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
  bind(fini);
}

// base:   Address of a buffer to be filled, 8 bytes aligned.
// cnt:    Count in 8-byte unit.
// value:  Value to be filled with.
// base will point to the end of the buffer after filling.
//
// cnt, rscratch1 and rscratch2 are clobbered.
void MacroAssembler::fill_words(Register base, Register cnt, Register value)
{
//  Algorithm:
//
//    scratch1 = cnt & 7;
//    cnt -= scratch1;
//    p += scratch1;
//    switch (scratch1) {
//      do {
//        cnt -= 8;
//          p[-8] = v;
//        case 7:
//          p[-7] = v;
//        case 6:
//          p[-6] = v;
//          // ...
//        case 1:
//          p[-1] = v;
//        case 0:
//          p += 8;
//      } while (cnt);
//    }
//
//  The code below follows this shape but works on pairs of words (one stp
//  stores two), so the mask is (unroll - 1) * 2 rather than 7, and a
//  leading and a trailing single word are handled separately.

  assert_different_registers(base, cnt, value, rscratch1, rscratch2);

  Label fini, skip, entry, loop;
  const int unroll = 8; // Number of stp instructions we'll unroll

  cbz(cnt, fini);
  // If bit 3 of base is set (8-byte but not 16-byte aligned), store one
  // word first.
  tbz(base, 3, skip);
  str(value, Address(post(base, 8)));
  sub(cnt, cnt, 1);
  bind(skip);

  // rscratch1 = number of words (a multiple of two) to store on the first,
  // partial pass through the unrolled block.
  andr(rscratch1, cnt, (unroll-1) * 2);
  sub(cnt, cnt, rscratch1);
  add(base, base, rscratch1, Assembler::LSL, 3);
  // Computed jump: each 4-byte stp stores two words, so back up
  // rscratch1 * 2 bytes of code from entry.
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
  br(rscratch2);

  bind(loop);
  add(base, base, unroll * 16);
  for (int i = -unroll; i < 0; i++)
    stp(value, value, Address(base, i * 16));
  bind(entry);
  subs(cnt, cnt, unroll * 2);
  br(Assembler::GE, loop);

  // Bit 0 of cnt is unaffected by the even-sized adjustments above, so it
  // still tells whether one trailing word remains.
  tbz(cnt, 0, fini);
  str(value, Address(post(base, 8)));
  bind(fini);
}

// Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
// java/lang/StringUTF16.compress.
// Copies 16-bit chars from src to bytes at dst for as long as every char
// fits in 8 bits.
//
// src:    address of the first char; advanced as chars are consumed.
// dst:    address of the first output byte; advanced as bytes are written.
// len:    number of chars; on exit it holds the number of chars that were
//         not accepted (0 if every char was encoded).
// result: number of chars encoded before stopping, i.e. the initial len
//         minus the final len.
// Vtmp1..Vtmp4: SIMD scratch registers.
//
// rscratch1 and rscratch2 are clobbered; the SIMD paths also use v4 and v5
// directly.  When BUILTIN_SIM is defined only the one-char-at-a-time loop
// is emitted.
void MacroAssembler::encode_iso_array(Register src, Register dst,
                      Register len, Register result,
                      FloatRegister Vtmp1, FloatRegister Vtmp2,
                      FloatRegister Vtmp3, FloatRegister Vtmp4)
{
    Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
        NEXT_32_START, NEXT_32_PRFM_START;
    Register tmp1 = rscratch1, tmp2 = rscratch2;

      mov(result, len); // Save initial len

#ifndef BUILTIN_SIM
      cmp(len, 8); // handle shortest strings first
      br(LT, LOOP_1);
      cmp(len, 32);
      br(LT, NEXT_8);
      // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
      // to convert chars to bytes
      // Each pass handles 32 chars: the low bytes are stored to dst first,
      // then the high bytes are gathered and tested; if any is non-zero
      // the same 32 chars are reprocessed by the 8-char loop (src, dst and
      // len have not been advanced yet at that point).
      if (SoftwarePrefetchHintDistance >= 0) {
        ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        cmp(len, SoftwarePrefetchHintDistance/2 + 16);
        br(LE, NEXT_32_START);
        b(NEXT_32_PRFM_START);
        BIND(NEXT_32_PRFM);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_PRFM_START);
          prfm(Address(src, SoftwarePrefetchHintDistance));
          orr(v4, T16B, Vtmp1, Vtmp2);
          orr(v5, T16B, Vtmp3, Vtmp4);
          uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
          uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
          stpq(Vtmp1, Vtmp3, dst);
          uzp2(v5, T16B, v4, v5); // high bytes
          umov(tmp2, v5, D, 1);
          fmovd(tmp1, v5);
          orr(tmp1, tmp1, tmp2);
          cbnz(tmp1, LOOP_8);
          sub(len, len, 32);
          add(dst, dst, 32);
          add(src, src, 64);
          cmp(len, SoftwarePrefetchHintDistance/2 + 16);
          br(GE, NEXT_32_PRFM);
          cmp(len, 32);
          br(LT, LOOP_8);
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_START);
      } else {
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
      }
      prfm(Address(src, SoftwarePrefetchHintDistance));
      uzp1(v4, T16B, Vtmp1, Vtmp2);
      uzp1(v5, T16B, Vtmp3, Vtmp4);
      stpq(v4, v5, dst);
      orr(Vtmp1, T16B, Vtmp1, Vtmp2);
      orr(Vtmp3, T16B, Vtmp3, Vtmp4);
      uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
      umov(tmp2, Vtmp1, D, 1);
      fmovd(tmp1, Vtmp1);
      orr(tmp1, tmp1, tmp2);
      cbnz(tmp1, LOOP_8);
      sub(len, len, 32);
      add(dst, dst, 32);
      add(src, src, 64);
      cmp(len, 32);
      br(GE, NEXT_32);
      cbz(len, DONE);

    BIND(LOOP_8);
      cmp(len, 8);
      br(LT, LOOP_1);
    BIND(NEXT_8);
      // Same idea for 8 chars; on a non-zero high byte fall back to the
      // one-char loop, which finds the exact position.
      ld1(Vtmp1, T8H, src);
      uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
      uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
      strd(Vtmp2, dst);
      fmovd(tmp1, Vtmp3);
      cbnz(tmp1, NEXT_1);

      sub(len, len, 8);
      add(dst, dst, 8);
      add(src, src, 16);
      cmp(len, 8);
      br(GE, NEXT_8);

    BIND(LOOP_1);
#endif
    cbz(len, DONE);
    BIND(NEXT_1);
      // The low byte is stored before the char is checked; len is not
      // decremented for a char that fails the check.
      ldrh(tmp1, Address(post(src, 2)));
      strb(tmp1, Address(post(dst, 1)));
      tst(tmp1, 0xff00);
      br(NE, SET_RESULT);
      subs(len, len, 1);
      br(GT, NEXT_1);

    BIND(SET_RESULT);
      sub(result, result, len); // Return index where we stopped
                                // Return len == 0 if we processed all
                                // characters
    BIND(DONE);
}


// Inflate byte[] array to char[].
//
// src:   address of the first source byte; clobbered.
// dst:   address of the first destination char; clobbered.
// len:   number of bytes to inflate; clobbered.
// vtmp1: zeroed here and used as the zero half when interleaving.
// vtmp2, vtmp3: SIMD scratch registers.
// tmp4:  scratch; holds the number of whole 8-byte groups (len >> 3).
//
// When SoftwarePrefetchHintDistance >= 0, inputs of at least
// large_loop_threshold 8-byte groups are handed to the
// large_byte_array_inflate stub.
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
                                        FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                        Register tmp4) {
  Label big, done, after_init, to_stub;

  assert_different_registers(src, dst, len, tmp4, rscratch1);

  fmovd(vtmp1, zr);
  lsrw(tmp4, len, 3);
  bind(after_init);
  cbnzw(tmp4, big);
  // Short string: less than 8 bytes.
  {
    Label loop, tiny;

    cmpw(len, 4);
    br(LT, tiny);
    // Use SIMD to do 4 bytes.
    ldrs(vtmp2, post(src, 4));
    zip1(vtmp3, T8B, vtmp2, vtmp1);
    subw(len, len, 4);
    strd(vtmp3, post(dst, 8));

    cbzw(len, done);

    // Do the remaining bytes one at a time.
    bind(loop);
    ldrb(tmp4, post(src, 1));
    strh(tmp4, post(dst, 2));
    subw(len, len, 1);

    bind(tiny);
    cbnz(len, loop);

    b(done);
  }

  if (SoftwarePrefetchHintDistance >= 0) {
    // Reached only from the 'big' path below.  After the stub returns,
    // control re-enters at after_init, which tests tmp4 again.
    // NOTE(review): presumably the stub updates src, dst and tmp4 -- confirm
    // against large_byte_array_inflate.
    bind(to_stub);
      RuntimeAddress stub =  RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
      assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
      trampoline_call(stub);
      b(after_init);
  }

  // Unpack the bytes 8 at a time.
  bind(big);
  {
    Label loop, around, loop_last, loop_start;

    if (SoftwarePrefetchHintDistance >= 0) {
      const int large_loop_threshold = (64 + 16)/8;
      ldrd(vtmp2, post(src, 8));
      andw(len, len, 7);
      cmp(tmp4, large_loop_threshold);
      br(GE, to_stub);
      b(loop_start);

      // Two 8-byte groups per iteration; an odd final group leaves through
      // loop_last.
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      bind(loop_start);
      subs(tmp4, tmp4, 1);
      br(EQ, loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      ldrd(vtmp3, post(src, 8));
      st1(vtmp2, T8H, post(dst, 16));
      subs(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp3, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      br(NE, loop);
      b(around);
      bind(loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      st1(vtmp2, T8H, post(dst, 16));
      bind(around);
      cbz(len, done);
    } else {
      andw(len, len, 7);
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      sub(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp2, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      cbnz(tmp4, loop);
    }
  }

  // Do the tail of up to 8 bytes.
  // The last 8 source bytes are re-read (overlapping bytes that were
  // already inflated) and written as the last 16 destination bytes.
  add(src, src, len);
  ldrd(vtmp3, Address(src, -8));
  add(dst, dst, len, ext::uxtw, 1);
  zip1(vtmp3, T16B, vtmp3, vtmp1);
  strq(vtmp3, Address(dst, -16));

  bind(done);
}

// Compress char[] array to byte[].
5785 void MacroAssembler::char_array_compress(Register src, Register dst, Register len, 5786 FloatRegister tmp1Reg, FloatRegister tmp2Reg, 5787 FloatRegister tmp3Reg, FloatRegister tmp4Reg, 5788 Register result) { 5789 encode_iso_array(src, dst, len, result, 5790 tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg); 5791 cmp(len, zr); 5792 csel(result, result, zr, EQ); 5793 } 5794 5795 // get_thread() can be called anywhere inside generated code so we 5796 // need to save whatever non-callee save context might get clobbered 5797 // by the call to JavaThread::aarch64_get_thread_helper() or, indeed, 5798 // the call setup code. 5799 // 5800 // aarch64_get_thread_helper() clobbers only r0, r1, and flags. 5801 // 5802 void MacroAssembler::get_thread(Register dst) { 5803 RegSet saved_regs = RegSet::range(r0, r1) + lr - dst; 5804 push(saved_regs, sp); 5805 5806 mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper)); 5807 blrt(lr, 1, 0, 1); 5808 if (dst != c_rarg0) { 5809 mov(dst, c_rarg0); 5810 } 5811 5812 pop(saved_regs, sp); 5813 }