1 /* 2 * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 
23 * 24 */ 25 26 #include <sys/types.h> 27 28 #include "precompiled.hpp" 29 #include "jvm.h" 30 #include "asm/assembler.hpp" 31 #include "asm/assembler.inline.hpp" 32 #include "gc/shared/barrierSet.hpp" 33 #include "gc/shared/cardTable.hpp" 34 #include "gc/shared/barrierSetAssembler.hpp" 35 #include "gc/shared/cardTableBarrierSet.hpp" 36 #include "interpreter/interpreter.hpp" 37 #include "compiler/disassembler.hpp" 38 #include "gc/shared/collectedHeap.hpp" 39 #include "memory/resourceArea.hpp" 40 #include "nativeInst_aarch64.hpp" 41 #include "oops/accessDecorators.hpp" 42 #include "oops/compressedOops.inline.hpp" 43 #include "oops/klass.inline.hpp" 44 #include "oops/oop.hpp" 45 #include "opto/compile.hpp" 46 #include "opto/intrinsicnode.hpp" 47 #include "opto/node.hpp" 48 #include "runtime/biasedLocking.hpp" 49 #include "runtime/icache.hpp" 50 #include "runtime/interfaceSupport.inline.hpp" 51 #include "runtime/jniHandles.inline.hpp" 52 #include "runtime/sharedRuntime.hpp" 53 #include "runtime/thread.hpp" 54 55 #ifdef PRODUCT 56 #define BLOCK_COMMENT(str) /* nothing */ 57 #define STOP(error) stop(error) 58 #else 59 #define BLOCK_COMMENT(str) block_comment(str) 60 #define STOP(error) block_comment(error); stop(error) 61 #endif 62 63 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 64 65 // Patch any kind of instruction; there may be several instructions. 66 // Return the total length (in bytes) of the instructions. 
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  // Branch-style encodings express the offset in words (4-byte units).
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  // Dispatch on the instruction's encoding class and rewrite the
  // immediate field(s) so the sequence refers to 'target'.
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal): 19-bit signed word offset in bits 23..5.
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate): 26-bit signed word offset.
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate): 19-bit signed word offset.
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate): 19-bit signed word offset.
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate): 14-bit signed word offset.
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing (ADR/ADRP): offset is in bytes, not words.
    offset = target-branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      // ADRP: offset is in 4K pages; also patch the paired instruction
      // (if any) that supplies the low 12 bits or the high 16 bits.
      u_int64_t dest = (u_int64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
      //
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
          Instruction_aarch64::extract(insn, 4, 0) ==
          Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate).  The immediate is
        // scaled by the access size, so shift the page offset down.
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32: adrp covers bits 12..31 of the target, the
        // movk supplies bits 32..47, so recompute the page offset with
        // the high bits taken from 'branch'.
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    // ADR/ADRP split the offset: low two bits go in 30..29, the rest in 23..5.
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant: movz + movk + movk, 16 bits of the 48-bit
    // address per instruction (see movptr()).
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do -- polling page load carries no target
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}

// Patch an oop constant materialized by a movz/movk sequence: either a
// narrow (32-bit) oop in two instructions or a wide (48-bit) oop in
// three.  Returns the number of bytes covered by the patched insns.
int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = CompressedOops::encode((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}

// Patch a narrow klass constant into a movz+movk pair.  Returns the
// number of bytes covered.
int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

// Decode the target address encoded by the (possibly multi-instruction)
// sequence starting at insn_addr.  The inverse of
// pd_patch_instruction_size: the encoding classes recognized here
// mirror the cases patched above.
address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing: low two offset bits live in 30..29.
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      // ADRP: page-sized offset; mask to the enclosing page.
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
          Instruction_aarch64::extract(insn, 4, 0) ==
          Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate): immediate is scaled
        // by the access size.
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
            Instruction_aarch64::extract(insn, 4, 0) ==
            Instruction_aarch64::extract(insn2, 4, 0)) {
          // movk supplies bits 32..47 of the target.
          target_page = (target_page & 0xffffffff) |
                        ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      // Plain ADR targets are not decoded here.
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    u_int32_t *insns = (u_int32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // Polling page load: no encoded target address.
    return 0;
  } else {
    ShouldNotReachHere();
  }
  return address(((uint64_t)insn_addr + (offset << 2)));
}

// Emit a full system-wide memory barrier (dsb sy).
void MacroAssembler::serialize_memory(Register thread, Register tmp) {
  dsb(Assembler::SY);
}

// Emit a safepoint poll; branches to slow_path when a safepoint is
// pending.  Clobbers rscratch1.
void MacroAssembler::safepoint_poll(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    // Thread-local poll: test the poll bit in the thread's polling word.
    ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    // Global poll: load SafepointSynchronize state and take the slow
    // path if it is not _not_synchronized (== 0).
    unsigned long offset;
    adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
    ldrw(rscratch1, Address(rscratch1, offset));
    assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
    cbnz(rscratch1, slow_path);
  }
}

// Just like safepoint_poll, but use an acquiring load for thread-
// local polling.
//
// We need an acquire here to ensure that any subsequent load of the
// global SafepointSynchronize::_state flag is ordered after this load
// of the local Thread::_polling page.  We don't want this poll to
// return false (i.e. not safepointing) and a later poll of the global
// SafepointSynchronize::_state spuriously to return true.
//
// This is to avoid a race when we're in a native->Java transition
// racing the code which wakes up from a safepoint.
//
void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    // ldar provides the required acquire semantics on the poll-word load.
    lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
    ldar(rscratch1, rscratch1);
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    // The global-poll variant needs no acquire; use the plain poll.
    safepoint_poll(slow_path);
  }
}

// Clear the thread's last-Java-frame anchor so the stack is no longer
// considered walkable from this frame.
void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp, & resp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
345 void MacroAssembler::set_last_Java_frame(Register last_java_sp, 346 Register last_java_fp, 347 Register last_java_pc, 348 Register scratch) { 349 350 if (last_java_pc->is_valid()) { 351 str(last_java_pc, Address(rthread, 352 JavaThread::frame_anchor_offset() 353 + JavaFrameAnchor::last_Java_pc_offset())); 354 } 355 356 // determine last_java_sp register 357 if (last_java_sp == sp) { 358 mov(scratch, sp); 359 last_java_sp = scratch; 360 } else if (!last_java_sp->is_valid()) { 361 last_java_sp = esp; 362 } 363 364 str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset())); 365 366 // last_java_fp is optional 367 if (last_java_fp->is_valid()) { 368 str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset())); 369 } 370 } 371 372 void MacroAssembler::set_last_Java_frame(Register last_java_sp, 373 Register last_java_fp, 374 address last_java_pc, 375 Register scratch) { 376 if (last_java_pc != NULL) { 377 adr(scratch, last_java_pc); 378 } else { 379 // FIXME: This is almost never correct. We should delete all 380 // cases of set_last_Java_frame with last_java_pc=NULL and use the 381 // correct return address instead. 
382 adr(scratch, pc()); 383 } 384 385 str(scratch, Address(rthread, 386 JavaThread::frame_anchor_offset() 387 + JavaFrameAnchor::last_Java_pc_offset())); 388 389 set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch); 390 } 391 392 void MacroAssembler::set_last_Java_frame(Register last_java_sp, 393 Register last_java_fp, 394 Label &L, 395 Register scratch) { 396 if (L.is_bound()) { 397 set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch); 398 } else { 399 InstructionMark im(this); 400 L.add_patch_at(code(), locator()); 401 set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch); 402 } 403 } 404 405 void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) { 406 assert(ReservedCodeCacheSize < 4*G, "branch out of range"); 407 assert(CodeCache::find_blob(entry.target()) != NULL, 408 "destination of far call not found in code cache"); 409 if (far_branches()) { 410 unsigned long offset; 411 // We can use ADRP here because we know that the total size of 412 // the code cache cannot exceed 2Gb. 413 adrp(tmp, entry, offset); 414 add(tmp, tmp, offset); 415 if (cbuf) cbuf->set_insts_mark(); 416 blr(tmp); 417 } else { 418 if (cbuf) cbuf->set_insts_mark(); 419 bl(entry); 420 } 421 } 422 423 void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) { 424 assert(ReservedCodeCacheSize < 4*G, "branch out of range"); 425 assert(CodeCache::find_blob(entry.target()) != NULL, 426 "destination of far call not found in code cache"); 427 if (far_branches()) { 428 unsigned long offset; 429 // We can use ADRP here because we know that the total size of 430 // the code cache cannot exceed 2Gb. 
431 adrp(tmp, entry, offset); 432 add(tmp, tmp, offset); 433 if (cbuf) cbuf->set_insts_mark(); 434 br(tmp); 435 } else { 436 if (cbuf) cbuf->set_insts_mark(); 437 b(entry); 438 } 439 } 440 441 void MacroAssembler::reserved_stack_check() { 442 // testing if reserved zone needs to be enabled 443 Label no_reserved_zone_enabling; 444 445 ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset())); 446 cmp(sp, rscratch1); 447 br(Assembler::LO, no_reserved_zone_enabling); 448 449 enter(); // LR and FP are live. 450 lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone)); 451 mov(c_rarg0, rthread); 452 blr(rscratch1); 453 leave(); 454 455 // We have already removed our own frame. 456 // throw_delayed_StackOverflowError will think that it's been 457 // called by our caller. 458 lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry())); 459 br(rscratch1); 460 should_not_reach_here(); 461 462 bind(no_reserved_zone_enabling); 463 } 464 465 int MacroAssembler::biased_locking_enter(Register lock_reg, 466 Register obj_reg, 467 Register swap_reg, 468 Register tmp_reg, 469 bool swap_reg_contains_mark, 470 Label& done, 471 Label* slow_case, 472 BiasedLockingCounters* counters) { 473 assert(UseBiasedLocking, "why call this otherwise?"); 474 assert_different_registers(lock_reg, obj_reg, swap_reg); 475 476 if (PrintBiasedLockingStatistics && counters == NULL) 477 counters = BiasedLocking::counters(); 478 479 assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg); 480 assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout"); 481 Address mark_addr (obj_reg, oopDesc::mark_offset_in_bytes()); 482 Address klass_addr (obj_reg, oopDesc::klass_offset_in_bytes()); 483 Address saved_mark_addr(lock_reg, 0); 484 485 // Biased locking 486 // See whether the lock is currently biased toward our 
thread and 487 // whether the epoch is still valid 488 // Note that the runtime guarantees sufficient alignment of JavaThread 489 // pointers to allow age to be placed into low bits 490 // First check to see whether biasing is even enabled for this object 491 Label cas_label; 492 int null_check_offset = -1; 493 if (!swap_reg_contains_mark) { 494 null_check_offset = offset(); 495 ldr(swap_reg, mark_addr); 496 } 497 andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place); 498 cmp(tmp_reg, (u1)markOopDesc::biased_lock_pattern); 499 br(Assembler::NE, cas_label); 500 // The bias pattern is present in the object's header. Need to check 501 // whether the bias owner and the epoch are both still current. 502 load_prototype_header(tmp_reg, obj_reg); 503 orr(tmp_reg, tmp_reg, rthread); 504 eor(tmp_reg, swap_reg, tmp_reg); 505 andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place)); 506 if (counters != NULL) { 507 Label around; 508 cbnz(tmp_reg, around); 509 atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2); 510 b(done); 511 bind(around); 512 } else { 513 cbz(tmp_reg, done); 514 } 515 516 Label try_revoke_bias; 517 Label try_rebias; 518 519 // At this point we know that the header has the bias pattern and 520 // that we are not the bias owner in the current epoch. We need to 521 // figure out more details about the state of the header in order to 522 // know what operations can be legally performed on the object's 523 // header. 524 525 // If the low three bits in the xor result aren't clear, that means 526 // the prototype header is no longer biased and we have to revoke 527 // the bias on this object. 528 andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place); 529 cbnz(rscratch1, try_revoke_bias); 530 531 // Biasing is still enabled for this data type. 
See whether the 532 // epoch of the current bias is still valid, meaning that the epoch 533 // bits of the mark word are equal to the epoch bits of the 534 // prototype header. (Note that the prototype header's epoch bits 535 // only change at a safepoint.) If not, attempt to rebias the object 536 // toward the current thread. Note that we must be absolutely sure 537 // that the current epoch is invalid in order to do this because 538 // otherwise the manipulations it performs on the mark word are 539 // illegal. 540 andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place); 541 cbnz(rscratch1, try_rebias); 542 543 // The epoch of the current bias is still valid but we know nothing 544 // about the owner; it might be set or it might be clear. Try to 545 // acquire the bias of the object using an atomic operation. If this 546 // fails we will go in to the runtime to revoke the object's bias. 547 // Note that we first construct the presumed unbiased header so we 548 // don't accidentally blow away another thread's valid bias. 549 { 550 Label here; 551 mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place); 552 andr(swap_reg, swap_reg, rscratch1); 553 orr(tmp_reg, swap_reg, rthread); 554 cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case); 555 // If the biasing toward our thread failed, this means that 556 // another thread succeeded in biasing it toward itself and we 557 // need to revoke that bias. The revocation will occur in the 558 // interpreter runtime in the slow case. 559 bind(here); 560 if (counters != NULL) { 561 atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()), 562 tmp_reg, rscratch1, rscratch2); 563 } 564 } 565 b(done); 566 567 bind(try_rebias); 568 // At this point we know the epoch has expired, meaning that the 569 // current "bias owner", if any, is actually invalid. 
Under these 570 // circumstances _only_, we are allowed to use the current header's 571 // value as the comparison value when doing the cas to acquire the 572 // bias in the current epoch. In other words, we allow transfer of 573 // the bias from one thread to another directly in this situation. 574 // 575 // FIXME: due to a lack of registers we currently blow away the age 576 // bits in this situation. Should attempt to preserve them. 577 { 578 Label here; 579 load_prototype_header(tmp_reg, obj_reg); 580 orr(tmp_reg, rthread, tmp_reg); 581 cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case); 582 // If the biasing toward our thread failed, then another thread 583 // succeeded in biasing it toward itself and we need to revoke that 584 // bias. The revocation will occur in the runtime in the slow case. 585 bind(here); 586 if (counters != NULL) { 587 atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()), 588 tmp_reg, rscratch1, rscratch2); 589 } 590 } 591 b(done); 592 593 bind(try_revoke_bias); 594 // The prototype mark in the klass doesn't have the bias bit set any 595 // more, indicating that objects of this data type are not supposed 596 // to be biased any more. We are going to try to reset the mark of 597 // this object to the prototype value and fall through to the 598 // CAS-based locking scheme. Note that if our CAS fails, it means 599 // that another thread raced us for the privilege of revoking the 600 // bias of this particular object, so it's okay to continue in the 601 // normal locking code. 602 // 603 // FIXME: due to a lack of registers we currently blow away the age 604 // bits in this situation. Should attempt to preserve them. 
605 { 606 Label here, nope; 607 load_prototype_header(tmp_reg, obj_reg); 608 cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope); 609 bind(here); 610 611 // Fall through to the normal CAS-based lock, because no matter what 612 // the result of the above CAS, some thread must have succeeded in 613 // removing the bias bit from the object's header. 614 if (counters != NULL) { 615 atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg, 616 rscratch1, rscratch2); 617 } 618 bind(nope); 619 } 620 621 bind(cas_label); 622 623 return null_check_offset; 624 } 625 626 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) { 627 assert(UseBiasedLocking, "why call this otherwise?"); 628 629 // Check for biased locking unlock case, which is a no-op 630 // Note: we do not have to check the thread ID for two reasons. 631 // First, the interpreter checks for IllegalMonitorStateException at 632 // a higher level. Second, if the bias was revoked while we held the 633 // lock, the object could not be rebiased toward another thread, so 634 // the bias bit would be clear. 
635 ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 636 andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 637 cmp(temp_reg, (u1)markOopDesc::biased_lock_pattern); 638 br(Assembler::EQ, done); 639 } 640 641 static void pass_arg0(MacroAssembler* masm, Register arg) { 642 if (c_rarg0 != arg ) { 643 masm->mov(c_rarg0, arg); 644 } 645 } 646 647 static void pass_arg1(MacroAssembler* masm, Register arg) { 648 if (c_rarg1 != arg ) { 649 masm->mov(c_rarg1, arg); 650 } 651 } 652 653 static void pass_arg2(MacroAssembler* masm, Register arg) { 654 if (c_rarg2 != arg ) { 655 masm->mov(c_rarg2, arg); 656 } 657 } 658 659 static void pass_arg3(MacroAssembler* masm, Register arg) { 660 if (c_rarg3 != arg ) { 661 masm->mov(c_rarg3, arg); 662 } 663 } 664 665 void MacroAssembler::call_VM_base(Register oop_result, 666 Register java_thread, 667 Register last_java_sp, 668 address entry_point, 669 int number_of_arguments, 670 bool check_exceptions) { 671 // determine java_thread register 672 if (!java_thread->is_valid()) { 673 java_thread = rthread; 674 } 675 676 // determine last_java_sp register 677 if (!last_java_sp->is_valid()) { 678 last_java_sp = esp; 679 } 680 681 // debugging support 682 assert(number_of_arguments >= 0 , "cannot have negative number of arguments"); 683 assert(java_thread == rthread, "unexpected register"); 684 #ifdef ASSERT 685 // TraceBytecodes does not use r12 but saves it over the call, so don't verify 686 // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?"); 687 #endif // ASSERT 688 689 assert(java_thread != oop_result , "cannot use the same register for java_thread & oop_result"); 690 assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp"); 691 692 // push java thread (becomes first argument of C function) 693 694 mov(c_rarg0, java_thread); 695 696 // set last Java frame before call 697 assert(last_java_sp 
!= rfp, "can't use rfp"); 698 699 Label l; 700 set_last_Java_frame(last_java_sp, rfp, l, rscratch1); 701 702 // do the call, remove parameters 703 MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l); 704 705 // reset last Java frame 706 // Only interpreter should have to clear fp 707 reset_last_Java_frame(true); 708 709 // C++ interp handles this in the interpreter 710 check_and_handle_popframe(java_thread); 711 check_and_handle_earlyret(java_thread); 712 713 if (check_exceptions) { 714 // check for pending exceptions (java_thread is set upon return) 715 ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset()))); 716 Label ok; 717 cbz(rscratch1, ok); 718 lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry())); 719 br(rscratch1); 720 bind(ok); 721 } 722 723 // get oop result if there is one and reset the value in the thread 724 if (oop_result->is_valid()) { 725 get_vm_result(oop_result, java_thread); 726 } 727 } 728 729 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) { 730 call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions); 731 } 732 733 // Maybe emit a call via a trampoline. If the code cache is small 734 // trampolines won't be emitted. 735 736 address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) { 737 assert(JavaThread::current()->is_Compiler_thread(), "just checking"); 738 assert(entry.rspec().type() == relocInfo::runtime_call_type 739 || entry.rspec().type() == relocInfo::opt_virtual_call_type 740 || entry.rspec().type() == relocInfo::static_call_type 741 || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type"); 742 743 // We need a trampoline if branches are far. 744 if (far_branches()) { 745 // We don't want to emit a trampoline if C2 is generating dummy 746 // code during its branch shortening phase. 
747 CompileTask* task = ciEnv::current()->task(); 748 bool in_scratch_emit_size = 749 (task != NULL && is_c2_compile(task->comp_level()) && 750 Compile::current()->in_scratch_emit_size()); 751 if (!in_scratch_emit_size) { 752 address stub = emit_trampoline_stub(offset(), entry.target()); 753 if (stub == NULL) { 754 return NULL; // CodeCache is full 755 } 756 } 757 } 758 759 if (cbuf) cbuf->set_insts_mark(); 760 relocate(entry.rspec()); 761 if (!far_branches()) { 762 bl(entry.target()); 763 } else { 764 bl(pc()); 765 } 766 // just need to return a non-null address 767 return pc(); 768 } 769 770 771 // Emit a trampoline stub for a call to a target which is too far away. 772 // 773 // code sequences: 774 // 775 // call-site: 776 // branch-and-link to <destination> or <trampoline stub> 777 // 778 // Related trampoline stub for this call site in the stub section: 779 // load the call target from the constant pool 780 // branch (LR still points to the call site above) 781 782 address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset, 783 address dest) { 784 address stub = start_a_stub(Compile::MAX_stubs_size/2); 785 if (stub == NULL) { 786 return NULL; // CodeBuffer::expand failed 787 } 788 789 // Create a trampoline stub relocation which relates this trampoline stub 790 // with the call instruction at insts_call_instruction_offset in the 791 // instructions code-section. 
  // Align so the 64-bit destination slot below is word-aligned.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call
  // - call
  Label target;
  ldr(rscratch1, target);  // load the 64-bit destination emitted below
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}

// Emit an inline-cache call: materialize the non-oop IC marker word in
// rscratch2, then call through trampoline_call with a virtual_call
// relocation carrying method_index.
address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  // Args are passed highest-index first so an earlier pass_arg cannot
  // clobber the source register of a later one; the asserts check that.
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

// call_VM variants taking an explicit last_java_sp; these go straight
// to call_VM_base with rthread as the thread register.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


// Fetch the oop result of a VM call from the thread-local slot and
// clear the slot after reading it.
void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

// Same as above for the secondary (metadata) result slot; no oop
// verification since the value is not an oop.
void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

// Pad with nops until the code offset is a multiple of modulus.
void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


// Produce either an immediate (when the delayed value has already been
// computed) or code that loads it indirectly at runtime.
RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    // Already resolved: fold the value plus offset into an immediate.
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}


void MacroAssembler:: notify(int type) {
  if (type == bytecode_start) {
    // set_last_Java_frame(esp, rfp, (address)NULL);
    Assembler:: notify(type);
    // reset_last_Java_frame(true);
  }
  else
    Assembler:: notify(type);
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step = itableOffsetEntry::size() * wordSize;
  int vte_size = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  // scan_temp = recv_klass + vtable_length * 8, then add the vtable base
  // offset to reach the first itableOffsetEntry.
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  // The loop is peeled once (peel == 1 is the first iteration) so the
  // common hit-on-first-entry case falls straight to found_method.
  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  if (return_method) {
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result,
        form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
  }
}

// Combined fast/slow subtype check; branches to L_success on a hit and
// falls through (via the bound L_failure) otherwise.
void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  // The default super_check_offset constant of -1 means "load it from
  // the super klass"; a supplied register/constant skips that load.
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  // At most one of the three out-labels may be NULL; a NULL label means
  // "fall through" and is aliased to L_fallthrough below.
  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);
  cmp(super_klass, rscratch1); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    // Compare the runtime offset against the cache offset to decide
    // between hard failure and the slow path.
    subs(zr, super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// scans count pointer sized words at [addr] for occurence of value,
// generic
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

// scans count 4 byte words at [addr] for occurence of value,
// generic
// NOTE(review): the load is 4 bytes (ldrw) but the post-increment step
// is wordSize (8) — confirm callers expect 4-byte values in
// pointer-sized slots.
void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
                                 Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldrw(scratch, post(addr, wordSize));
  cmpw(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int
ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  BLOCK_COMMENT("check_klass_subtype_slow_path");

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
  assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)

  RegSet pushed_registers;
  if (!IS_A_TEMP(r2))    pushed_registers += r2;
  if (!IS_A_TEMP(r5))    pushed_registers += r5;

  if (super_klass != r0 || UseCompressedOops) {
    if (!IS_A_TEMP(r0))   pushed_registers += r0;
  }

  push(pushed_registers,  sp);

  // Get super_klass value into r0 (even if it was in r5 or r2).
  if (super_klass != r0) {
    mov(r0, super_klass);
  }

#ifndef PRODUCT
  // Bump the shared partial-subtype counter (diagnostics only).
  mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
  Address pst_counter_addr(rscratch2);
  ldr(rscratch1, pst_counter_addr);
  add(rscratch1, rscratch1, 1);
  str(rscratch1, pst_counter_addr);
#endif //PRODUCT

  // We will consult the secondary-super array.
  ldr(r5, secondary_supers_addr);
  // Load the array length.
  ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  add(r5, r5, Array<Klass*>::base_offset_in_bytes());

  cmp(sp, zr); // Clear Z flag; SP is never zero
  // Scan R2 words at [R5] for an occurrence of R0.
  // Set NZ/Z based on last compare.
  repne_scan(r5, r0, r2, rscratch1);

  // Unspill the temp. registers:
  pop(pushed_registers, sp);

  br(Assembler::NE, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  str(super_klass, super_cache_addr);

  if (L_success != &L_fallthrough) {
    b(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}


// Call the out-of-line verify_oop subroutine on reg.  Saves and
// restores r0/rscratch1/rscratch2/lr around the call; no-op unless
// VerifyOops is enabled.
void MacroAssembler::verify_oop(Register reg, const char* s) {
  if (!VerifyOops) return;

  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s", reg->name(), s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  mov(r0, reg);
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop");
}

// Like verify_oop above, but the oop is loaded from an Address rather
// than a register.
void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops) return;

  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s", s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop_addr {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  // addr may contain sp so we will have to adjust it based on the
  // pushes that we just did.
  if (addr.uses(sp)) {
    lea(r0, addr);
    // Skip the four registers pushed above to reach the original slot.
    ldr(r0, Address(r0, 4 * wordSize));
  } else {
    ldr(r0, addr);
  }
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop_addr");
}

// Compute the Address of an interpreter expression-stack argument slot;
// may clobber rscratch1 when the slot index is in a register.
Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
                                         int extra_slot_offset) {
  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
  int stackElementSize = Interpreter::stackElementSize;
  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
#ifdef ASSERT
  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
  assert(offset1 - offset == stackElementSize, "correct arithmetic");
#endif
  if (arg_slot.is_constant()) {
    return Address(esp, arg_slot.as_constant() * stackElementSize
                   + offset);
  } else {
    add(rscratch1, esp, arg_slot.as_register(),
        ext::uxtx, exact_log2(stackElementSize));
    return Address(rscratch1, offset);
  }
}

void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments,
                                       Label *retaddr) {
  call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr);
}

void MacroAssembler::call_VM_leaf_base1(address entry_point,
                                        int number_of_gp_arguments,
                                        int number_of_fp_arguments,
                                        ret_type type,
                                        Label *retaddr) {
  Label E, L;

  // Save rscratch1 and rmethod across the call.
  stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));

  // We add 1 to number_of_arguments because the thread in arg0 is
  // not counted
  mov(rscratch1, entry_point);
  blrt(rscratch1, number_of_gp_arguments + 1,
number_of_fp_arguments, type);
  if (retaddr)
    bind(*retaddr);

  ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
  maybe_isb();
}

void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  call_VM_leaf_base(entry_point, number_of_arguments);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
                                  Register arg_1, Register arg_2) {
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  pass_arg2(this, arg_2);
  call_VM_leaf_base(entry_point, 3);
}

// super_call_VM_leaf variants force the MacroAssembler implementation of
// call_VM_leaf_base (bypassing any subclass override).  Args are passed
// highest-index first so earlier pass_arg calls cannot clobber later
// sources; the asserts check that.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
  assert(arg_0 != c_rarg3, "smashed arg");
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 4);
}

void MacroAssembler::null_check(Register reg, int offset) {
  if (needs_explicit_null_check(offset)) {
    // provoke OS NULL exception if reg = NULL by
    // accessing M[reg] w/o changing any registers
    // NOTE: this is plenty to provoke a segv
    ldr(zr, Address(reg));
  } else {
    // nothing to do, (later) access of M[reg + offset]
    // will provoke OS NULL exception if reg = NULL
  }
}

// MacroAssembler protected routines needed to implement
// public methods

// Materialize the target of an Address into r, recording its
// relocation so the constant can be patched.
void MacroAssembler::mov(Register r, Address dest) {
  code_section()->relocate(pc(), dest.rspec());
  u_int64_t imm64 = (u_int64_t)dest.target();
  movptr(r, imm64);
}

// Move a constant pointer into r.  In AArch64 mode the virtual
// address space is 48 bits in size, so we only need three
// instructions to create a patchable instruction sequence that can
// reach anywhere.
void MacroAssembler::movptr(Register r, uintptr_t imm64) {
#ifndef PRODUCT
  {
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%"PRIX64, imm64);
    block_comment(buffer);
  }
#endif
  assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
  // Always emit exactly movz + movk + movk so the sequence is patchable.
  movz(r, imm64 & 0xffff);
  imm64 >>= 16;
  movk(r, imm64 & 0xffff, 16);
  imm64 >>= 16;
  movk(r, imm64 & 0xffff, 32);
}

// Macro to mov replicated immediate to vector register.
//  Vd will get the following values for different arrangements in T
//   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
//   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
//   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
//   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
//   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
//   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
//   T1D/T2D: invalid
void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
  assert(T != T1D && T != T2D, "invalid arrangement");
  if (T == T8B || T == T16B) {
    assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
    movi(Vd, T, imm32 & 0xff, 0);
    return;
  }
  u_int32_t nimm32 = ~imm32;
  if (T == T4H || T == T8H) {
    assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
    imm32 &= 0xffff;
    nimm32 &= 0xffff;
  }
  // Count the non-zero bytes of the immediate and of its complement;
  // whichever needs fewer instructions wins (movi/orri vs mvni/bici).
  u_int32_t x = imm32;
  int movi_cnt = 0;
  int movn_cnt = 0;
  while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
  x = nimm32;
  while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
  if (movn_cnt < movi_cnt) imm32 = nimm32;
  unsigned lsl = 0;
  // Skip leading zero bytes, emit the first instruction, then OR/BIC in
  // the remaining non-zero bytes one at a time.
  while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
  if (movn_cnt < movi_cnt)
    mvni(Vd, T, imm32 & 0xff, lsl);
  else
    movi(Vd, T, imm32 & 0xff, lsl);
  imm32 >>= 8; lsl += 8;
  while (imm32) {
    while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
    if (movn_cnt < movi_cnt)
      bici(Vd, T, imm32 & 0xff, lsl);
    else
      orri(Vd, T, imm32 & 0xff, lsl);
    lsl += 8; imm32 >>= 8;
  }
}

// Build an arbitrary 64-bit constant in dst using the cheapest of:
// a single ORR with a logical immediate, or a MOVZ/MOVN seed followed
// by as many MOVKs as there are remaining non-trivial 16-bit chunks.
void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
{
#ifndef PRODUCT
  {
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%"PRIX64, imm64);
    block_comment(buffer);
  }
#endif
  if (operand_valid_for_logical_immediate(false, imm64)) {
    orr(dst, zr, imm64);
  } else {
    // we can use a combination of MOVZ or MOVN with
    // MOVK to build up the constant
    u_int64_t imm_h[4];
    int zero_count = 0;
    int neg_count = 0;
    int i;
    // Split the constant into four 16-bit chunks and count how many are
    // all-zeros (favours MOVZ) or all-ones (favours MOVN).
    for (i = 0; i < 4; i++) {
      imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
      if (imm_h[i] == 0) {
        zero_count++;
      } else if (imm_h[i] == 0xffffL) {
        neg_count++;
      }
    }
    if (zero_count == 4) {
      // one MOVZ will do
      movz(dst, 0);
    } else if (neg_count == 4) {
      // one MOVN will do
      movn(dst, 0);
    } else if (zero_count == 3) {
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0L) {
          movz(dst, (u_int32_t)imm_h[i], (i << 4));
          break;
        }
      }
    } else if (neg_count == 3) {
      // one MOVN will do
      for (int i = 0; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
          break;
        }
      }
    } else if (zero_count == 2) {
      // one MOVZ and one MOVK will do
      for (i = 0; i < 3; i++) {
        if (imm_h[i] != 0L) {
          movz(dst, (u_int32_t)imm_h[i], (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0L) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else if (neg_count == 2) {
      // one MOVN and one MOVK will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else if (zero_count == 1) {
      // one MOVZ and two MOVKs will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0L) {
          movz(dst, (u_int32_t)imm_h[i], (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0x0L) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else if (neg_count == 1) {
      // one MOVN and two MOVKs will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else {
      // use a MOVZ and 3 MOVKs (makes it easier to debug)
      movz(dst, (u_int32_t)imm_h[0], 0);
      for (i = 1; i < 4; i++) {
        movk(dst, (u_int32_t)imm_h[i], (i << 4));
      }
    }
  }
}

// 32-bit analogue of mov_immediate64: ORR with a logical immediate if
// possible, otherwise MOVZ/MOVN (one chunk trivial) or MOVZ+MOVK.
void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
{
#ifndef PRODUCT
  {
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%"PRIX32, imm32);
    block_comment(buffer);
  }
#endif
  if (operand_valid_for_logical_immediate(true, imm32)) {
    orrw(dst, zr, imm32);
  } else {
    // we can use MOVZ, MOVN or two calls to MOVK to build up the
    // constant
    u_int32_t imm_h[2];
    imm_h[0] = imm32 & 0xffff;
    imm_h[1] = ((imm32 >> 16) & 0xffff);
    if (imm_h[0] == 0) {
      movzw(dst, imm_h[1], 16);
    } else if (imm_h[0] == 0xffff) {
      movnw(dst, imm_h[1] ^ 0xffff, 16);
    } else if (imm_h[1] == 0) {
      movzw(dst, imm_h[0], 0);
    } else if (imm_h[1] == 0xffff) {
      movnw(dst, imm_h[0] ^ 0xffff, 0);
    } else {
      // use a MOVZ and MOVK (makes it easier to debug)
      movzw(dst, imm_h[0], 0);
      movkw(dst, imm_h[1], 16);
    }
  }
}

// Form an address from base + offset in Rd.  Rd may or may
// not actually be used: you must use the Address that is returned.
// It is up to you to ensure that the shift provided matches the size
// of your data.
Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
  if (Address::offset_ok_for_immed(byte_offset, shift))
    // It fits; no need for any heroics
    return Address(base, byte_offset);

  // Don't do anything clever with negative or misaligned offsets
  unsigned mask = (1 << shift) - 1;
  if (byte_offset < 0 || byte_offset & mask) {
    mov(Rd, byte_offset);
    add(Rd, base, Rd);
    return Address(Rd);
  }

  // See if we can do this with two 12-bit offsets
  {
    unsigned long word_offset = byte_offset >> shift;
    // High 12 bits of the (shifted) offset go into an ADD immediate,
    // low bits stay in the returned Address.
    unsigned long masked_offset = word_offset & 0xfff000;
    if (Address::offset_ok_for_immed(word_offset - masked_offset)
        && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
      add(Rd, base, masked_offset << shift);
      word_offset -= masked_offset;
      return Address(Rd, word_offset << shift);
    }
  }

  // Do it the hard way: materialize the offset and add it to base.
  mov(Rd, byte_offset);
  add(Rd, base, Rd);
  return Address(Rd);
}

// Atomically increment the 32-bit counter at [counter_addr] by one.
// Uses an LSE LDADD when available, otherwise an ldxrw/stxrw retry
// loop.  Clobbers tmp; tmp2 receives the store-exclusive status.
void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
  if (UseLSE) {
    mov(tmp, 1);
    ldadd(Assembler::word, tmp, zr, counter_addr);
    return;
  }
  Label retry_load;
  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
    prfm(Address(counter_addr), PSTL1STRM);
  bind(retry_load);
  // flush and load exclusive from the memory location
  ldxrw(tmp, counter_addr);
  addw(tmp, tmp, 1);
  // if we store+flush with no intervening write tmp2 will be zero
  stxrw(tmp2, tmp, counter_addr);
  cbnzw(tmp2, retry_load);
}


int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
                                    bool want_remainder, Register scratch)
{
  // Full implementation of Java idiv and irem.  The function
  // returns the (pc) offset of the div instruction - may be needed
  // for implicit exceptions.
  //
  // constraint : ra/rb =/= scratch
  //         normal case
  //
  // input : ra: dividend
  //         rb: divisor
  //
  // result: either
  //         quotient  (= ra idiv rb)
  //         remainder (= ra irem rb)

  assert(ra != scratch && rb != scratch, "reg cannot be scratch");

  int idivl_offset = offset();
  if (! want_remainder) {
    sdivw(result, ra, rb);
  } else {
    sdivw(scratch, ra, rb);
    // remainder = dividend - quotient * divisor
    Assembler::msubw(result, scratch, rb, ra);
  }

  return idivl_offset;
}

int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
                                    bool want_remainder, Register scratch)
{
  // Full implementation of Java ldiv and lrem.  The function
  // returns the (pc) offset of the div instruction - may be needed
  // for implicit exceptions.
  //
  // constraint : ra/rb =/= scratch
  //         normal case
  //
  // input : ra: dividend
  //         rb: divisor
  //
  // result: either
  //         quotient  (= ra idiv rb)
  //         remainder (= ra irem rb)

  assert(ra != scratch && rb != scratch, "reg cannot be scratch");

  int idivq_offset = offset();
  if (! want_remainder) {
    sdiv(result, ra, rb);
  } else {
    sdiv(scratch, ra, rb);
    // remainder = dividend - quotient * divisor
    Assembler::msub(result, scratch, rb, ra);
  }

  return idivq_offset;
}

// Emit a memory barrier, merging with an immediately preceding DMB
// when possible.
void MacroAssembler::membar(Membar_mask_bits order_constraint) {
  address prev = pc() - NativeMembar::instruction_size;
  address last = code()->last_insn();
  if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
    NativeMembar *bar = NativeMembar_at(prev);
    // We are merging two memory barrier instructions.  On AArch64 we
    // can do this simply by ORing them together.
    bar->set_kind(bar->get_kind() | order_constraint);
    BLOCK_COMMENT("merged membar");
  } else {
    // Remember this barrier so a following one can merge with it.
    code()->set_last_insn(pc());
    dmb(Assembler::barrier(order_constraint));
  }
}

// Try to merge the current load/store with the immediately preceding
// one into an ldp/stp.  Returns true when the pair was emitted (the
// caller must not emit the single instruction).  Otherwise records
// this instruction as a candidate for the next merge, but only for
// base_plus_offset addresses with a size-aligned offset.
bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
  if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
    merge_ldst(rt, adr, size_in_bytes, is_store);
    code()->clear_last_insn();
    return true;
  } else {
    assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported.");
    const unsigned mask = size_in_bytes - 1;
    if (adr.getMode() == Address::base_plus_offset &&
        (adr.offset() & mask) == 0) { // only supports base_plus_offset.
      code()->set_last_insn(pc());
    }
    return false;
  }
}

void MacroAssembler::ldr(Register Rx, const Address &adr) {
  // We always try to merge two adjacent loads into one ldp.
  if (!try_merge_ldst(Rx, adr, 8, false)) {
    Assembler::ldr(Rx, adr);
  }
}

void MacroAssembler::ldrw(Register Rw, const Address &adr) {
  // We always try to merge two adjacent loads into one ldp.
  if (!try_merge_ldst(Rw, adr, 4, false)) {
    Assembler::ldrw(Rw, adr);
  }
}

void MacroAssembler::str(Register Rx, const Address &adr) {
  // We always try to merge two adjacent stores into one stp.
  if (!try_merge_ldst(Rx, adr, 8, true)) {
    Assembler::str(Rx, adr);
  }
}

void MacroAssembler::strw(Register Rw, const Address &adr) {
  // We always try to merge two adjacent stores into one stp.
  if (!try_merge_ldst(Rw, adr, 4, true)) {
    Assembler::strw(Rw, adr);
  }
}

// MacroAssembler routines found actually to be needed

// Push a single register onto the expression stack (esp).
void MacroAssembler::push(Register src)
{
  str(src, Address(pre(esp, -1 * wordSize)));
}

// Pop a single register from the expression stack (esp).
void MacroAssembler::pop(Register dst)
{
  ldr(dst, Address(post(esp, 1 * wordSize)));
}

// Note: load_unsigned_short used to be called load_unsigned_word.
// Each of the following loaders returns the (pc) offset of the load
// instruction, for use with implicit null checks.
int MacroAssembler::load_unsigned_short(Register dst, Address src) {
  int off = offset();
  ldrh(dst, src);
  return off;
}

int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
  int off = offset();
  ldrb(dst, src);
  return off;
}

int MacroAssembler::load_signed_short(Register dst, Address src) {
  int off = offset();
  ldrsh(dst, src);
  return off;
}

int MacroAssembler::load_signed_byte(Register dst, Address src) {
  int off = offset();
  ldrsb(dst, src);
  return off;
}

// 32-bit destination variants of the signed loads.
int MacroAssembler::load_signed_short32(Register dst, Address src) {
  int off = offset();
  ldrshw(dst, src);
  return off;
}

int MacroAssembler::load_signed_byte32(Register dst, Address src) {
  int off = offset();
  ldrsbw(dst, src);
  return off;
}

// Load a value of the given byte size, sign- or zero-extending as
// requested.  dst2 is unused on AArch64.
void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
  switch (size_in_bytes) {
  case 8: ldr(dst, src); break;
  case 4: ldrw(dst, src); break;
  case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
  case 1: is_signed ?
          load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
  default: ShouldNotReachHere();
  }
}

// Store a value of the given byte size.  src2 is unused on AArch64.
void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
  switch (size_in_bytes) {
  case 8: str(src, dst); break;
  case 4: strw(src, dst); break;
  case 2: strh(src, dst); break;
  case 1: strb(src, dst); break;
  default: ShouldNotReachHere();
  }
}

// Subtract a 32-bit immediate from a register; values too big for an
// immediate SUB are materialized in rscratch2 first.
void MacroAssembler::decrementw(Register reg, int value)
{
  if (value < 0)  { incrementw(reg, -value);      return; }
  if (value == 0) {                               return; }
  if (value < (1 << 12)) { subw(reg, reg, value); return; }
  /* else */ {
    // NOTE(review): this uses guarantee() while decrement() below uses
    // assert() for the same condition — confirm the difference is intended.
    guarantee(reg != rscratch2, "invalid dst for register decrement");
    movw(rscratch2, (unsigned)value);
    subw(reg, reg, rscratch2);
  }
}

// 64-bit variant of decrementw.
void MacroAssembler::decrement(Register reg, int value)
{
  if (value < 0)  { increment(reg, -value);      return; }
  if (value == 0) {                              return; }
  if (value < (1 << 12)) { sub(reg, reg, value); return; }
  /* else */ {
    assert(reg != rscratch2, "invalid dst for register decrement");
    mov(rscratch2, (unsigned long)value);
    sub(reg, reg, rscratch2);
  }
}

// Decrement a 32-bit value in memory.  Clobbers rscratch1 (and
// rscratch2 for literal addresses / large values).
void MacroAssembler::decrementw(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid dst for address decrement");
  if (dst.getMode() == Address::literal) {
    assert(abs(value) < (1 << 12), "invalid value and address mode combination");
    lea(rscratch2, dst);
    dst = Address(rscratch2);
  }
  ldrw(rscratch1, dst);
  decrementw(rscratch1, value);
  strw(rscratch1, dst);
}

// Decrement a 64-bit value in memory.  Clobbers rscratch1 (and
// rscratch2 for literal addresses / large values).
void MacroAssembler::decrement(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid address for decrement");
  if (dst.getMode() == Address::literal) {
    assert(abs(value) < (1 << 12), "invalid value and address mode combination");
    lea(rscratch2, dst);
    dst = Address(rscratch2);
  }
  ldr(rscratch1, dst);
  decrement(rscratch1, value);
  str(rscratch1, dst);
}

// Add a 32-bit immediate to a register; values too big for an
// immediate ADD are materialized in rscratch2 first.
void MacroAssembler::incrementw(Register reg, int value)
{
  if (value < 0)  { decrementw(reg, -value);      return; }
  if (value == 0) {                               return; }
  if (value < (1 << 12)) { addw(reg, reg, value); return; }
  /* else */ {
    assert(reg != rscratch2, "invalid dst for register increment");
    movw(rscratch2, (unsigned)value);
    addw(reg, reg, rscratch2);
  }
}

// 64-bit variant of incrementw.
void MacroAssembler::increment(Register reg, int value)
{
  if (value < 0)  { decrement(reg, -value);      return; }
  if (value == 0) {                              return; }
  if (value < (1 << 12)) { add(reg, reg, value); return; }
  /* else */ {
    assert(reg != rscratch2, "invalid dst for register increment");
    movw(rscratch2, (unsigned)value);
    add(reg, reg, rscratch2);
  }
}

// Increment a 32-bit value in memory.  Clobbers rscratch1 (and
// rscratch2 for literal addresses / large values).
void MacroAssembler::incrementw(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid dst for address increment");
  if (dst.getMode() == Address::literal) {
    assert(abs(value) < (1 << 12), "invalid value and address mode combination");
    lea(rscratch2, dst);
    dst = Address(rscratch2);
  }
  ldrw(rscratch1, dst);
  incrementw(rscratch1, value);
  strw(rscratch1, dst);
}

// Increment a 64-bit value in memory.  Clobbers rscratch1 (and
// rscratch2 for literal addresses / large values).
void MacroAssembler::increment(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid dst for address increment");
  if (dst.getMode() == Address::literal) {
    assert(abs(value) < (1 << 12), "invalid value and address mode combination");
    lea(rscratch2, dst);
    dst = Address(rscratch2);
  }
  ldr(rscratch1, dst);
  increment(rscratch1, value);
  str(rscratch1, dst);
}


// Push/pop r0-r30 (bit i of the mask selects register i).
void MacroAssembler::pusha() {
  push(0x7fffffff, sp);
}

void MacroAssembler::popa() {
  pop(0x7fffffff, sp);
}

// Push lots of registers in the bit set supplied.
// Don't push sp.
// Return the number of words pushed
int MacroAssembler::push(unsigned int bitset, Register stack) {
  int words_pushed = 0;

  // Scan bitset to accumulate register pairs
  unsigned char regs[32];
  int count = 0;
  for (int reg = 0; reg <= 30; reg++) {
    if (1 & bitset)
      regs[count++] = reg;
    bitset >>= 1;
  }
  // Pad an odd count with zr so we always emit full stp pairs.
  regs[count++] = zr->encoding_nocheck();
  count &= ~1;  // Only push an even number of regs

  // First pair also pre-decrements the stack by the full frame size.
  if (count) {
    stp(as_Register(regs[0]), as_Register(regs[1]),
       Address(pre(stack, -count * wordSize)));
    words_pushed += 2;
  }
  for (int i = 2; i < count; i += 2) {
    stp(as_Register(regs[i]), as_Register(regs[i+1]),
       Address(stack, i * wordSize));
    words_pushed += 2;
  }

  assert(words_pushed == count, "oops, pushed != count");

  return count;
}

// Pop the registers in the bit set supplied (inverse of push above).
// Returns the number of words popped.
int MacroAssembler::pop(unsigned int bitset, Register stack) {
  int words_pushed = 0;

  // Scan bitset to accumulate register pairs
  unsigned char regs[32];
  int count = 0;
  for (int reg = 0; reg <= 30; reg++) {
    if (1 & bitset)
      regs[count++] = reg;
    bitset >>= 1;
  }
  regs[count++] = zr->encoding_nocheck();
  count &= ~1;

  // Inner pairs first; the last ldp post-increments the stack back.
  for (int i = 2; i < count; i += 2) {
    ldp(as_Register(regs[i]), as_Register(regs[i+1]),
       Address(stack, i * wordSize));
    words_pushed += 2;
  }
  if (count) {
    ldp(as_Register(regs[0]), as_Register(regs[1]),
       Address(post(stack, count * wordSize)));
    words_pushed += 2;
  }

  assert(words_pushed == count, "oops, pushed != count");

  return count;
}
#ifdef ASSERT
// Debug-only check that rheapbase still holds the narrow-oop base.
// The body is currently disabled (#if 0).
void MacroAssembler::verify_heapbase(const char* msg) {
#if 0
  assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  if (CheckCompressedOops) {
    Label ok;
    push(1 <<
rscratch1->encoding(), sp); // cmpptr trashes rscratch1 2114 cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr())); 2115 br(Assembler::EQ, ok); 2116 stop(msg); 2117 bind(ok); 2118 pop(1 << rscratch1->encoding(), sp); 2119 } 2120 #endif 2121 } 2122 #endif 2123 2124 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) { 2125 Label done, not_weak; 2126 cbz(value, done); // Use NULL as-is. 2127 2128 STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u); 2129 tbz(r0, 0, not_weak); // Test for jweak tag. 2130 2131 // Resolve jweak. 2132 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value, 2133 Address(value, -JNIHandles::weak_tag_value), tmp, thread); 2134 verify_oop(value); 2135 b(done); 2136 2137 bind(not_weak); 2138 // Resolve (untagged) jobject. 2139 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread); 2140 verify_oop(value); 2141 bind(done); 2142 } 2143 2144 void MacroAssembler::stop(const char* msg) { 2145 address ip = pc(); 2146 pusha(); 2147 // We use movptr rather than mov here because we need code size not 2148 // to depend on the pointer value of msg otherwise C2 can observe 2149 // the same node with different sizes when emitted in a scratch 2150 // buffer and later when emitted for good. 2151 movptr(c_rarg0, (uintptr_t)msg); 2152 movptr(c_rarg1, (uintptr_t)ip); 2153 mov(c_rarg2, sp); 2154 mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 2155 // call(c_rarg3); 2156 blrt(c_rarg3, 3, 0, 1); 2157 hlt(0); 2158 } 2159 2160 void MacroAssembler::unimplemented(const char* what) { 2161 const char* buf = NULL; 2162 { 2163 ResourceMark rm; 2164 stringStream ss; 2165 ss.print("unimplemented: %s", what); 2166 buf = code_string(ss.as_string()); 2167 } 2168 stop(buf); 2169 } 2170 2171 // If a constant does not fit in an immediate field, generate some 2172 // number of MOV instructions and then perform the operation. 
void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
                                           add_sub_imm_insn insn1,
                                           add_sub_reg_insn insn2) {
  assert(Rd != zr, "Rd = zr and not setting flags?");
  if (operand_valid_for_add_sub_immediate((int)imm)) {
    (this->*insn1)(Rd, Rn, imm);
  } else {
    if (uabs(imm) < (1 << 24)) {
      // Split a 24-bit immediate into two 12-bit halves (the second
      // ADD/SUB uses the shifted-immediate form implicitly).
      (this->*insn1)(Rd, Rn, imm & -(1 << 12));
      (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
    } else {
      // Materialize the constant in Rd and use the register form.
      assert_different_registers(Rd, Rn);
      mov(Rd, (uint64_t)imm);
      (this->*insn2)(Rd, Rn, Rd, LSL, 0);
    }
  }
}

// Separate version which sets the flags. Optimisations are more restricted
// because we must set the flags correctly.
void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
                                             add_sub_imm_insn insn1,
                                             add_sub_reg_insn insn2) {
  if (operand_valid_for_add_sub_immediate((int)imm)) {
    (this->*insn1)(Rd, Rn, imm);
  } else {
    assert_different_registers(Rd, Rn);
    assert(Rd != zr, "overflow in immediate operand");
    mov(Rd, (uint64_t)imm);
    (this->*insn2)(Rd, Rn, Rd, LSL, 0);
  }
}


// RegisterOrConstant convenience wrappers for add/sub.
void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
  if (increment.is_register()) {
    add(Rd, Rn, increment.as_register());
  } else {
    add(Rd, Rn, increment.as_constant());
  }
}

void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
  if (increment.is_register()) {
    addw(Rd, Rn, increment.as_register());
  } else {
    addw(Rd, Rn, increment.as_constant());
  }
}

void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
  if (decrement.is_register()) {
    sub(Rd, Rn, decrement.as_register());
  } else {
    sub(Rd, Rn, decrement.as_constant());
  }
}

void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
  if (decrement.is_register()) {
    subw(Rd, Rn, decrement.as_register());
  } else {
    subw(Rd, Rn, decrement.as_constant());
  }
}

// Reload rheapbase with the narrow-oop base.  Before the universe is
// fully initialized the base must be loaded indirectly.
void MacroAssembler::reinit_heapbase()
{
  if (UseCompressedOops) {
    if (Universe::is_fully_initialized()) {
      mov(rheapbase, Universe::narrow_ptrs_base());
    } else {
      lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
      ldr(rheapbase, Address(rheapbase));
    }
  }
}

// this simulates the behaviour of the x86 cmpxchg instruction using a
// load linked/store conditional pair. we use the acquire/release
// versions of these instructions so that we flush pending writes as
// per Java semantics.

// n.b the x86 version assumes the old value to be compared against is
// in rax and updates rax with the value located in memory if the
// cmpxchg fails. we supply a register for the old value explicitly

// the aarch64 load linked/store conditional instructions do not
// accept an offset. so, unlike x86, we must provide a plain register
// to identify the memory word to be compared/exchanged rather than a
// register+offset Address.

// Compare-and-exchange a 64-bit word.  Falls through to 'succeed' on
// success; on failure returns the current memory value in oldv and
// either branches to *fail or falls through.  Clobbers tmp.
void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
                                Label &succeed, Label *fail) {
  // oldv holds comparison value
  // newv holds value to write in exchange
  // addr identifies memory word to compare against/update
  if (UseLSE) {
    mov(tmp, oldv);
    casal(Assembler::xword, oldv, newv, addr);
    cmp(tmp, oldv);
    br(Assembler::EQ, succeed);
    membar(AnyAny);
  } else {
    Label retry_load, nope;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    // flush and load exclusive from the memory location
    // and fail if it is not what we expect
    ldaxr(tmp, addr);
    cmp(tmp, oldv);
    br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
    stlxr(tmp, newv, addr);
    cbzw(tmp, succeed);
    // retry so we only ever return after a load fails to compare
    // ensures we don't return a stale value after a failed write.
    b(retry_load);
    // if the memory word differs we return it in oldv and signal a fail
    bind(nope);
    membar(AnyAny);
    mov(oldv, tmp);
  }
  if (fail)
    b(*fail);
}

// CAS on an object's mark word (assumed to be at offset 0).
void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
                                        Label &succeed, Label *fail) {
  assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
  cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
}

// 32-bit variant of cmpxchgptr.
void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
                              Label &succeed, Label *fail) {
  // oldv holds comparison value
  // newv holds value to write in exchange
  // addr identifies memory word to compare against/update
  // tmp returns 0/1 for success/failure
  if (UseLSE) {
    mov(tmp, oldv);
    casal(Assembler::word, oldv, newv, addr);
    cmp(tmp, oldv);
    br(Assembler::EQ, succeed);
    membar(AnyAny);
  } else {
    Label retry_load, nope;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    // flush and load exclusive from the memory location
    // and fail if it is not what we expect
    ldaxrw(tmp, addr);
    cmp(tmp, oldv);
    br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
    stlxrw(tmp, newv, addr);
    cbzw(tmp, succeed);
    // retry so we only ever return after a load fails to compare
    // ensures we don't return a stale value after a failed write.
    b(retry_load);
    // if the memory word differs we return it in oldv and signal a fail
    bind(nope);
    membar(AnyAny);
    mov(oldv, tmp);
  }
  if (fail)
    b(*fail);
}

// A generic CAS; success or failure is in the EQ flag.  A weak CAS
// doesn't retry and may fail spuriously.  If the oldval is wanted,
// Pass a register for the result, otherwise pass noreg.

// Clobbers rscratch1
void MacroAssembler::cmpxchg(Register addr, Register expected,
                             Register new_val,
                             enum operand_size size,
                             bool acquire, bool release,
                             bool weak,
                             Register result) {
  if (result == noreg)  result = rscratch1;
  if (UseLSE) {
    mov(result, expected);
    lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
    cmp(result, expected);
  } else {
    BLOCK_COMMENT("cmpxchg {");
    Label retry_load, done;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    load_exclusive(result, addr, size, acquire);
    if (size == xword)
      cmp(result, expected);
    else
      cmpw(result, expected);
    br(Assembler::NE, done);
    store_exclusive(rscratch1, new_val, addr, size, release);
    if (weak) {
      cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
    } else {
      cbnzw(rscratch1, retry_load);
    }
    bind(done);
    BLOCK_COMMENT("} cmpxchg");
  }
}

// CAS on an oop field; delegates to the barrier-set assembler so GC
// barriers and compressed-oop encoding are applied as needed.
void MacroAssembler::cmpxchg_oop(Register addr, Register expected, Register new_val,
                                 bool acquire, bool release, bool weak, bool encode,
                                 Register tmp1, Register tmp2,
                                 Register tmp3, Register result) {
  BarrierSetAssembler* bsa = BarrierSet::barrier_set()->barrier_set_assembler();
  bsa->cmpxchg_oop(this, addr, expected, new_val, acquire, release, weak, encode, tmp1, tmp2, tmp3, result);
}

// Returns true if 'a' is distinct from both 'b' and 'c' (and b != c
// when b is a register) — used to decide whether 'prev' can double as
// the result register in the atomic ops below.
static bool different(Register a, RegisterOrConstant b, Register c) {
  if (b.is_constant())
    return a != c;
  else
    return a != b.as_register() && a != c && b.as_register() != c;
}

// Atomic read-modify-write of [addr] by 'incr'; the previous memory
// value is returned in 'prev' (may be noreg).  Uses an LSE atomic when
// available, otherwise an LDXR/STXR retry loop.  IOP reconstructs the
// old value from the new one when 'prev' aliases the loop registers.
// Clobbers rscratch1 and rscratch2.
#define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
  if (UseLSE) {                                                         \
    prev = prev->is_valid() ? prev : zr;                                \
    if (incr.is_register()) {                                           \
      AOP(sz, incr.as_register(), prev, addr);                          \
    } else {                                                            \
      mov(rscratch2, incr.as_constant());                               \
      AOP(sz, rscratch2, prev, addr);                                   \
    }                                                                   \
    return;                                                             \
  }                                                                     \
  Register result = rscratch2;                                          \
  if (prev->is_valid())                                                 \
    result = different(prev, incr, addr) ? prev : rscratch2;            \
                                                                        \
  Label retry_load;                                                     \
  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
    prfm(Address(addr), PSTL1STRM);                                     \
  bind(retry_load);                                                     \
  LDXR(result, addr);                                                   \
  OP(rscratch1, result, incr);                                          \
  STXR(rscratch2, rscratch1, addr);                                     \
  cbnzw(rscratch2, retry_load);                                         \
  if (prev->is_valid() && prev != result) {                             \
    IOP(prev, rscratch1, incr);                                         \
  }                                                                     \
}

ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)

#undef ATOMIC_OP

// Atomic exchange of [addr] with 'newv'; the previous memory value is
// returned in 'prev' (may be noreg).  Clobbers rscratch1/rscratch2.
#define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
  if (UseLSE) {                                                         \
    prev = prev->is_valid() ? prev : zr;                                \
    AOP(sz, newv, prev, addr);                                          \
    return;                                                             \
  }                                                                     \
  Register result = rscratch2;                                          \
  if (prev->is_valid())                                                 \
    result = different(prev, newv, addr) ? prev : rscratch2;            \
                                                                        \
  Label retry_load;                                                     \
  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
    prfm(Address(addr), PSTL1STRM);                                     \
  bind(retry_load);                                                     \
  LDXR(result, addr);                                                   \
  STXR(rscratch1, newv, addr);                                          \
  cbnzw(rscratch1, retry_load);                                         \
  if (prev->is_valid() && prev != result)                               \
    mov(prev, result);                                                  \
}

ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)

#undef ATOMIC_XCHG

#ifndef PRODUCT
extern "C" void findpc(intptr_t x);
#endif

// Debugging aid called from generated stop() code: optionally shows a
// message box and dumps the saved register file.
void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
{
  // In order to get locks to work, we need to fake a in_VM state
  if (ShowMessageBoxOnError ) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
#ifndef PRODUCT
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
#endif
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      ttyLocker ttyl;
      tty->print_cr(" pc = 0x%016lx", pc);
#ifndef PRODUCT
      tty->cr();
      findpc(pc);
      tty->cr();
#endif
      tty->print_cr(" r0 = 0x%016lx", regs[0]);
      tty->print_cr(" r1 = 0x%016lx", regs[1]);
      tty->print_cr(" r2 = 0x%016lx", regs[2]);
      tty->print_cr(" r3 = 0x%016lx", regs[3]);
      tty->print_cr(" r4 = 0x%016lx", regs[4]);
      tty->print_cr(" r5 = 0x%016lx", regs[5]);
      tty->print_cr(" r6 = 0x%016lx", regs[6]);
      tty->print_cr(" r7 = 0x%016lx", regs[7]);
      tty->print_cr(" r8 = 0x%016lx", regs[8]);
      tty->print_cr(" r9 = 0x%016lx", regs[9]);
      tty->print_cr("r10 = 0x%016lx", regs[10]);
      tty->print_cr("r11 = 0x%016lx",
                    regs[11]);
      tty->print_cr("r12 = 0x%016lx", regs[12]);
      tty->print_cr("r13 = 0x%016lx", regs[13]);
      tty->print_cr("r14 = 0x%016lx", regs[14]);
      tty->print_cr("r15 = 0x%016lx", regs[15]);
      tty->print_cr("r16 = 0x%016lx", regs[16]);
      tty->print_cr("r17 = 0x%016lx", regs[17]);
      tty->print_cr("r18 = 0x%016lx", regs[18]);
      tty->print_cr("r19 = 0x%016lx", regs[19]);
      tty->print_cr("r20 = 0x%016lx", regs[20]);
      tty->print_cr("r21 = 0x%016lx", regs[21]);
      tty->print_cr("r22 = 0x%016lx", regs[22]);
      tty->print_cr("r23 = 0x%016lx", regs[23]);
      tty->print_cr("r24 = 0x%016lx", regs[24]);
      tty->print_cr("r25 = 0x%016lx", regs[25]);
      tty->print_cr("r26 = 0x%016lx", regs[26]);
      tty->print_cr("r27 = 0x%016lx", regs[27]);
      tty->print_cr("r28 = 0x%016lx", regs[28]);
      // NOTE(review): r29 (frame pointer) is not printed here — confirm
      // that the omission is intentional.
      tty->print_cr("r30 = 0x%016lx", regs[30]);
      tty->print_cr("r31 = 0x%016lx", regs[31]);
      BREAKPOINT;
    }
    ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
  } else {
    ttyLocker ttyl;
    ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
                    msg);
    assert(false, "DEBUG MESSAGE: %s", msg);
  }
}

#ifdef BUILTIN_SIM
// routine to generate an x86 prolog for a stub function which
// bootstraps into the generated ARM code which directly follows the
// stub
//
// the argument encodes the number of general and fp registers
// passed by the caller and the calling convention (currently just
// the number of general registers and assumes C argument passing)

extern "C" {
  int aarch64_stub_prolog_size();
  void aarch64_stub_prolog();
  void aarch64_prolog();
}

// Emit nops to reserve space for the x86 prolog, then overwrite them
// with a copy of aarch64_stub_prolog and patch in the setup routine
// address and call format.
void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type,
                                   address *prolog_ptr)
{
  int calltype = (((ret_type & 0x3) << 8) |
                  ((fp_arg_count & 0xf) << 4) |
                  (gp_arg_count & 0xf));

  // the
  // addresses for the x86 to ARM entry code we need to use
  address start = pc();
  // printf("start = %lx\n", start);
  int byteCount = aarch64_stub_prolog_size();
  // printf("byteCount = %x\n", byteCount);
  int instructionCount = (byteCount + 3)/ 4;
  // printf("instructionCount = %x\n", instructionCount);
  for (int i = 0; i < instructionCount; i++) {
    nop();
  }

  memcpy(start, (void*)aarch64_stub_prolog, byteCount);

  // write the address of the setup routine and the call format at the
  // end of into the copied code
  u_int64_t *patch_end = (u_int64_t *)(start + byteCount);
  if (prolog_ptr)
    patch_end[-2] = (u_int64_t)prolog_ptr;
  patch_end[-1] = calltype;
}
#endif

// Save the call-clobbered FP/SIMD registers (v0-v7, v16-v31), 64 bits
// each, below sp.
void MacroAssembler::push_call_clobbered_fp_registers() {
  int step = 4 * wordSize;
  sub(sp, sp, step);
  mov(rscratch1, -step);
  // Push v0-v7, v16-v31.
  for (int i = 31; i>= 4; i -= 4) {
    if (i <= v7->encoding() || i >= v16->encoding())
      st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
          as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
  }
  st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
      as_FloatRegister(3), T1D, Address(sp));
}

// Restore the registers saved by push_call_clobbered_fp_registers.
void MacroAssembler::pop_call_clobbered_fp_registers() {
  for (int i = 0; i < 32; i += 4) {
    if (i <= v7->encoding() || i >= v16->encoding())
      ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
          as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
  }
}

// Save all call-clobbered integer registers (minus the scratch
// registers) plus the call-clobbered FP/SIMD registers.
void MacroAssembler::push_call_clobbered_registers() {
  push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
  push_call_clobbered_fp_registers();
}

void MacroAssembler::pop_call_clobbered_registers() {
  pop_call_clobbered_fp_registers();
  pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
}

// Save the full CPU state: r0-r29 plus v0-v31 (128 bits each when
// save_vectors, else 64 bits).
void MacroAssembler::push_CPU_state(bool save_vectors) {
  int step = (save_vectors ? 8 : 4) * wordSize;
  push(0x3fffffff, sp);         // integer registers except lr & sp
  mov(rscratch1, -step);
  sub(sp, sp, step);
  for (int i = 28; i >= 4; i -= 4) {
    st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
        as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
  }
  st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
}

// Restore the state saved by push_CPU_state.
void MacroAssembler::pop_CPU_state(bool restore_vectors) {
  int step = (restore_vectors ? 8 : 4) * wordSize;
  for (int i = 0; i <= 28; i += 4)
    ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
        as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
  pop(0x3fffffff, sp);          // integer registers except lr & sp
}

/**
 * Helpers for multiply_to_len().
 */
void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
                                     Register src1, Register src2) {
  // (final_dest_hi:dest_lo) += src1 + src2, propagating carries.
  adds(dest_lo, dest_lo, src1);
  adc(dest_hi, dest_hi, zr);
  adds(dest_lo, dest_lo, src2);
  adc(final_dest_hi, dest_hi, zr);
}

// Generate an address from (r + r1 extend offset).  "size" is the
// size of the operand.  The result may be in rscratch2.
Address MacroAssembler::offsetted_address(Register r, Register r1,
                                          Address::extend ext, int offset, int size) {
  if (offset || (ext.shift() % size != 0)) {
    lea(rscratch2, Address(r, r1, ext));
    return Address(rscratch2, offset);
  } else {
    return Address(r, r1, ext);
  }
}

// Form an sp-relative address for a spill slot, using tmp as a base
// register when the offset does not fit the addressing mode.
Address MacroAssembler::spill_address(int size, int offset, Register tmp)
{
  assert(offset >= 0, "spill to negative address?");
  // Offset reachable ?
  //   Not aligned - 9 bits signed offset
  //   Aligned - 12 bits unsigned offset shifted
  Register base = sp;
  // Misaligned offsets beyond the signed 9-bit range: peel off the low
  // 12 bits into an ADD so the remainder is 4K-aligned.
  if ((offset & (size-1)) && offset >= (1<<8)) {
    add(tmp, base, offset & ((1<<12)-1));
    base = tmp;
    offset &= -1<<12;
  }

  // Offsets beyond the scaled unsigned 12-bit range: peel off the next
  // 12 bits as well.
  if (offset >= (1<<12) * size) {
    add(tmp, base, offset & (((1<<12)-1)<<12));
    base = tmp;
    offset &= ~(((1<<12)-1)<<12);
  }

  return Address(base, offset);
}

// Checks whether offset is aligned.
// Returns true if it is, else false.
bool MacroAssembler::merge_alignment_check(Register base,
                                           size_t size,
                                           long cur_offset,
                                           long prev_offset) const {
  if (AvoidUnalignedAccesses) {
    if (base == sp) {
      // Checks whether low offset is aligned to pair of registers.
      long pair_mask = size * 2 - 1;
      long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
      return (offset & pair_mask) == 0;
    } else { // If base is not sp, we can't guarantee the access is aligned.
      return false;
    }
  } else {
    long mask = size - 1;
    // Load/store pair instruction only supports element size aligned offset.
    return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
  }
}

// Checks whether current and previous loads/stores can be merged.
// Returns true if it can be merged, else false.
bool MacroAssembler::ldst_can_merge(Register rt,
                                    const Address &adr,
                                    size_t cur_size_in_bytes,
                                    bool is_store) const {
  address prev = pc() - NativeInstruction::instruction_size;
  address last = code()->last_insn();

  // The previous instruction must exist and be an immediate-offset
  // load/store for a merge to even be considered.
  if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
    return false;
  }

  if (adr.getMode() != Address::base_plus_offset || prev != last) {
    return false;
  }

  NativeLdSt* prev_ldst = NativeLdSt_at(prev);
  size_t prev_size_in_bytes = prev_ldst->size_in_bytes();

  assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
  assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");

  // Only two accesses of the same width and the same direction
  // (both loads or both stores) can form an ldp/stp.
  if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
    return false;
  }

  // ldp/stp immediate is a signed 7-bit field scaled by the element size.
  long max_offset = 63 * prev_size_in_bytes;
  long min_offset = -64 * prev_size_in_bytes;

  assert(prev_ldst->is_not_pre_post_index(), "pre-index or post-index is not supported to be merged.");

  // Only same base can be merged.
  if (adr.base() != prev_ldst->base()) {
    return false;
  }

  // The two offsets must be exactly adjacent.
  long cur_offset = adr.offset();
  long prev_offset = prev_ldst->offset();
  size_t diff = abs(cur_offset - prev_offset);
  if (diff != prev_size_in_bytes) {
    return false;
  }

  // Following cases can not be merged:
  //   ldr x2, [x2, #8]
  //   ldr x3, [x2, #16]
  // or:
  //   ldr x2, [x3, #8]
  //   ldr x2, [x3, #16]
  // If t1 and t2 is the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
  if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
    return false;
  }

  long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
  // Offset range must be in ldp/stp instruction's range.
  if (low_offset > max_offset || low_offset < min_offset) {
    return false;
  }

  if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
    return true;
  }

  return false;
}

// Merge current load/store with previous load/store into ldp/stp.
void MacroAssembler::merge_ldst(Register rt,
                                const Address &adr,
                                size_t cur_size_in_bytes,
                                bool is_store) {

  assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store) == true, "cur and prev must be able to be merged.");

  Register rt_low, rt_high;
  address prev = pc() - NativeInstruction::instruction_size;
  NativeLdSt* prev_ldst = NativeLdSt_at(prev);

  long offset;

  // Order the two target registers by ascending memory offset, as
  // required by the pair-instruction encoding.
  if (adr.offset() < prev_ldst->offset()) {
    offset = adr.offset();
    rt_low = rt;
    rt_high = prev_ldst->target();
  } else {
    offset = prev_ldst->offset();
    rt_low = prev_ldst->target();
    rt_high = rt;
  }

  Address adr_p = Address(prev_ldst->base(), offset);
  // Overwrite previous generated binary.
  code_section()->set_end(prev);

  const int sz = prev_ldst->size_in_bytes();
  assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
  if (!is_store) {
    BLOCK_COMMENT("merged ldr pair");
    if (sz == 8) {
      ldp(rt_low, rt_high, adr_p);
    } else {
      ldpw(rt_low, rt_high, adr_p);
    }
  } else {
    BLOCK_COMMENT("merged str pair");
    if (sz == 8) {
      stp(rt_low, rt_high, adr_p);
    } else {
      stpw(rt_low, rt_high, adr_p);
    }
  }
}

/**
 * Multiply 64 bit by 64 bit first loop.
 */
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
                                           Register y, Register y_idx, Register z,
                                           Register carry, Register product,
                                           Register idx, Register kdx) {
  //
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;
  //

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_one_x);   // only a single 32-bit word of x left

  lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
  ldr(x_xstart, Address(rscratch1));
  ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian

  bind(L_first_loop);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_first_loop_exit);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_one_y);   // only a single 32-bit word of y left
  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(y_idx, Address(rscratch1));
  ror(y_idx, y_idx, 32); // convert big-endian to little-endian
  bind(L_multiply);

  // AArch64 has a multiply-accumulate instruction that we can't use
  // here because it has no way to process carries, so we have to use
  // separate add and adc instructions.  Bah.
  umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
  mul(product, x_xstart, y_idx);
  adds(product, product, carry);
  adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product

  subw(kdx, kdx, 2);
  ror(product, product, 32); // back to big-endian
  str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));

  b(L_first_loop);

  bind(L_one_y);
  ldrw(y_idx, Address(y, 0));
  b(L_multiply);

  bind(L_one_x);
  ldrw(x_xstart, Address(x, 0));
  b(L_first_loop);

  bind(L_first_loop_exit);
}

/**
 * Multiply 128 bit by 128. Unrolled inner loop.
 *
 */
void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
                                             Register carry, Register carry2,
                                             Register idx, Register jdx,
                                             Register yz_idx1, Register yz_idx2,
                                             Register tmp, Register tmp3, Register tmp4,
                                             Register tmp6, Register product_hi) {

  //   jlong carry, x[], y[], z[];
  //   int kdx = ystart+1;
  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
  //     jlong carry2  = (jlong)(tmp3 >>> 64);
  //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
  //     carry  = (jlong)(tmp4 >>> 64);
  //     z[kdx+idx+1] = (jlong)tmp3;
  //     z[kdx+idx] = (jlong)tmp4;
  //   }
  //   idx += 2;
  //   if (idx > 0) {
  //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
  //     z[kdx+idx] = (jlong)yz_idx1;
  //     carry  = (jlong)(yz_idx1 >>> 64);
  //   }
  //

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;

  // jdx = number of 4-word groups to process.
  lsrw(jdx, idx, 2);

  bind(L_third_loop);

  subsw(jdx, jdx, 1);
  br(Assembler::MI, L_third_loop_exit);
  subw(idx, idx, 4);

  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));

  ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));

  lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
  ror(yz_idx2, yz_idx2, 32);

  ldp(rscratch2, rscratch1, Address(tmp6, 0));

  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
  umulh(tmp4, product_hi, yz_idx1);

  ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
  ror(rscratch2, rscratch2, 32);

  mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
  umulh(carry2, product_hi, yz_idx2);

  // propagate sum of both multiplications into carry:tmp4:tmp3
  adds(tmp3, tmp3, carry);
  adc(tmp4, tmp4, zr);
  adds(tmp3, tmp3, rscratch1);
  adcs(tmp4, tmp4, tmp);
  adc(carry, carry2, zr);
  adds(tmp4, tmp4, rscratch2);
  adc(carry, carry, zr);

  ror(tmp3, tmp3, 32); // convert little-endian to big-endian
  ror(tmp4, tmp4, 32);
  stp(tmp4, tmp3, Address(tmp6, 0));

  b(L_third_loop);
  bind (L_third_loop_exit);

  // Handle the 0-3 remaining 32-bit words of the unrolled-by-4 loop.
  andw (idx, idx, 0x3);
  cbz(idx, L_post_third_loop_done);

  Label L_check_1;
  subsw(idx, idx, 2);
  br(Assembler::MI, L_check_1);

  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(yz_idx1, Address(rscratch1, 0));
  ror(yz_idx1, yz_idx1, 32);
  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
  umulh(tmp4, product_hi, yz_idx1);
  lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
  ldr(yz_idx2, Address(rscratch1, 0));
  ror(yz_idx2, yz_idx2, 32);

  add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);

  ror(tmp3, tmp3, 32);
  str(tmp3, Address(rscratch1, 0));

  bind (L_check_1);

  andw (idx, idx, 0x1);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_post_third_loop_done);
  ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
  umulh(carry2, tmp4, product_hi);
  ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  // NOTE(review): add2_with_carry is defined above with five parameters;
  // this call passes four — confirm against the declaration in the header
  // (default argument or extraction artifact?).
  add2_with_carry(carry2, tmp3, tmp4, carry);

  strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
  extr(carry, carry2, tmp3, 32);

  bind(L_post_third_loop_done);
}

/**
 * Code for BigInteger::multiplyToLen() intrinsic.
 *
 * r0: x
 * r1: xlen
 * r2: y
 * r3: ylen
 * r4:  z
 * r5: zlen
 * r10: tmp1
 * r11: tmp2
 * r12: tmp3
 * r13: tmp4
 * r14: tmp5
 * r15: tmp6
 * r16: tmp7
 *
 */
void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6, Register product_hi) {

  assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product = xlen;
  const Register x_xstart = zlen;  // reuse register

  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;
  //

  movw(idx, ylen);      // idx = ylen;
  movw(kdx, zlen);      // kdx = xlen+ylen;
  mov(carry, zr);       // carry = 0;

  Label L_done;

  movw(xstart, xlen);
  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_done);    // empty x: nothing to do

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);

  Label L_second_loop;
  cbzw(kdx, L_second_loop);

  // Store the final carry word(s) from the first loop into z.
  Label L_carry;
  subw(kdx, kdx, 1);
  cbzw(kdx, L_carry);

  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
  lsr(carry, carry, 32);
  subw(kdx, kdx, 1);

  bind(L_carry);
  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));

  // Second and third (nested) loops.
  //
  // for (int i = xstart-1; i >= 0; i--) { // Second loop
  //   carry = 0;
  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                    (z[k] & LONG_MASK) + carry;
  //     z[k] = (int)product;
  //     carry = product >>> 32;
  //   }
  //   z[i] = (int)carry;
  // }
  //
  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi

  const Register jdx = tmp1;

  bind(L_second_loop);
  mov(carry, zr);                // carry = 0;
  movw(jdx, ylen);               // j = ystart+1

  subsw(xstart, xstart, 1);      // i = xstart-1;
  br(Assembler::MI, L_done);

  // Spill z; the register is reused as a cursor below.
  str(z, Address(pre(sp, -4 * wordSize)));

  Label L_last_x;
  lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
  subsw(xstart, xstart, 1);       // i = xstart-1;
  br(Assembler::MI, L_last_x);

  lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
  ldr(product_hi, Address(rscratch1));
  ror(product_hi, product_hi, 32);  // convert big-endian to little-endian

  Label L_third_loop_prologue;
  bind(L_third_loop_prologue);

  // Spill ylen, x and xstart around the inner loop, which clobbers them.
  str(ylen, Address(sp, wordSize));
  stp(x, xstart, Address(sp, 2 * wordSize));
  multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
                          tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
  ldp(z, ylen, Address(post(sp, 2 * wordSize)));
  ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen

  addw(tmp3, xlen, 1);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  subsw(tmp3, tmp3, 1);
  br(Assembler::MI, L_done);

  lsr(carry, carry, 32);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  b(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  ldrw(product_hi, Address(x, 0));
  b(L_third_loop_prologue);

  bind(L_done);
}

// Code for BigInteger::mulAdd intrinsic
// out     = r0
// in      = r1
// offset  = r2  (already out.length-offset)
// len     = r3
// k       = r4
//
// pseudo code from java implementation:
// carry = 0;
// offset = out.length-offset - 1;
// for (int j=len-1; j >= 0; j--) {
//     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
//     out[offset--] = (int)product;
//     carry = product >>> 32;
// }
// return (int)carry;
void MacroAssembler::mul_add(Register out, Register in, Register offset,
                             Register len, Register k) {
    Label LOOP, END;
    // pre-loop
    cmp(len, zr); // cmp, not cbz/cbnz: to use condition twice => less branches
    csel(out, zr, out, Assembler::EQ);  // len == 0: return 0
    br(Assembler::EQ, END);
    add(in, in, len, LSL, 2); // in[j+1] address
    add(offset, out, offset, LSL, 2); // out[offset + 1] address
    mov(out, zr); // used to keep carry now
    BIND(LOOP);
    ldrw(rscratch1, Address(pre(in, -4)));
    madd(rscratch1, rscratch1, k, out);         // in[j]*k + carry
    ldrw(rscratch2, Address(pre(offset, -4)));
    add(rscratch1, rscratch1, rscratch2);       // + out[offset]
    strw(rscratch1, Address(offset));
    lsr(out, rscratch1, 32);                    // carry = product >>> 32
    subs(len, len, 1);
    br(Assembler::NE, LOOP);
    BIND(END);
}

/**
 * Emits code to update CRC-32 with a byte value according to constants in table
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]val       Register containing the byte to fold into the CRC.
 * @param [in]table     Register containing the table of crc constants.
 *
 * uint32_t crc;
 *   val = crc_table[(val ^ crc) & 0xFF];
 *   crc = val ^ (crc >> 8);
 *
 */
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  eor(val, val, crc);
  andr(val, val, 0xff);
  ldrw(val, Address(table, val, Address::lsl(2)));
  eor(crc, val, crc, Assembler::LSR, 8);
}

/**
 * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]v         Register containing the 32-bit to fold into the CRC.
 * @param [in]table0    Register containing table 0 of crc constants.
 * @param [in]table1    Register containing table 1 of crc constants.
 * @param [in]table2    Register containing table 2 of crc constants.
 * @param [in]table3    Register containing table 3 of crc constants.
 *
 * uint32_t crc;
 *   v = crc ^ v
 *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
 *
 */
void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
        Register table0, Register table1, Register table2, Register table3,
        bool upper) {
  // When processing the upper word of a 64-bit value, shift it down first.
  eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
  uxtb(tmp, v);
  ldrw(crc, Address(table3, tmp, Address::lsl(2)));
  ubfx(tmp, v, 8, 8);
  ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
  ubfx(tmp, v, 16, 8);
  ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
  ubfx(tmp, v, 24, 8);
  ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
}

// CRC-32 of [buf, buf+len) using the hardware crc32x/w/b instructions.
// The crc is bit-inverted on entry and exit, per the CRC-32 convention.
void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
        Register len, Register tmp0, Register tmp1, Register tmp2,
        Register tmp3) {
    Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
    assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);

    mvnw(crc, crc);

    subs(len, len, 128);
    br(Assembler::GE, CRC_by64_pre);
  BIND(CRC_less64);
    adds(len, len, 128-32);
    br(Assembler::GE, CRC_by32_loop);
  BIND(CRC_less32);
    adds(len, len, 32-4);
    br(Assembler::GE, CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, CRC_by1_loop);
    b(L_exit);

  BIND(CRC_by32_loop);
    ldp(tmp0, tmp1, Address(post(buf, 16)));
    subs(len, len, 32);
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(post(buf, 8)));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(post(buf, 8)));
    crc32x(crc, crc, tmp2);
    crc32x(crc, crc, tmp3);
    br(Assembler::GE, CRC_by32_loop);
    cmn(len, 32);
    br(Assembler::NE, CRC_less32);
    b(L_exit);

  BIND(CRC_by4_loop);
    ldrw(tmp0, Address(post(buf, 4)));
    subs(len, len, 4);
    crc32w(crc, crc, tmp0);
    br(Assembler::GE, CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::LE, L_exit);
  BIND(CRC_by1_loop);
    ldrb(tmp0, Address(post(buf, 1)));
    subs(len, len, 1);
    crc32b(crc, crc, tmp0);
    br(Assembler::GT, CRC_by1_loop);
    b(L_exit);

  BIND(CRC_by64_pre);
    // Software-pipelined prologue: loads are issued one iteration ahead
    // of the crc32x instructions that consume them.
    sub(buf, buf, 8);
    ldp(tmp0, tmp1, Address(buf, 8));
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 24));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(buf, 32));
    crc32x(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 40));
    crc32x(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 48));
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 56));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(pre(buf, 64)));

    b(CRC_by64_loop);

    align(CodeEntryAlignment);
  BIND(CRC_by64_loop);
    subs(len, len, 64);
    crc32x(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 8));
    crc32x(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 16));
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 24));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(buf, 32));
    crc32x(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 40));
    crc32x(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 48));
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 56));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(pre(buf, 64)));
    br(Assembler::GE, CRC_by64_loop);

    // post-loop
    crc32x(crc, crc, tmp2);
    crc32x(crc, crc, tmp3);

    sub(len, len, 64);
    add(buf, buf, 8);
    cmn(len, 128);
    br(Assembler::NE, CRC_less64);
  BIND(L_exit);
    mvnw(crc, crc);
}

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register that will contain address of CRC table
 * @param tmp   scratch register
 */
void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
        Register table0, Register table1, Register table2, Register table3,
        Register tmp, Register tmp2, Register tmp3) {
  Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
  unsigned long offset;

  // Prefer the dedicated CRC32 instructions when available.
  if (UseCRC32) {
      kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2,
 table3);
      return;
  }

  mvnw(crc, crc);

  adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
  if (offset) add(table0, table0, offset);
  add(table1, table0, 1*256*sizeof(juint));
  add(table2, table0, 2*256*sizeof(juint));
  add(table3, table0, 3*256*sizeof(juint));

  if (UseNeon) {
      cmp(len, (u1)64);
      br(Assembler::LT, L_by16);
      eor(v16, T16B, v16, v16);

    Label L_fold;

      add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants

      ld1(v0, v1, T2D, post(buf, 32));
      ld1r(v4, T2D, post(tmp, 8));
      ld1r(v5, T2D, post(tmp, 8));
      ld1r(v6, T2D, post(tmp, 8));
      ld1r(v7, T2D, post(tmp, 8));
      mov(v16, T4S, 0, crc);

      eor(v0, T16B, v0, v16);
      sub(len, len, 64);

    BIND(L_fold);
      // Carry-less multiply v0 by the folding constants (low halves).
      pmull(v22, T8H, v0, v5, T8B);
      pmull(v20, T8H, v0, v7, T8B);
      pmull(v23, T8H, v0, v4, T8B);
      pmull(v21, T8H, v0, v6, T8B);

      // ... and the high halves.
      pmull2(v18, T8H, v0, v5, T16B);
      pmull2(v16, T8H, v0, v7, T16B);
      pmull2(v19, T8H, v0, v4, T16B);
      pmull2(v17, T8H, v0, v6, T16B);

      uzp1(v24, T8H, v20, v22);
      uzp2(v25, T8H, v20, v22);
      eor(v20, T16B, v24, v25);

      uzp1(v26, T8H, v16, v18);
      uzp2(v27, T8H, v16, v18);
      eor(v16, T16B, v26, v27);

      ushll2(v22, T4S, v20, T8H, 8);
      ushll(v20, T4S, v20, T4H, 8);

      ushll2(v18, T4S, v16, T8H, 8);
      ushll(v16, T4S, v16, T4H, 8);

      eor(v22, T16B, v23, v22);
      eor(v18, T16B, v19, v18);
      eor(v20, T16B, v21, v20);
      eor(v16, T16B, v17, v16);

      uzp1(v17, T2D, v16, v20);
      uzp2(v21, T2D, v16, v20);
      eor(v17, T16B, v17, v21);

      ushll2(v20, T2D, v17, T4S, 16);
      ushll(v16, T2D, v17, T2S, 16);

      eor(v20, T16B, v20, v22);
      eor(v16, T16B, v16, v18);

      uzp1(v17, T2D, v20, v16);
      uzp2(v21, T2D, v20, v16);
      eor(v28, T16B, v17, v21);

      // Same folding sequence for the second 16-byte lane (v1).
      pmull(v22, T8H, v1, v5, T8B);
      pmull(v20, T8H, v1, v7, T8B);
      pmull(v23, T8H, v1, v4, T8B);
      pmull(v21, T8H, v1, v6, T8B);

      pmull2(v18, T8H, v1, v5, T16B);
      pmull2(v16, T8H, v1, v7, T16B);
      pmull2(v19, T8H, v1, v4, T16B);
      pmull2(v17, T8H, v1, v6, T16B);

      ld1(v0, v1, T2D, post(buf, 32));

      uzp1(v24, T8H, v20, v22);
      uzp2(v25, T8H, v20, v22);
      eor(v20, T16B, v24, v25);

      uzp1(v26, T8H, v16, v18);
      uzp2(v27, T8H, v16, v18);
      eor(v16, T16B, v26, v27);

      ushll2(v22, T4S, v20, T8H, 8);
      ushll(v20, T4S, v20, T4H, 8);

      ushll2(v18, T4S, v16, T8H, 8);
      ushll(v16, T4S, v16, T4H, 8);

      eor(v22, T16B, v23, v22);
      eor(v18, T16B, v19, v18);
      eor(v20, T16B, v21, v20);
      eor(v16, T16B, v17, v16);

      uzp1(v17, T2D, v16, v20);
      uzp2(v21, T2D, v16, v20);
      eor(v16, T16B, v17, v21);

      ushll2(v20, T2D, v16, T4S, 16);
      ushll(v16, T2D, v16, T2S, 16);

      eor(v20, T16B, v22, v20);
      eor(v16, T16B, v16, v18);

      uzp1(v17, T2D, v20, v16);
      uzp2(v21, T2D, v20, v16);
      eor(v20, T16B, v17, v21);

      shl(v16, T2D, v28, 1);
      shl(v17, T2D, v20, 1);

      // XOR the folded results into the freshly-loaded input.
      eor(v0, T16B, v0, v16);
      eor(v1, T16B, v1, v17);

      subs(len, len, 32);
      br(Assembler::GE, L_fold);

      // Reduce the remaining 256 bits through the table-driven word update.
      mov(crc, 0);
      mov(tmp, v0, T1D, 0);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
      mov(tmp, v0, T1D, 1);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
      mov(tmp, v1, T1D, 0);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
      mov(tmp, v1, T1D, 1);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
      update_word_crc32(crc, tmp,
 tmp2, table0, table1, table2, table3, true);

      add(len, len, 32);
  }

  BIND(L_by16);
    subs(len, len, 16);
    br(Assembler::GE, L_by16_loop);
    adds(len, len, 16-4);
    br(Assembler::GE, L_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, L_by1_loop);
    b(L_exit);

  BIND(L_by4_loop);
    ldrw(tmp, Address(post(buf, 4)));
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
    subs(len, len, 4);
    br(Assembler::GE, L_by4_loop);
    adds(len, len, 4);
    br(Assembler::LE, L_exit);
  BIND(L_by1_loop);
    subs(len, len, 1);
    ldrb(tmp, Address(post(buf, 1)));
    update_byte_crc32(crc, tmp, table0);
    br(Assembler::GT, L_by1_loop);
    b(L_exit);

    align(CodeEntryAlignment);
  BIND(L_by16_loop);
    subs(len, len, 16);
    ldp(tmp, tmp3, Address(post(buf, 16)));
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
    update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
    br(Assembler::GE, L_by16_loop);
    adds(len, len, 16-4);
    br(Assembler::GE, L_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, L_by1_loop);
  BIND(L_exit);
    mvnw(crc, crc);
}

// CRC-32C of [buf, buf+len) using the hardware crc32cx/cw/cb instructions.
// Unlike kernel_crc32_using_crc32 there is no bit-inversion of crc here.
void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
        Register len, Register tmp0, Register tmp1, Register tmp2,
        Register tmp3) {
    Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
    assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);

    subs(len, len, 128);
    br(Assembler::GE, CRC_by64_pre);
  BIND(CRC_less64);
    adds(len, len, 128-32);
    br(Assembler::GE, CRC_by32_loop);
  BIND(CRC_less32);
    adds(len, len, 32-4);
    br(Assembler::GE, CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, CRC_by1_loop);
    b(L_exit);

  BIND(CRC_by32_loop);
    ldp(tmp0, tmp1, Address(post(buf, 16)));
    subs(len, len, 32);
    crc32cx(crc, crc, tmp0);
    ldr(tmp2, Address(post(buf, 8)));
    crc32cx(crc, crc, tmp1);
    ldr(tmp3, Address(post(buf, 8)));
    crc32cx(crc, crc, tmp2);
    crc32cx(crc, crc, tmp3);
    br(Assembler::GE, CRC_by32_loop);
    cmn(len, 32);
    br(Assembler::NE, CRC_less32);
    b(L_exit);

  BIND(CRC_by4_loop);
    ldrw(tmp0, Address(post(buf, 4)));
    subs(len, len, 4);
    crc32cw(crc, crc, tmp0);
    br(Assembler::GE, CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::LE, L_exit);
  BIND(CRC_by1_loop);
    ldrb(tmp0, Address(post(buf, 1)));
    subs(len, len, 1);
    crc32cb(crc, crc, tmp0);
    br(Assembler::GT, CRC_by1_loop);
    b(L_exit);

  BIND(CRC_by64_pre);
    // Software-pipelined prologue: loads run one iteration ahead of use.
    sub(buf, buf, 8);
    ldp(tmp0, tmp1, Address(buf, 8));
    crc32cx(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 24));
    crc32cx(crc, crc, tmp1);
    ldr(tmp3, Address(buf, 32));
    crc32cx(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 40));
    crc32cx(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 48));
    crc32cx(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 56));
    crc32cx(crc, crc, tmp1);
    ldr(tmp3, Address(pre(buf, 64)));

    b(CRC_by64_loop);

    align(CodeEntryAlignment);
  BIND(CRC_by64_loop);
    subs(len, len, 64);
    crc32cx(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 8));
    crc32cx(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 16));
    crc32cx(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 24));
    crc32cx(crc, crc, tmp1);
    ldr(tmp3, Address(buf, 32));
    crc32cx(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 40));
    crc32cx(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 48));
    crc32cx(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 56));
    crc32cx(crc, crc, tmp1);
    ldr(tmp3, Address(pre(buf, 64)));
    br(Assembler::GE, CRC_by64_loop);

    // post-loop
    crc32cx(crc, crc, tmp2);
    crc32cx(crc, crc, tmp3);

    sub(len, len, 64);
    add(buf, buf, 8);
    cmn(len, 128);
    br(Assembler::NE, CRC_less64);
  BIND(L_exit);
}

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register that will contain address of CRC table
 * @param tmp   scratch register
 */
void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
        Register table0, Register table1, Register table2, Register table3,
        Register tmp, Register tmp2, Register tmp3) {
  kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
}


// RAII helper: emits a test of *flag_addr on construction and skips the
// code emitted between construction and destruction when the flag is zero.
SkipIfEqual::SkipIfEqual(
    MacroAssembler* masm, const bool* flag_addr, bool value) {
  _masm = masm;
  unsigned long offset;
  _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
  _masm->ldrb(rscratch1, Address(rscratch1, offset));
  _masm->cbzw(rscratch1, _label);
}

SkipIfEqual::~SkipIfEqual() {
  _masm->bind(_label);
}

// Add src to the word in memory addressed by dst (read-modify-write via
// rscratch1/rscratch2).
void MacroAssembler::addptr(const Address &dst, int32_t src) {
  Address adr;
  switch(dst.getMode()) {
    case Address::base_plus_offset:
      // This is the expected mode, although we allow all the other
      // forms below.
      adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
      break;
    default:
      lea(rscratch2, dst);
      adr = Address(rscratch2);
      break;
  }
  ldr(rscratch1, adr);
  add(rscratch1, rscratch1, src);
  str(rscratch1, adr);
}

// Compare src1 against the word stored at the (pc-relative) address src2.
void MacroAssembler::cmpptr(Register src1, Address src2) {
  unsigned long offset;
  adrp(rscratch1, src2, offset);
  ldr(rscratch1, Address(rscratch1, offset));
  cmp(src1, rscratch1);
}

// Compare two oops; delegated to the GC barrier set so collectors can
// apply any required resolution before the comparison.
void MacroAssembler::cmpoop(Register obj1, Register obj2) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->obj_equals(this, obj1, obj2);
}

// Load the Klass* of src into dst, decompressing it if compressed class
// pointers are in use.
void MacroAssembler::load_klass(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
    decode_klass_not_null(dst);
  } else {
    ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
  }
}

// ((OopHandle)result).resolve();
void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
  // OopHandle::resolve is an indirection.
  access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
}

// Load the java mirror of the method's holder class into dst:
// method -> ConstMethod -> ConstantPool -> pool holder Klass -> mirror.
void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
  const int mirror_offset = in_bytes(Klass::java_mirror_offset());
  // NOTE(review): this loads from rmethod and never uses the `method`
  // parameter — confirm all callers pass the method in rmethod.
  ldr(dst, Address(rmethod, Method::const_offset()));
  ldr(dst, Address(dst, ConstMethod::constants_offset()));
  ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
  ldr(dst, Address(dst, mirror_offset));
  resolve_oop_handle(dst, tmp);
}

// Compare trial_klass with the klass of oop, handling the compressed
// class pointer encodings without always materializing the full Klass*.
void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
  if (UseCompressedClassPointers) {
    ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
    if (Universe::narrow_klass_base() == NULL) {
      // Zero base: compare trial_klass against the shifted narrow klass.
      cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift());
      return;
    } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
               && Universe::narrow_klass_shift() == 0) {
      // Only the bottom 32 bits matter
      cmpw(trial_klass, tmp);
      return;
    }
    decode_klass_not_null(tmp);
  } else {
    ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
  }
  cmp(trial_klass, tmp);
}

// Load the biased-locking prototype header of src's klass into dst.
void MacroAssembler::load_prototype_header(Register dst, Register src) {
  load_klass(dst, src);
  ldr(dst, Address(dst, Klass::prototype_header_offset()));
}

void MacroAssembler::store_klass(Register dst, Register src) {
  // FIXME: Should this be a store release?  concurrent gcs assumes
  // klass length is valid if klass field is not null.
3712 if (UseCompressedClassPointers) { 3713 encode_klass_not_null(src); 3714 strw(src, Address(dst, oopDesc::klass_offset_in_bytes())); 3715 } else { 3716 str(src, Address(dst, oopDesc::klass_offset_in_bytes())); 3717 } 3718 } 3719 3720 void MacroAssembler::store_klass_gap(Register dst, Register src) { 3721 if (UseCompressedClassPointers) { 3722 // Store to klass gap in destination 3723 strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes())); 3724 } 3725 } 3726 3727 // Algorithm must match CompressedOops::encode. 3728 void MacroAssembler::encode_heap_oop(Register d, Register s) { 3729 #ifdef ASSERT 3730 verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?"); 3731 #endif 3732 verify_oop(s, "broken oop in encode_heap_oop"); 3733 if (Universe::narrow_oop_base() == NULL) { 3734 if (Universe::narrow_oop_shift() != 0) { 3735 assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3736 lsr(d, s, LogMinObjAlignmentInBytes); 3737 } else { 3738 mov(d, s); 3739 } 3740 } else { 3741 subs(d, s, rheapbase); 3742 csel(d, d, zr, Assembler::HS); 3743 lsr(d, d, LogMinObjAlignmentInBytes); 3744 3745 /* Old algorithm: is this any worse? 
3746 Label nonnull; 3747 cbnz(r, nonnull); 3748 sub(r, r, rheapbase); 3749 bind(nonnull); 3750 lsr(r, r, LogMinObjAlignmentInBytes); 3751 */ 3752 } 3753 } 3754 3755 void MacroAssembler::encode_heap_oop_not_null(Register r) { 3756 #ifdef ASSERT 3757 verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?"); 3758 if (CheckCompressedOops) { 3759 Label ok; 3760 cbnz(r, ok); 3761 stop("null oop passed to encode_heap_oop_not_null"); 3762 bind(ok); 3763 } 3764 #endif 3765 verify_oop(r, "broken oop in encode_heap_oop_not_null"); 3766 if (Universe::narrow_oop_base() != NULL) { 3767 sub(r, r, rheapbase); 3768 } 3769 if (Universe::narrow_oop_shift() != 0) { 3770 assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3771 lsr(r, r, LogMinObjAlignmentInBytes); 3772 } 3773 } 3774 3775 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) { 3776 #ifdef ASSERT 3777 verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?"); 3778 if (CheckCompressedOops) { 3779 Label ok; 3780 cbnz(src, ok); 3781 stop("null oop passed to encode_heap_oop_not_null2"); 3782 bind(ok); 3783 } 3784 #endif 3785 verify_oop(src, "broken oop in encode_heap_oop_not_null2"); 3786 3787 Register data = src; 3788 if (Universe::narrow_oop_base() != NULL) { 3789 sub(dst, src, rheapbase); 3790 data = dst; 3791 } 3792 if (Universe::narrow_oop_shift() != 0) { 3793 assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3794 lsr(dst, data, LogMinObjAlignmentInBytes); 3795 data = dst; 3796 } 3797 if (data == src) 3798 mov(dst, src); 3799 } 3800 3801 void MacroAssembler::decode_heap_oop(Register d, Register s) { 3802 #ifdef ASSERT 3803 verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?"); 3804 #endif 3805 if (Universe::narrow_oop_base() == NULL) { 3806 if (Universe::narrow_oop_shift() != 0 || d != s) { 3807 lsl(d, s, Universe::narrow_oop_shift()); 3808 } 3809 } else { 
3810 Label done; 3811 if (d != s) 3812 mov(d, s); 3813 cbz(s, done); 3814 add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes); 3815 bind(done); 3816 } 3817 verify_oop(d, "broken oop in decode_heap_oop"); 3818 } 3819 3820 void MacroAssembler::decode_heap_oop_not_null(Register r) { 3821 assert (UseCompressedOops, "should only be used for compressed headers"); 3822 assert (Universe::heap() != NULL, "java heap should be initialized"); 3823 // Cannot assert, unverified entry point counts instructions (see .ad file) 3824 // vtableStubs also counts instructions in pd_code_size_limit. 3825 // Also do not verify_oop as this is called by verify_oop. 3826 if (Universe::narrow_oop_shift() != 0) { 3827 assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3828 if (Universe::narrow_oop_base() != NULL) { 3829 add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes); 3830 } else { 3831 add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes); 3832 } 3833 } else { 3834 assert (Universe::narrow_oop_base() == NULL, "sanity"); 3835 } 3836 } 3837 3838 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) { 3839 assert (UseCompressedOops, "should only be used for compressed headers"); 3840 assert (Universe::heap() != NULL, "java heap should be initialized"); 3841 // Cannot assert, unverified entry point counts instructions (see .ad file) 3842 // vtableStubs also counts instructions in pd_code_size_limit. 3843 // Also do not verify_oop as this is called by verify_oop. 
3844 if (Universe::narrow_oop_shift() != 0) { 3845 assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3846 if (Universe::narrow_oop_base() != NULL) { 3847 add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes); 3848 } else { 3849 add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes); 3850 } 3851 } else { 3852 assert (Universe::narrow_oop_base() == NULL, "sanity"); 3853 if (dst != src) { 3854 mov(dst, src); 3855 } 3856 } 3857 } 3858 3859 void MacroAssembler::encode_klass_not_null(Register dst, Register src) { 3860 if (Universe::narrow_klass_base() == NULL) { 3861 if (Universe::narrow_klass_shift() != 0) { 3862 assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); 3863 lsr(dst, src, LogKlassAlignmentInBytes); 3864 } else { 3865 if (dst != src) mov(dst, src); 3866 } 3867 return; 3868 } 3869 3870 if (use_XOR_for_compressed_class_base) { 3871 if (Universe::narrow_klass_shift() != 0) { 3872 eor(dst, src, (uint64_t)Universe::narrow_klass_base()); 3873 lsr(dst, dst, LogKlassAlignmentInBytes); 3874 } else { 3875 eor(dst, src, (uint64_t)Universe::narrow_klass_base()); 3876 } 3877 return; 3878 } 3879 3880 if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0 3881 && Universe::narrow_klass_shift() == 0) { 3882 movw(dst, src); 3883 return; 3884 } 3885 3886 #ifdef ASSERT 3887 verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?"); 3888 #endif 3889 3890 Register rbase = dst; 3891 if (dst == src) rbase = rheapbase; 3892 mov(rbase, (uint64_t)Universe::narrow_klass_base()); 3893 sub(dst, src, rbase); 3894 if (Universe::narrow_klass_shift() != 0) { 3895 assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); 3896 lsr(dst, dst, LogKlassAlignmentInBytes); 3897 } 3898 if (dst == src) reinit_heapbase(); 3899 } 3900 3901 void MacroAssembler::encode_klass_not_null(Register r) { 3902 encode_klass_not_null(r, r); 3903 } 3904 3905 void 
MacroAssembler::decode_klass_not_null(Register dst, Register src) { 3906 Register rbase = dst; 3907 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 3908 3909 if (Universe::narrow_klass_base() == NULL) { 3910 if (Universe::narrow_klass_shift() != 0) { 3911 assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); 3912 lsl(dst, src, LogKlassAlignmentInBytes); 3913 } else { 3914 if (dst != src) mov(dst, src); 3915 } 3916 return; 3917 } 3918 3919 if (use_XOR_for_compressed_class_base) { 3920 if (Universe::narrow_klass_shift() != 0) { 3921 lsl(dst, src, LogKlassAlignmentInBytes); 3922 eor(dst, dst, (uint64_t)Universe::narrow_klass_base()); 3923 } else { 3924 eor(dst, src, (uint64_t)Universe::narrow_klass_base()); 3925 } 3926 return; 3927 } 3928 3929 if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0 3930 && Universe::narrow_klass_shift() == 0) { 3931 if (dst != src) 3932 movw(dst, src); 3933 movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32); 3934 return; 3935 } 3936 3937 // Cannot assert, unverified entry point counts instructions (see .ad file) 3938 // vtableStubs also counts instructions in pd_code_size_limit. 3939 // Also do not verify_oop as this is called by verify_oop. 
3940 if (dst == src) rbase = rheapbase; 3941 mov(rbase, (uint64_t)Universe::narrow_klass_base()); 3942 if (Universe::narrow_klass_shift() != 0) { 3943 assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); 3944 add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes); 3945 } else { 3946 add(dst, rbase, src); 3947 } 3948 if (dst == src) reinit_heapbase(); 3949 } 3950 3951 void MacroAssembler::decode_klass_not_null(Register r) { 3952 decode_klass_not_null(r, r); 3953 } 3954 3955 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) { 3956 #ifdef ASSERT 3957 { 3958 ThreadInVMfromUnknown tiv; 3959 assert (UseCompressedOops, "should only be used for compressed oops"); 3960 assert (Universe::heap() != NULL, "java heap should be initialized"); 3961 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 3962 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop"); 3963 } 3964 #endif 3965 int oop_index = oop_recorder()->find_index(obj); 3966 InstructionMark im(this); 3967 RelocationHolder rspec = oop_Relocation::spec(oop_index); 3968 code_section()->relocate(inst_mark(), rspec); 3969 movz(dst, 0xDEAD, 16); 3970 movk(dst, 0xBEEF); 3971 } 3972 3973 void MacroAssembler::set_narrow_klass(Register dst, Klass* k) { 3974 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 3975 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 3976 int index = oop_recorder()->find_index(k); 3977 assert(! 
Universe::heap()->is_in_reserved(k), "should not be an oop"); 3978 3979 InstructionMark im(this); 3980 RelocationHolder rspec = metadata_Relocation::spec(index); 3981 code_section()->relocate(inst_mark(), rspec); 3982 narrowKlass nk = Klass::encode_klass(k); 3983 movz(dst, (nk >> 16), 16); 3984 movk(dst, nk & 0xffff); 3985 } 3986 3987 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, 3988 Register dst, Address src, 3989 Register tmp1, Register thread_tmp) { 3990 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3991 decorators = AccessInternal::decorator_fixup(decorators); 3992 bool as_raw = (decorators & AS_RAW) != 0; 3993 if (as_raw) { 3994 bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp); 3995 } else { 3996 bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp); 3997 } 3998 } 3999 4000 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, 4001 Address dst, Register src, 4002 Register tmp1, Register thread_tmp) { 4003 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 4004 decorators = AccessInternal::decorator_fixup(decorators); 4005 bool as_raw = (decorators & AS_RAW) != 0; 4006 if (as_raw) { 4007 bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp); 4008 } else { 4009 bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp); 4010 } 4011 } 4012 4013 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) { 4014 // Use stronger ACCESS_WRITE|ACCESS_READ by default. 
4015 if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) { 4016 decorators |= ACCESS_READ | ACCESS_WRITE; 4017 } 4018 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 4019 return bs->resolve(this, decorators, obj); 4020 } 4021 4022 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1, 4023 Register thread_tmp, DecoratorSet decorators) { 4024 access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp); 4025 } 4026 4027 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1, 4028 Register thread_tmp, DecoratorSet decorators) { 4029 access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp); 4030 } 4031 4032 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1, 4033 Register thread_tmp, DecoratorSet decorators) { 4034 access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp); 4035 } 4036 4037 // Used for storing NULLs. 4038 void MacroAssembler::store_heap_oop_null(Address dst) { 4039 access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg); 4040 } 4041 4042 Address MacroAssembler::allocate_metadata_address(Metadata* obj) { 4043 assert(oop_recorder() != NULL, "this assembler needs a Recorder"); 4044 int index = oop_recorder()->allocate_metadata_index(obj); 4045 RelocationHolder rspec = metadata_Relocation::spec(index); 4046 return Address((address)obj, rspec); 4047 } 4048 4049 // Move an oop into a register. immediate is true if we want 4050 // immediate instrcutions, i.e. we are not going to patch this 4051 // instruction while the code is being executed by another thread. In 4052 // that case we can use move immediates rather than the constant pool. 
// Move the oop obj into dst, recording an oop relocation.  When
// immediate is false the oop is loaded from a constant-pool slot so it
// can be patched while another thread executes the code.
void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
  int oop_index;
  if (obj == NULL) {
    oop_index = oop_recorder()->allocate_oop_index(obj);
  } else {
#ifdef ASSERT
    {
      ThreadInVMfromUnknown tiv;
      assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
    }
#endif
    oop_index = oop_recorder()->find_index(obj);
  }
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  if (! immediate) {
    address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
    ldr_constant(dst, Address(dummy, rspec));
  } else
    mov(dst, Address((address)obj, rspec));
}

// Move a metadata address into a register.
void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  int oop_index;
  if (obj == NULL) {
    oop_index = oop_recorder()->allocate_metadata_index(obj);
  } else {
    oop_index = oop_recorder()->find_index(obj);
  }
  RelocationHolder rspec = metadata_Relocation::spec(oop_index);
  mov(dst, Address((address)obj, rspec));
}

// Build a relocatable Address for an existing oop constant.
Address MacroAssembler::constant_oop_address(jobject obj) {
#ifdef ASSERT
  {
    ThreadInVMfromUnknown tiv;
    assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
    assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
  }
#endif
  int oop_index = oop_recorder()->find_index(obj);
  return Address((address)obj, oop_Relocation::spec(oop_index));
}

// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
// TLAB allocation: delegated to the barrier set assembler.
// Defines obj; jumps to slow_case on failure.
void MacroAssembler::tlab_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Register t2,
                                   Label& slow_case) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
}

// Defines obj, preserves var_size_in_bytes
void MacroAssembler::eden_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Label& slow_case) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
}

// Zero words; len is in bytes
// Destroys all registers except addr
// len must be a nonzero multiple of wordSize
void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
  assert_different_registers(addr, len, t1, rscratch1, rscratch2);

#ifdef ASSERT
  { Label L;
    tst(len, BytesPerWord - 1);
    br(Assembler::EQ, L);
    stop("len is not a multiple of BytesPerWord");
    bind(L);
  }
#endif

#ifndef PRODUCT
  block_comment("zero memory");
#endif

  Label loop;
  Label entry;

  // Algorithm (an unrolled store loop entered part-way through,
  // Duff's-device style, to handle the cnt % unroll remainder first):
  //
  //  scratch1 = cnt & 7;
  //  cnt -= scratch1;
  //  p += scratch1;
  //  switch (scratch1) {
  //    do {
  //      cnt -= 8;
  //        p[-8] = 0;
  //      case 7:
  //        p[-7] = 0;
  //      case 6:
  //        p[-6] = 0;
  //        // ...
  //      case 1:
  //        p[-1] = 0;
  //      case 0:
  //        p += 8;
  //     } while (cnt);
  //  }

  const int unroll = 8; // Number of str(zr) instructions we'll unroll

  lsr(len, len, LogBytesPerWord);    // len is now a count in words
  andr(rscratch1, len, unroll - 1);  // tmp1 = cnt % unroll
  sub(len, len, rscratch1);          // cnt -= (cnt % unroll); now a multiple of unroll
  // t1 always points to the end of the region we're about to zero
  add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
  // Computed branch into the unrolled loop: back up rscratch1
  // instructions (4 bytes each) from 'entry' so exactly cnt % unroll
  // stores execute on the first pass.
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
  br(rscratch2);
  bind(loop);
  sub(len, len, unroll);
  for (int i = -unroll; i < 0; i++)
    Assembler::str(zr, Address(t1, i * wordSize));
  bind(entry);
  add(t1, t1, unroll * wordSize);
  cbnz(len, loop);
}

// Debug-only sanity check of the current thread's TLAB:
// asserts start <= top <= end.
void MacroAssembler::verify_tlab() {
#ifdef ASSERT
  if (UseTLAB && VerifyOops) {
    Label next, ok;

    // Preserve the scratch registers across the check.
    stp(rscratch2, rscratch1, Address(pre(sp, -16)));

    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
    cmp(rscratch2, rscratch1);
    br(Assembler::HS, next);
    STOP("assert(top >= start)");
    should_not_reach_here();

    bind(next);
    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
    cmp(rscratch2, rscratch1);
    br(Assembler::HS, ok);
    STOP("assert(top <= end)");
    should_not_reach_here();

    bind(ok);
    ldp(rscratch2, rscratch1, Address(post(sp, 16)));
  }
#endif
}

// Writes to stack successive pages until offset reached to check for
// stack overflow + shadow pages. This clobbers tmp.
void MacroAssembler::bang_stack_size(Register size, Register tmp) {
  assert_different_registers(tmp, size, rscratch1);
  mov(tmp, sp);
  // Bang stack for total size given plus shadow page size.
  // Bang one page at a time because large size can bang beyond yellow and
  // red zones.
  Label loop;
  mov(rscratch1, os::vm_page_size());
  bind(loop);
  lea(tmp, Address(tmp, -os::vm_page_size()));
  subsw(size, size, rscratch1);
  str(size, Address(tmp));
  br(Assembler::GT, loop);

  // Bang down shadow pages too.
  // At this point, (tmp-0) is the last address touched, so don't
  // touch it again.  (It was touched as (tmp-pagesize) but then tmp
  // was post-decremented.)  Skip this address by starting at i=1, and
  // touch a few more pages below.  N.B.  It is important to touch all
  // the way down to and including i=StackShadowPages.
  for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
    // this could be any sized move but this can be a debugging crumb
    // so the bigger the better.
    lea(tmp, Address(tmp, -os::vm_page_size()));
    str(size, Address(tmp));
  }
}

// Move the address of the polling page into dest.
void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    // Thread-local polling: the page address lives in the thread struct.
    ldr(dest, Address(rthread, Thread::polling_page_offset()));
  } else {
    unsigned long off;
    adrp(dest, Address(page, rtype), off);
    assert(off == 0, "polling page must be page aligned");
  }
}

// Move the address of the polling page into r, then read the polling
// page.
address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
  get_polling_page(r, page, rtype);
  return read_polling_page(r, rtype);
}

// Read the polling page.  The address of the polling page must
// already be in r.  Returns the address of the emitted load so the
// caller can associate it with a safepoint.
address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
  InstructionMark im(this);
  code_section()->relocate(inst_mark(), rtype);
  // Load to zr: the value is discarded; only the (possible) fault matters.
  ldrw(zr, Address(r, 0));
  return inst_mark();
}

// Emit an adrp to the page of dest.target(), returning the remaining
// low 12 bits in byte_offset.  Falls back to adrp+movk when the target
// page may be out of adrp range after relocation.
void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
  // NOTE(review): rtype is computed but not used here -- confirm
  // whether it is needed (e.g. for a debug build) or vestigial.
  relocInfo::relocType rtype = dest.rspec().reloc()->type();
  unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
  unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
  unsigned long dest_page = (unsigned long)dest.target() >> 12;
  long offset_low = dest_page - low_page;
  long offset_high = dest_page - high_page;

  assert(is_valid_AArch64_address(dest.target()), "bad address");
  assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");

  InstructionMark im(this);
  code_section()->relocate(inst_mark(), dest.rspec());
  // 8143067: Ensure that the adrp can reach the dest from anywhere within
  // the code cache so that if it is relocated we know it will still reach
  if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
    _adrp(reg1, dest.target());
  } else {
    // Out of adrp range: synthesize the low 32 bits relative to pc and
    // patch in the high 32 bits with a movk.
    unsigned long target = (unsigned long)dest.target();
    unsigned long adrp_target
      = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);

    _adrp(reg1, (address)adrp_target);
    movk(reg1, target >> 32, 32);
  }
  byte_offset = (unsigned long)dest.target() & 0xfff;
}

// Load the card table's byte_map_base into reg.
void MacroAssembler::load_byte_map_base(Register reg) {
  jbyte *byte_map_base =
    ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();

  if (is_valid_AArch64_address((address)byte_map_base)) {
    // Strictly speaking the byte_map_base isn't an address at all,
    // and it might even be negative.
    unsigned long offset;
    adrp(reg, ExternalAddress((address)byte_map_base), offset);
    // We expect offset to be zero with most collectors.
    if (offset != 0) {
      add(reg, reg, offset);
    }
  } else {
    mov(reg, (uint64_t)byte_map_base);
  }
}

// Emit the frame prologue: allocate framesize bytes and save rfp/lr.
// Three forms depending on whether framesize fits the stp/sub
// immediate encodings.
void MacroAssembler::build_frame(int framesize) {
  assert(framesize > 0, "framesize must be > 0");
  if (framesize < ((1 << 9) + 2 * wordSize)) {
    sub(sp, sp, framesize);
    stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
    if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
  } else {
    stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
    if (PreserveFramePointer) mov(rfp, sp);
    if (framesize < ((1 << 12) + 2 * wordSize))
      sub(sp, sp, framesize - 2 * wordSize);
    else {
      // Immediate too large for sub: go through a scratch register.
      mov(rscratch1, framesize - 2 * wordSize);
      sub(sp, sp, rscratch1);
    }
  }
}

// Emit the frame epilogue: restore rfp/lr and release framesize bytes.
// Mirrors build_frame.
void MacroAssembler::remove_frame(int framesize) {
  assert(framesize > 0, "framesize must be > 0");
  if (framesize < ((1 << 9) + 2 * wordSize)) {
    ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
    add(sp, sp, framesize);
  } else {
    if (framesize < ((1 << 12) + 2 * wordSize))
      add(sp, sp, framesize - 2 * wordSize);
    else {
      mov(rscratch1, framesize - 2 * wordSize);
      add(sp, sp, rscratch1);
    }
    ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
  }
}

// Member-function pointer type for the single-character load emitters
// (ldrb/ldrh/ldrw/ldr) used by the string intrinsics below.
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

// Search for str1 in str2 and return index or -1
void MacroAssembler::string_indexof(Register str2, Register str1,
                                    Register cnt2, Register cnt1,
                                    Register tmp1, Register tmp2,
                                    Register tmp3, Register tmp4,
                                    Register tmp5, Register tmp6,
                                    int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1
= rscratch1; 4360 Register ch2 = rscratch2; 4361 Register cnt1tmp = tmp1; 4362 Register cnt2tmp = tmp2; 4363 Register cnt1_neg = cnt1; 4364 Register cnt2_neg = cnt2; 4365 Register result_tmp = tmp4; 4366 4367 bool isL = ae == StrIntrinsicNode::LL; 4368 4369 bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL; 4370 bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU; 4371 int str1_chr_shift = str1_isL ? 0:1; 4372 int str2_chr_shift = str2_isL ? 0:1; 4373 int str1_chr_size = str1_isL ? 1:2; 4374 int str2_chr_size = str2_isL ? 1:2; 4375 chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb : 4376 (chr_insn)&MacroAssembler::ldrh; 4377 chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb : 4378 (chr_insn)&MacroAssembler::ldrh; 4379 chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw; 4380 chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr; 4381 4382 // Note, inline_string_indexOf() generates checks: 4383 // if (substr.count > string.count) return -1; 4384 // if (substr.count == 0) return 0; 4385 4386 // We have two strings, a source string in str2, cnt2 and a pattern string 4387 // in str1, cnt1. Find the 1st occurence of pattern in source or return -1. 4388 4389 // For larger pattern and source we use a simplified Boyer Moore algorithm. 4390 // With a small pattern and source we use linear scan. 4391 4392 if (icnt1 == -1) { 4393 sub(result_tmp, cnt2, cnt1); 4394 cmp(cnt1, (u1)8); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256 4395 br(LT, LINEARSEARCH); 4396 dup(v0, T16B, cnt1); // done in separate FPU pipeline. 
Almost no penalty 4397 subs(zr, cnt1, 256); 4398 lsr(tmp1, cnt2, 2); 4399 ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM 4400 br(GE, LINEARSTUB); 4401 } 4402 4403 // The Boyer Moore alogorithm is based on the description here:- 4404 // 4405 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm 4406 // 4407 // This describes and algorithm with 2 shift rules. The 'Bad Character' rule 4408 // and the 'Good Suffix' rule. 4409 // 4410 // These rules are essentially heuristics for how far we can shift the 4411 // pattern along the search string. 4412 // 4413 // The implementation here uses the 'Bad Character' rule only because of the 4414 // complexity of initialisation for the 'Good Suffix' rule. 4415 // 4416 // This is also known as the Boyer-Moore-Horspool algorithm:- 4417 // 4418 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm 4419 // 4420 // This particular implementation has few java-specific optimizations. 4421 // 4422 // #define ASIZE 256 4423 // 4424 // int bm(unsigned char *x, int m, unsigned char *y, int n) { 4425 // int i, j; 4426 // unsigned c; 4427 // unsigned char bc[ASIZE]; 4428 // 4429 // /* Preprocessing */ 4430 // for (i = 0; i < ASIZE; ++i) 4431 // bc[i] = m; 4432 // for (i = 0; i < m - 1; ) { 4433 // c = x[i]; 4434 // ++i; 4435 // // c < 256 for Latin1 string, so, no need for branch 4436 // #ifdef PATTERN_STRING_IS_LATIN1 4437 // bc[c] = m - i; 4438 // #else 4439 // if (c < ASIZE) bc[c] = m - i; 4440 // #endif 4441 // } 4442 // 4443 // /* Searching */ 4444 // j = 0; 4445 // while (j <= n - m) { 4446 // c = y[i+j]; 4447 // if (x[m-1] == c) 4448 // for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i); 4449 // if (i < 0) return j; 4450 // // c < 256 for Latin1 string, so, no need for branch 4451 // #ifdef SOURCE_STRING_IS_LATIN1 4452 // // LL case: (c< 256) always true. 
Remove branch 4453 // j += bc[y[j+m-1]]; 4454 // #endif 4455 // #ifndef PATTERN_STRING_IS_UTF 4456 // // UU case: need if (c<ASIZE) check. Skip 1 character if not. 4457 // if (c < ASIZE) 4458 // j += bc[y[j+m-1]]; 4459 // else 4460 // j += 1 4461 // #endif 4462 // #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF 4463 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not. 4464 // if (c < ASIZE) 4465 // j += bc[y[j+m-1]]; 4466 // else 4467 // j += m 4468 // #endif 4469 // } 4470 // } 4471 4472 if (icnt1 == -1) { 4473 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH, 4474 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP; 4475 Register cnt1end = tmp2; 4476 Register str2end = cnt2; 4477 Register skipch = tmp2; 4478 4479 // str1 length is >=8, so, we can read at least 1 register for cases when 4480 // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for 4481 // UL case. We'll re-read last character in inner pre-loop code to have 4482 // single outer pre-loop load 4483 const int firstStep = isL ? 
7 : 3; 4484 4485 const int ASIZE = 256; 4486 const int STORED_BYTES = 32; // amount of bytes stored per instruction 4487 sub(sp, sp, ASIZE); 4488 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations 4489 mov(ch1, sp); 4490 BIND(BM_INIT_LOOP); 4491 stpq(v0, v0, Address(post(ch1, STORED_BYTES))); 4492 subs(tmp5, tmp5, 1); 4493 br(GT, BM_INIT_LOOP); 4494 4495 sub(cnt1tmp, cnt1, 1); 4496 mov(tmp5, str2); 4497 add(str2end, str2, result_tmp, LSL, str2_chr_shift); 4498 sub(ch2, cnt1, 1); 4499 mov(tmp3, str1); 4500 BIND(BCLOOP); 4501 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size))); 4502 if (!str1_isL) { 4503 subs(zr, ch1, ASIZE); 4504 br(HS, BCSKIP); 4505 } 4506 strb(ch2, Address(sp, ch1)); 4507 BIND(BCSKIP); 4508 subs(ch2, ch2, 1); 4509 br(GT, BCLOOP); 4510 4511 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1 4512 if (str1_isL == str2_isL) { 4513 // load last 8 bytes (8LL/4UU symbols) 4514 ldr(tmp6, Address(tmp6, -wordSize)); 4515 } else { 4516 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols) 4517 // convert Latin1 to UTF. We'll have to wait until load completed, but 4518 // it's still faster than per-character loads+checks 4519 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1] 4520 ubfx(ch1, tmp6, 8, 8); // str1[N-2] 4521 ubfx(ch2, tmp6, 16, 8); // str1[N-3] 4522 andr(tmp6, tmp6, 0xFF); // str1[N-4] 4523 orr(ch2, ch1, ch2, LSL, 16); 4524 orr(tmp6, tmp6, tmp3, LSL, 48); 4525 orr(tmp6, tmp6, ch2, LSL, 16); 4526 } 4527 BIND(BMLOOPSTR2); 4528 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 4529 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8 4530 if (str1_isL == str2_isL) { 4531 // re-init tmp3. It's for free because it's executed in parallel with 4532 // load above. 
Alternative is to initialize it before loop, but it'll 4533 // affect performance on in-order systems with 2 or more ld/st pipelines 4534 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size)); 4535 } 4536 if (!isL) { // UU/UL case 4537 lsl(ch2, cnt1tmp, 1); // offset in bytes 4538 } 4539 cmp(tmp3, skipch); 4540 br(NE, BMSKIP); 4541 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2)); 4542 mov(ch1, tmp6); 4543 if (isL) { 4544 b(BMLOOPSTR1_AFTER_LOAD); 4545 } else { 4546 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8 4547 b(BMLOOPSTR1_CMP); 4548 } 4549 BIND(BMLOOPSTR1); 4550 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 4551 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 4552 BIND(BMLOOPSTR1_AFTER_LOAD); 4553 subs(cnt1tmp, cnt1tmp, 1); 4554 br(LT, BMLOOPSTR1_LASTCMP); 4555 BIND(BMLOOPSTR1_CMP); 4556 cmp(ch1, ch2); 4557 br(EQ, BMLOOPSTR1); 4558 BIND(BMSKIP); 4559 if (!isL) { 4560 // if we've met UTF symbol while searching Latin1 pattern, then we can 4561 // skip cnt1 symbols 4562 if (str1_isL != str2_isL) { 4563 mov(result_tmp, cnt1); 4564 } else { 4565 mov(result_tmp, 1); 4566 } 4567 subs(zr, skipch, ASIZE); 4568 br(HS, BMADV); 4569 } 4570 ldrb(result_tmp, Address(sp, skipch)); // load skip distance 4571 BIND(BMADV); 4572 sub(cnt1tmp, cnt1, 1); 4573 add(str2, str2, result_tmp, LSL, str2_chr_shift); 4574 cmp(str2, str2end); 4575 br(LE, BMLOOPSTR2); 4576 add(sp, sp, ASIZE); 4577 b(NOMATCH); 4578 BIND(BMLOOPSTR1_LASTCMP); 4579 cmp(ch1, ch2); 4580 br(NE, BMSKIP); 4581 BIND(BMMATCH); 4582 sub(result, str2, tmp5); 4583 if (!str2_isL) lsr(result, result, 1); 4584 add(sp, sp, ASIZE); 4585 b(DONE); 4586 4587 BIND(LINEARSTUB); 4588 cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm 4589 br(LT, LINEAR_MEDIUM); 4590 mov(result, zr); 4591 RuntimeAddress stub = NULL; 4592 if (isL) { 4593 stub = 
RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll()); 4594 assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated"); 4595 } else if (str1_isL) { 4596 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul()); 4597 assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated"); 4598 } else { 4599 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu()); 4600 assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated"); 4601 } 4602 trampoline_call(stub); 4603 b(DONE); 4604 } 4605 4606 BIND(LINEARSEARCH); 4607 { 4608 Label DO1, DO2, DO3; 4609 4610 Register str2tmp = tmp2; 4611 Register first = tmp3; 4612 4613 if (icnt1 == -1) 4614 { 4615 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT; 4616 4617 cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2)); 4618 br(LT, DOSHORT); 4619 BIND(LINEAR_MEDIUM); 4620 (this->*str1_load_1chr)(first, Address(str1)); 4621 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift))); 4622 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift); 4623 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4624 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4625 4626 BIND(FIRST_LOOP); 4627 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 4628 cmp(first, ch2); 4629 br(EQ, STR1_LOOP); 4630 BIND(STR2_NEXT); 4631 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4632 br(LE, FIRST_LOOP); 4633 b(NOMATCH); 4634 4635 BIND(STR1_LOOP); 4636 adds(cnt1tmp, cnt1_neg, str1_chr_size); 4637 add(cnt2tmp, cnt2_neg, str2_chr_size); 4638 br(GE, MATCH); 4639 4640 BIND(STR1_NEXT); 4641 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp)); 4642 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 4643 cmp(ch1, ch2); 4644 br(NE, STR2_NEXT); 4645 adds(cnt1tmp, cnt1tmp, str1_chr_size); 4646 add(cnt2tmp, cnt2tmp, str2_chr_size); 4647 br(LT, STR1_NEXT); 4648 b(MATCH); 4649 4650 BIND(DOSHORT); 4651 if (str1_isL == str2_isL) { 4652 cmp(cnt1, 
(u1)2); 4653 br(LT, DO1); 4654 br(GT, DO3); 4655 } 4656 } 4657 4658 if (icnt1 == 4) { 4659 Label CH1_LOOP; 4660 4661 (this->*load_4chr)(ch1, str1); 4662 sub(result_tmp, cnt2, 4); 4663 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4664 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4665 4666 BIND(CH1_LOOP); 4667 (this->*load_4chr)(ch2, Address(str2, cnt2_neg)); 4668 cmp(ch1, ch2); 4669 br(EQ, MATCH); 4670 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4671 br(LE, CH1_LOOP); 4672 b(NOMATCH); 4673 } 4674 4675 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) { 4676 Label CH1_LOOP; 4677 4678 BIND(DO2); 4679 (this->*load_2chr)(ch1, str1); 4680 if (icnt1 == 2) { 4681 sub(result_tmp, cnt2, 2); 4682 } 4683 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4684 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4685 BIND(CH1_LOOP); 4686 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 4687 cmp(ch1, ch2); 4688 br(EQ, MATCH); 4689 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4690 br(LE, CH1_LOOP); 4691 b(NOMATCH); 4692 } 4693 4694 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) { 4695 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP; 4696 4697 BIND(DO3); 4698 (this->*load_2chr)(first, str1); 4699 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size)); 4700 if (icnt1 == 3) { 4701 sub(result_tmp, cnt2, 3); 4702 } 4703 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4704 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4705 BIND(FIRST_LOOP); 4706 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 4707 cmpw(first, ch2); 4708 br(EQ, STR1_LOOP); 4709 BIND(STR2_NEXT); 4710 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4711 br(LE, FIRST_LOOP); 4712 b(NOMATCH); 4713 4714 BIND(STR1_LOOP); 4715 add(cnt2tmp, cnt2_neg, 2*str2_chr_size); 4716 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 4717 cmp(ch1, ch2); 4718 br(NE, STR2_NEXT); 4719 b(MATCH); 4720 } 4721 4722 if (icnt1 == -1 || icnt1 == 1) { 4723 Label CH1_LOOP, 
                                 HAS_ZERO, DO1_SHORT, DO1_LOOP;

      // Pattern of length 1 (or the unknown-length fall-through): search for a
      // single character.  For 8-byte-aligned chunks a SWAR "find zero byte"
      // bit-trick is used: after xoring with the broadcast character, a
      // matching lane becomes zero and (x - 0x01..01) & ~(x | 0x7f..7f)
      // flags it in its top bit.
      BIND(DO1);
      (this->*str1_load_1chr)(ch1, str1);
      cmp(cnt2, (u1)8);
      br(LT, DO1_SHORT);

      sub(result_tmp, cnt2, 8/str2_chr_size);
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

      // Broadcast the sought character to every lane of ch1.
      if (str2_isL) {
        orr(ch1, ch1, ch1, LSL, 8);
      }
      orr(ch1, ch1, ch1, LSL, 16);
      orr(ch1, ch1, ch1, LSL, 32);
      BIND(CH1_LOOP);
      ldr(ch2, Address(str2, cnt2_neg));
      eor(ch2, ch1, ch2);
      sub(tmp1, ch2, tmp3);
      orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
      bics(tmp1, tmp1, tmp2);
      br(NE, HAS_ZERO);
      adds(cnt2_neg, cnt2_neg, 8);
      br(LT, CH1_LOOP);

      // The last chunk may overlap an already-scanned one; re-run the loop
      // once at offset 0 unless the whole string was already covered.
      cmp(cnt2_neg, (u1)8);
      mov(cnt2_neg, 0);
      br(LT, CH1_LOOP);
      b(NOMATCH);

      BIND(HAS_ZERO);
      // tmp1 marks the matching lane in its top bit; rev + clz turn that into
      // the byte offset of the first (lowest-addressed) match; LSR 3 converts
      // bits to bytes.
      rev(tmp1, tmp1);
      clz(tmp1, tmp1);
      add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
      b(MATCH);

      BIND(DO1_SHORT);
      // Fewer than 8 characters: plain per-character scan.
      mov(result_tmp, cnt2);
      lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
      cmpw(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
  mov(result, -1);
  b(DONE);
  BIND(MATCH);
  // result_tmp holds the positive base index, cnt2_neg the (negative) byte
  // offset of the match; convert back to a character index.
  add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

// Member-function-pointer shorthands used to select a per-encoding load
// (ldrb for Latin1, ldrh for UTF-16) and a zero-extension instruction.
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

// Find the first occurrence of the UTF-16 character 'ch' in the char array
// at 'str1' of length 'cnt1' (in characters; loads are 2-byte ldrh and the
// address scale is uxtw(1)).  On return 'result' holds the index of the
// match, or -1 if there is none.  cnt1 is clobbered (aliased as cnt1_neg);
// rscratch1/rscratch2 and tmp1-tmp3 are used as temporaries.
void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                         Register ch, Register result,
                                         Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  // Broadcast ch to all four 16-bit lanes.
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  // Point str1 at the last full 8-byte chunk and scan with a negative,
  // upward-counting byte offset.
  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
  // Same SWAR zero-halfword detection as in string_indexof's DO1 case.
  ldr(ch1, Address(str1, cnt1_neg));
  eor(ch1, ch, ch1);
  sub(tmp1, ch1, tmp3);
  orr(tmp2, ch1, 0x7fff7fff7fff7fff);
  bics(tmp1, tmp1, tmp2);
  br(NE, HAS_ZERO);
  adds(cnt1_neg, cnt1_neg, 8);
  br(LT, CH1_LOOP);

  // Possibly re-check the (overlapping) final chunk at offset 0.
  cmp(cnt1_neg, (u1)8);
  mov(cnt1_neg, 0);
  br(LT, CH1_LOOP);
  b(NOMATCH);

  BIND(HAS_ZERO);
  // Locate the first matching lane: rev+clz give its bit offset, LSR 3
  // converts to a byte offset.
  rev(tmp1, tmp1);
  clz(tmp1, tmp1);
  add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
  b(MATCH);

  BIND(DO1_SHORT);
  // Fewer than 4 characters: scalar loop.
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
  ldrh(ch1, Address(str1, cnt1_neg));
  cmpw(ch, ch1);
  br(EQ, MATCH);
  adds(cnt1_neg, cnt1_neg, 2);
  br(LT, DO1_LOOP);
  BIND(NOMATCH);
  mov(result, -1);
  b(DONE);
  BIND(MATCH);
  // ASR 1 converts the byte offset back to a character index.
  add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}

// Compare strings.
// Compare strings str1 (length cnt1) and str2 (length cnt2) for the
// String.compareTo intrinsic.  'ae' encodes the two argument encodings
// (StrIntrinsicNode::LL/LU/UL/UU — Latin1 and/or UTF-16).  On return
// 'result' holds the difference of the first pair of differing characters,
// or the length difference if one string is a prefix of the other.
// cnt1/cnt2, tmp1/tmp2, vtmp1-vtmp3 and rscratch1/rscratch2 are clobbered.
void MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  // Strings at least this long (in characters) are handed off to the
  // out-of-line compare_long_string_* stubs.
  const u1 STUB_THRESHOLD = 64 + 8;
  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  // Number of characters that fit in one 8-byte longword.
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  // Per-encoding character load and zero-extension instructions.
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LT, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      // Identical references compare equal; result already holds the
      // length difference (zero for equal references).
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, STUB_THRESHOLD);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      // Advance both pointers to their ends and scan with negative,
      // upward-counting byte offsets.
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, STUB_THRESHOLD);
      br(GE, STUB);
      subsw(cnt2, cnt2, 4);
      br(EQ, TAIL_CHECK);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      // zip1 with the zeroed vtmpZ interleaves zero bytes, widening 4
      // Latin1 bytes into 4 UTF-16 characters.
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldrs(vtmp, Address(str2));
      cmp(cnt2, STUB_THRESHOLD);
      br(GE, STUB);
      subsw(cnt2, cnt2, 4);
      br(EQ, TAIL_CHECK);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      // Widen the Latin1 side (str2) to UTF-16 for the comparison.
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFFERENCE);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      // Load 4 Latin1 bytes from str1 and inflate them to match the 8
      // UTF-16 bytes loaded from str2.
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFFERENCE);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFFERENCE);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFFERENCE);
    // rev + clz find the bit offset of the first differing (lowest-
    // addressed) character; masking to a character boundary lets lsrv
    // align both characters at bit 0 before subtracting.
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
  RuntimeAddress stub = NULL;
  switch(ae) {
    case StrIntrinsicNode::LL:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
      break;
    case StrIntrinsicNode::UU:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
      break;
    case StrIntrinsicNode::LU:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
      break;
    case StrIntrinsicNode::UL:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
      break;
    default:
      ShouldNotReachHere();
  }
  assert(stub.target() != NULL, "compare_long_string stub has not been generated");
  trampoline_call(stub);
  b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // arrange code to do most branches while loading and loading next characters
  // while comparing previous
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  // Software-pipelined: (tmp1, cnt1) hold the previous character pair,
  // (tmp2, rscratch1) the current one.
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}

// This method checks if provided byte array contains byte with highest bit set.
void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
  // Simple and most common case of aligned small array which is not at the
  // end of memory page is placed here. All other cases are in stub.
  Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
  const uint64_t UPPER_BIT_MASK=0x8080808080808080;
  assert_different_registers(ary1, len, result);

  cmpw(len, 0);
  br(LE, SET_RESULT);
  cmpw(len, 4 * wordSize);
  br(GE, STUB_LONG); // size > 32 then go to stub

  // The inline loop over-reads up to a full word past 'len'; that is only
  // safe if the read cannot cross into the next page, so check whether the
  // array tail is within 4 words of a page boundary.
  int shift = 64 - exact_log2(os::vm_page_size());
  lsl(rscratch1, ary1, shift);
  mov(rscratch2, (size_t)(4 * wordSize) << shift);
  adds(rscratch2, rscratch1, rscratch2);  // At end of page?
  br(CS, STUB); // at the end of page then go to stub
  subs(len, len, wordSize);
  br(LT, END);

  BIND(LOOP);
  // Test 8 bytes at a time against the sign-bit mask.
  ldr(rscratch1, Address(post(ary1, wordSize)));
  tst(rscratch1, UPPER_BIT_MASK);
  br(NE, SET_RESULT);
  subs(len, len, wordSize);
  br(GE, LOOP);
  cmpw(len, -wordSize);
  br(EQ, SET_RESULT);

  BIND(END);
  // Fewer than 8 bytes remain: load a full word and shift out the bytes
  // that lie past the end of the array (len is the negative remainder;
  // LSL 3 converts bytes to bits) before testing the sign bits.
  ldr(result, Address(ary1));
  sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
  lslv(result, result, len);
  tst(result, UPPER_BIT_MASK);
  b(SET_RESULT);

  BIND(STUB);
  RuntimeAddress has_neg = RuntimeAddress(StubRoutines::aarch64::has_negatives());
  assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
  trampoline_call(has_neg);
  b(DONE);

  BIND(STUB_LONG);
  RuntimeAddress has_neg_long = RuntimeAddress(
      StubRoutines::aarch64::has_negatives_long());
  assert(has_neg_long.target() != NULL, "has_negatives stub has not been generated");
  trampoline_call(has_neg_long);
  b(DONE);

  BIND(SET_RESULT);
  // Flags were set by the last tst (or cmpw for the empty case).
  cset(result, NE); // set true or false

  BIND(DONE);
}

// Compare the contents of two (possibly null) arrays a1 and a2.  On return
// 'result' is non-zero iff both references are equal, or the arrays have
// equal length and identical contents.  elem_size is 1 (byte) or 2 (char);
// cnt1 and tmp3-tmp5 are clobbered as temporaries.
void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
                                   Register tmp4, Register tmp5, Register result,
                                   Register cnt1, int elem_size) {
  Label DONE, SAME;
  Register tmp1 = rscratch1;
  Register tmp2 = rscratch2;
  Register cnt2 = tmp2;  // cnt2 only used in array length compare
  int elem_per_word = wordSize/elem_size;
  int log_elem_size = exact_log2(elem_size);
  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset
    = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
  // Byte count above which comparison is delegated to the out-of-line
  // large_array_equals stub.
  int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);

  assert(elem_size == 1 || elem_size == 2, "must be char or byte");
  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);

#ifndef PRODUCT
  {
    const char kind = (elem_size == 2) ? 'U' : 'L';
    char comment[64];
    snprintf(comment, sizeof comment, "array_equals%c{", kind);
    BLOCK_COMMENT(comment);
  }
#endif

  // if (a1 == a2)
  //     return true;
  cmpoop(a1, a2); // May have read barriers for a1 and a2.
  br(EQ, SAME);

  if (UseSimpleArrayEquals) {
    Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
    // if (a1 == null || a2 == null)
    //     return false;
    // a1 & a2 == 0 means (some-pointer is null) or
    // (very-rare-or-even-probably-impossible-pointer-values)
    // so, we can save one branch in most cases
    tst(a1, a2);
    mov(result, false);
    br(EQ, A_MIGHT_BE_NULL);
    // if (a1.length != a2.length)
    //      return false;
    bind(A_IS_NOT_NULL);
    ldrw(cnt1, Address(a1, length_offset));
    ldrw(cnt2, Address(a2, length_offset));
    eorw(tmp5, cnt1, cnt2);
    cbnzw(tmp5, DONE);
    lea(a1, Address(a1, base_offset));
    lea(a2, Address(a2, base_offset));
    // Check for short strings, i.e. smaller than wordSize.
    subs(cnt1, cnt1, elem_per_word);
    br(Assembler::LT, SHORT);
    // Main 8 byte comparison loop.
    bind(NEXT_WORD); {
      ldr(tmp1, Address(post(a1, wordSize)));
      ldr(tmp2, Address(post(a2, wordSize)));
      subs(cnt1, cnt1, elem_per_word);
      eor(tmp5, tmp1, tmp2);
      cbnz(tmp5, DONE);
    } br(GT, NEXT_WORD);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
    // length == 4.
    // Re-compare the final (possibly overlapping) word at the exact end of
    // both arrays; cnt1 here is the non-positive element remainder.
    if (log_elem_size > 0)
      lsl(cnt1, cnt1, log_elem_size);
    ldr(tmp3, Address(a1, cnt1));
    ldr(tmp4, Address(a2, cnt1));
    eor(tmp5, tmp3, tmp4);
    cbnz(tmp5, DONE);
    b(SAME);
    bind(A_MIGHT_BE_NULL);
    // in case both a1 and a2 are not-null, proceed with loads
    cbz(a1, DONE);
    cbz(a2, DONE);
    b(A_IS_NOT_NULL);
    bind(SHORT);

    // Tails of 4, 2 and 1 bytes, selected by the bits of the remaining
    // element count.
    tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
    {
      ldrw(tmp1, Address(post(a1, 4)));
      ldrw(tmp2, Address(post(a2, 4)));
      eorw(tmp5, tmp1, tmp2);
      cbnzw(tmp5, DONE);
    }
    bind(TAIL03);
    tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
    {
      ldrh(tmp3, Address(post(a1, 2)));
      ldrh(tmp4, Address(post(a2, 2)));
      eorw(tmp5, tmp3, tmp4);
      cbnzw(tmp5, DONE);
    }
    bind(TAIL01);
    if (elem_size == 1) { // Only needed when comparing byte arrays.
      tbz(cnt1, 0, SAME); // 0-1 bytes left.
      {
        ldrb(tmp1, a1);
        ldrb(tmp2, a2);
        eorw(tmp5, tmp1, tmp2);
        cbnzw(tmp5, DONE);
      }
    }
  } else {
    Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
        CSET_EQ, LAST_CHECK;
    // NOTE(review): no branch to EARLY_OUT is visible in this function —
    // possibly dead code; verify against upstream history.
    mov(result, false);
    cbz(a1, DONE);
    ldrw(cnt1, Address(a1, length_offset));
    cbz(a2, DONE);
    ldrw(cnt2, Address(a2, length_offset));
    // on most CPUs a2 is still "locked"(surprisingly) in ldrw and it's
    // faster to perform another branch before comparing a1 and a2
    cmp(cnt1, (u1)elem_per_word);
    br(LE, SHORT); // short or same
    ldr(tmp3, Address(pre(a1, base_offset)));
    subs(zr, cnt1, stubBytesThreshold);
    br(GE, STUB);
    ldr(tmp4, Address(pre(a2, base_offset)));
    // tmp5 := -(length in bits); used below to mask the overlapping tail.
    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
    cmp(cnt2, cnt1);
    br(NE, DONE);

    // Main 16 byte comparison loop with 2 exits
    bind(NEXT_DWORD); {
      ldr(tmp1, Address(pre(a1, wordSize)));
      ldr(tmp2, Address(pre(a2, wordSize)));
      subs(cnt1, cnt1, 2 * elem_per_word);
      br(LE, TAIL);
      eor(tmp4, tmp3, tmp4);
      cbnz(tmp4, DONE);
      ldr(tmp3, Address(pre(a1, wordSize)));
      ldr(tmp4, Address(pre(a2, wordSize)));
      cmp(cnt1, (u1)elem_per_word);
      br(LE, TAIL2);
      cmp(tmp1, tmp2);
    } br(EQ, NEXT_DWORD);
    b(DONE);

    bind(TAIL);
    // Combine the last two pending words; lslv shifts out the bytes of
    // the final word that lie beyond the array length (tmp5 is negative,
    // so only its low 6 bits act as the shift amount).
    eor(tmp4, tmp3, tmp4);
    eor(tmp2, tmp1, tmp2);
    lslv(tmp2, tmp2, tmp5);
    orr(tmp5, tmp4, tmp2);
    cmp(tmp5, zr);
    b(CSET_EQ);

    bind(TAIL2);
    eor(tmp2, tmp1, tmp2);
    cbnz(tmp2, DONE);
    b(LAST_CHECK);

    bind(STUB);
    ldr(tmp4, Address(pre(a2, base_offset)));
    cmp(cnt2, cnt1);
    br(NE, DONE);
    if (elem_size == 2) { // convert to byte counter
      lsl(cnt1, cnt1, 1);
    }
    eor(tmp5, tmp3, tmp4);
    cbnz(tmp5, DONE);
    RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
    assert(stub.target() != NULL, "array_equals_long stub has not been generated");
    trampoline_call(stub);
    b(DONE);

    bind(EARLY_OUT);
    // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
    // so, if a2 == null => return false(0), else return true, so we can return a2
    mov(result, a2);
    b(DONE);
    bind(SHORT);
    cmp(cnt2, cnt1);
    br(NE, DONE);
    cbz(cnt1, SAME);
    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
    ldr(tmp3, Address(a1, base_offset));
    ldr(tmp4, Address(a2, base_offset));
    bind(LAST_CHECK);
    eor(tmp4, tmp3, tmp4);
    lslv(tmp5, tmp4, tmp5);
    cmp(tmp5, zr);
    bind(CSET_EQ);
    cset(result, EQ);
    b(DONE);
  }

  bind(SAME);
  mov(result, true);
  // That's it.
  bind(DONE);

  BLOCK_COMMENT("} array_equals");
}

// Compare Strings

// For Strings we're passed the address of the first characters in a1
// and a2 and the length in cnt1.
// elem_size is the element size in bytes: either 1 or 2.
// There are two implementations.  For arrays >= 8 bytes, all
// comparisons (including the final one, which may overlap) are
// performed 8 bytes at a time.  For strings < 8 bytes, we compare a
// halfword, then a short, and then a byte.

// 'result' is set to true iff the cnt1 bytes at a1 and a2 are identical.
// a1, a2, cnt1, rscratch1 and rscratch2 are clobbered.
void MacroAssembler::string_equals(Register a1, Register a2,
                                   Register result, Register cnt1, int elem_size)
{
  Label SAME, DONE, SHORT, NEXT_WORD;
  Register tmp1 = rscratch1;
  Register tmp2 = rscratch2;
  Register cnt2 = tmp2;  // cnt2 only used in array length compare

  assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);

#ifndef PRODUCT
  {
    const char kind = (elem_size == 2) ? 'U' : 'L';
    char comment[64];
    snprintf(comment, sizeof comment, "{string_equals%c", kind);
    BLOCK_COMMENT(comment);
  }
#endif

  mov(result, false);

  // Check for short strings, i.e. smaller than wordSize.
  subs(cnt1, cnt1, wordSize);
  br(Assembler::LT, SHORT);
  // Main 8 byte comparison loop.
  bind(NEXT_WORD); {
    ldr(tmp1, Address(post(a1, wordSize)));
    ldr(tmp2, Address(post(a2, wordSize)));
    subs(cnt1, cnt1, wordSize);
    eor(tmp1, tmp1, tmp2);
    cbnz(tmp1, DONE);
  } br(GT, NEXT_WORD);
  // Last longword.  In the case where length == 4 we compare the
  // same longword twice, but that's still faster than another
  // conditional branch.
  // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
  // length == 4.
  ldr(tmp1, Address(a1, cnt1));
  ldr(tmp2, Address(a2, cnt1));
  eor(tmp2, tmp1, tmp2);
  cbnz(tmp2, DONE);
  b(SAME);

  bind(SHORT);
  Label TAIL03, TAIL01;

  // 4-, 2- and 1-byte tails, selected by the low bits of the length.
  tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
5388 { 5389 ldrw(tmp1, Address(post(a1, 4))); 5390 ldrw(tmp2, Address(post(a2, 4))); 5391 eorw(tmp1, tmp1, tmp2); 5392 cbnzw(tmp1, DONE); 5393 } 5394 bind(TAIL03); 5395 tbz(cnt1, 1, TAIL01); // 0-3 bytes left. 5396 { 5397 ldrh(tmp1, Address(post(a1, 2))); 5398 ldrh(tmp2, Address(post(a2, 2))); 5399 eorw(tmp1, tmp1, tmp2); 5400 cbnzw(tmp1, DONE); 5401 } 5402 bind(TAIL01); 5403 if (elem_size == 1) { // Only needed when comparing 1-byte elements 5404 tbz(cnt1, 0, SAME); // 0-1 bytes left. 5405 { 5406 ldrb(tmp1, a1); 5407 ldrb(tmp2, a2); 5408 eorw(tmp1, tmp1, tmp2); 5409 cbnzw(tmp1, DONE); 5410 } 5411 } 5412 // Arrays are equal. 5413 bind(SAME); 5414 mov(result, true); 5415 5416 // That's it. 5417 bind(DONE); 5418 BLOCK_COMMENT("} string_equals"); 5419 } 5420 5421 5422 // The size of the blocks erased by the zero_blocks stub. We must 5423 // handle anything smaller than this ourselves in zero_words(). 5424 const int MacroAssembler::zero_words_block_size = 8; 5425 5426 // zero_words() is used by C2 ClearArray patterns. It is as small as 5427 // possible, handling small word counts locally and delegating 5428 // anything larger to the zero_blocks stub. It is expanded many times 5429 // in compiled code, so it is important to keep it short. 5430 5431 // ptr: Address of a buffer to be zeroed. 5432 // cnt: Count in HeapWords. 5433 // 5434 // ptr, cnt, rscratch1, and rscratch2 are clobbered. 
5435 void MacroAssembler::zero_words(Register ptr, Register cnt) 5436 { 5437 assert(is_power_of_2(zero_words_block_size), "adjust this"); 5438 assert(ptr == r10 && cnt == r11, "mismatch in register usage"); 5439 5440 BLOCK_COMMENT("zero_words {"); 5441 cmp(cnt, (u1)zero_words_block_size); 5442 Label around, done, done16; 5443 br(LO, around); 5444 { 5445 RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks()); 5446 assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated"); 5447 if (StubRoutines::aarch64::complete()) { 5448 trampoline_call(zero_blocks); 5449 } else { 5450 bl(zero_blocks); 5451 } 5452 } 5453 bind(around); 5454 for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) { 5455 Label l; 5456 tbz(cnt, exact_log2(i), l); 5457 for (int j = 0; j < i; j += 2) { 5458 stp(zr, zr, post(ptr, 16)); 5459 } 5460 bind(l); 5461 } 5462 { 5463 Label l; 5464 tbz(cnt, 0, l); 5465 str(zr, Address(ptr)); 5466 bind(l); 5467 } 5468 BLOCK_COMMENT("} zero_words"); 5469 } 5470 5471 // base: Address of a buffer to be zeroed, 8 bytes aligned. 5472 // cnt: Immediate count in HeapWords. 
#define SmallArraySize (18 * BytesPerLong)
// Variant of zero_words with the count known at code-emission time: small
// counts are fully unrolled, larger ones use a partially unrolled runtime
// loop (clobbering rscratch1/rscratch2).
void MacroAssembler::zero_words(Register base, u_int64_t cnt)
{
  BLOCK_COMMENT("zero_words {");
  int i = cnt & 1;  // store any odd word to start
  if (i) str(zr, Address(base));

  if (cnt <= SmallArraySize / BytesPerLong) {
    // Fully unrolled: one stp per word pair.
    for (; i < (int)cnt; i += 2)
      stp(zr, zr, Address(base, i * wordSize));
  } else {
    const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
    int remainder = cnt % (2 * unroll);
    // Peel the remainder so the loop below runs a whole number of times.
    for (; i < remainder; i += 2)
      stp(zr, zr, Address(base, i * wordSize));

    Label loop;
    Register cnt_reg = rscratch1;
    Register loop_base = rscratch2;
    cnt = cnt - remainder;
    mov(cnt_reg, cnt);
    // adjust base and prebias by -2 * wordSize so we can pre-increment
    add(loop_base, base, (remainder - 2) * wordSize);
    bind(loop);
    sub(cnt_reg, cnt_reg, 2 * unroll);
    for (i = 1; i < unroll; i++)
      stp(zr, zr, Address(loop_base, 2 * i * wordSize));
    stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
    cbnz(cnt_reg, loop);
  }
  BLOCK_COMMENT("} zero_words");
}

// Zero blocks of memory by using DC ZVA.
//
// Aligns the base address first sufficently for DC ZVA, then uses
// DC ZVA repeatedly for every full block.  cnt is the size to be
// zeroed in HeapWords.  Returns the count of words left to be zeroed
// in cnt.
//
// NOTE: This is intended to be used in the zero_blocks() stub.  If
// you want to use it elsewhere, note that cnt must be >= 2*zva_length.
void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
  Register tmp = rscratch1;
  Register tmp2 = rscratch2;
  int zva_length = VM_Version::zva_length();
  Label initial_table_end, loop_zva;
  Label fini;

  // Base must be 16 byte aligned. If not just return and let caller handle it
  tst(base, 0x0f);
  br(Assembler::NE, fini);
  // Align base with ZVA length.
  neg(tmp, base);
  andr(tmp, tmp, zva_length - 1);

  // tmp: the number of bytes to be filled to align the base with ZVA length.
  add(base, base, tmp);
  sub(cnt, cnt, tmp, Assembler::ASR, 3);
  // Computed branch into the stp table below: each stp is one 4-byte
  // instruction zeroing 16 bytes, so stepping back tmp/16 instructions
  // (tmp >> 2 bytes of code) zeroes exactly 'tmp' alignment bytes.
  adr(tmp2, initial_table_end);
  sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
  br(tmp2);

  for (int i = -zva_length + 16; i < 0; i += 16)
    stp(zr, zr, Address(base, i));
  bind(initial_table_end);

  // Zero one ZVA-sized block per iteration with the DC ZVA cache op.
  sub(cnt, cnt, zva_length >> 3);
  bind(loop_zva);
  dc(Assembler::ZVA, base);
  subs(cnt, cnt, zva_length >> 3);
  add(base, base, zva_length);
  br(Assembler::GE, loop_zva);
  add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
  bind(fini);
}

// base:   Address of a buffer to be filled, 8 bytes aligned.
// cnt:    Count in 8-byte unit.
// value:  Value to be filled with.
// base will point to the end of the buffer after filling.
void MacroAssembler::fill_words(Register base, Register cnt, Register value)
{
//  Algorithm:
//
//    scratch1 = cnt & 7;
//    cnt -= scratch1;
//    p += scratch1;
//    switch (scratch1) {
//      do {
//        cnt -= 8;
//          p[-8] = v;
//        case 7:
//          p[-7] = v;
//        case 6:
//          p[-6] = v;
//          // ...
//        case 1:
//          p[-1] = v;
//        case 0:
//          p += 8;
//      } while (cnt);
//    }

  assert_different_registers(base, cnt, value, rscratch1, rscratch2);

  Label fini, skip, entry, loop;
  const int unroll = 8; // Number of stp instructions we'll unroll

  cbz(cnt, fini);
  // If base is not 16-byte aligned, emit one single str first so the stp
  // pairs below stay aligned.
  tbz(base, 3, skip);
  str(value, Address(post(base, 8)));
  sub(cnt, cnt, 1);
  bind(skip);

  andr(rscratch1, cnt, (unroll-1) * 2);
  sub(cnt, cnt, rscratch1);
  add(base, base, rscratch1, Assembler::LSL, 3);
  // Duff's-device-style computed branch into the unrolled stp sequence
  // (each stp is 4 bytes of code and stores 2 words).
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
  br(rscratch2);

  bind(loop);
  add(base, base, unroll * 16);
  for (int i = -unroll; i < 0; i++)
    stp(value, value, Address(base, i * 16));
  bind(entry);
  subs(cnt, cnt, unroll * 2);
  br(Assembler::GE, loop);

  // Odd trailing word.
  tbz(cnt, 0, fini);
  str(value, Address(post(base, 8)));
  bind(fini);
}

// Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
// java/lang/StringUTF16.compress.
// Encode a UTF-16 char array to ISO-8859-1 (latin-1) bytes.
//
// src:    address of the source char array (2 bytes per element)
// dst:    address of the destination byte array
// len:    number of chars to encode
// result: set to the number of chars actually encoded, i.e. the index at
//         which the first non-latin-1 char (value > 0xff) was found, or the
//         initial len if all chars were encoded
//
// Clobbers rscratch1/rscratch2 and SIMD registers v4/v5 in addition to the
// supplied Vtmp registers.
void MacroAssembler::encode_iso_array(Register src, Register dst,
                      Register len, Register result,
                      FloatRegister Vtmp1, FloatRegister Vtmp2,
                      FloatRegister Vtmp3, FloatRegister Vtmp4)
{
    Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
        NEXT_32_START, NEXT_32_PRFM_START;
    Register tmp1 = rscratch1, tmp2 = rscratch2;

      mov(result, len); // Save initial len

#ifndef BUILTIN_SIM
      cmp(len, (u1)8); // handle shortest strings first
      br(LT, LOOP_1);
      cmp(len, (u1)32);
      br(LT, NEXT_8);
      // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
      // to convert chars to bytes: uzp1 gathers the even (low) bytes of
      // each char, uzp2 gathers the odd (high) bytes, which must all be
      // zero for a valid latin-1 encoding.
      if (SoftwarePrefetchHintDistance >= 0) {
        // 32-chars-at-a-time loop with software prefetch.
        ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
        br(LE, NEXT_32_START);
        b(NEXT_32_PRFM_START);
        BIND(NEXT_32_PRFM);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_PRFM_START);
          prfm(Address(src, SoftwarePrefetchHintDistance));
          // Accumulate all chars into v4/v5 so we can test the high bytes
          // of the whole 32-char group at once.
          orr(v4, T16B, Vtmp1, Vtmp2);
          orr(v5, T16B, Vtmp3, Vtmp4);
          uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);   // low bytes of chars 0..15
          uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);   // low bytes of chars 16..31
          stpq(Vtmp1, Vtmp3, dst);           // store 32 encoded bytes
          uzp2(v5, T16B, v4, v5); // high bytes
          // Any non-zero high byte means a non-latin-1 char: redo this
          // group with the slower 8-char loop to find where it is.
          umov(tmp2, v5, D, 1);
          fmovd(tmp1, v5);
          orr(tmp1, tmp1, tmp2);
          cbnz(tmp1, LOOP_8);
          sub(len, len, 32);
          add(dst, dst, 32);
          add(src, src, 64);
          subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
          br(GE, NEXT_32_PRFM);
          cmp(len, (u1)32);
          br(LT, LOOP_8);
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_START);
      } else {
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
      }
      prfm(Address(src, SoftwarePrefetchHintDistance));
      uzp1(v4, T16B, Vtmp1, Vtmp2);          // low bytes of chars 0..15
      uzp1(v5, T16B, Vtmp3, Vtmp4);          // low bytes of chars 16..31
      stpq(v4, v5, dst);                     // store 32 encoded bytes
      orr(Vtmp1, T16B, Vtmp1, Vtmp2);
      orr(Vtmp3, T16B, Vtmp3, Vtmp4);
      uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
      // Any non-zero high byte => non-latin-1 char in this group; redo it
      // with the 8-char loop.
      umov(tmp2, Vtmp1, D, 1);
      fmovd(tmp1, Vtmp1);
      orr(tmp1, tmp1, tmp2);
      cbnz(tmp1, LOOP_8);
      sub(len, len, 32);
      add(dst, dst, 32);
      add(src, src, 64);
      cmp(len, (u1)32);
      br(GE, NEXT_32);
      cbz(len, DONE);

    // 8 chars at a time.
    BIND(LOOP_8);
      cmp(len, (u1)8);
      br(LT, LOOP_1);
    BIND(NEXT_8);
      ld1(Vtmp1, T8H, src);
      uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
      uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
      strd(Vtmp2, dst);                // store 8 encoded bytes
      // A non-zero high byte in this group: fall back to the 1-char loop
      // to locate the offending char exactly.
      fmovd(tmp1, Vtmp3);
      cbnz(tmp1, NEXT_1);

      sub(len, len, 8);
      add(dst, dst, 8);
      add(src, src, 16);
      cmp(len, (u1)8);
      br(GE, NEXT_8);

    BIND(LOOP_1);
#endif
    // Scalar loop: one char at a time; stops at the first char > 0xff.
    cbz(len, DONE);
    BIND(NEXT_1);
      ldrh(tmp1, Address(post(src, 2)));
      strb(tmp1, Address(post(dst, 1)));
      tst(tmp1, 0xff00);               // high byte set => not latin-1
      br(NE, SET_RESULT);
      subs(len, len, 1);
      br(GT, NEXT_1);

    BIND(SET_RESULT);
      sub(result, result, len); // Return index where we stopped
                                // Return len == 0 if we processed all
                                // characters
    BIND(DONE);
}


// Inflate byte[] array to char[].
//
// src:  address of the source byte array
// dst:  address of the destination char array (2 bytes per element)
// len:  number of bytes to inflate
// tmp4: scratch general register
// vtmp1..vtmp3: scratch SIMD registers (vtmp1 is zeroed and used as the
//               high-byte source for zip1)
//
// Inflation zero-extends each byte to a char by interleaving (zip1) the
// source bytes with the zero vector vtmp1.
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
                                        FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                        Register tmp4) {
  Label big, done, after_init, to_stub;

  assert_different_registers(src, dst, len, tmp4, rscratch1);

  fmovd(vtmp1, zr);      // vtmp1 = 0: the zero high bytes for zip1
  lsrw(tmp4, len, 3);    // tmp4 = number of whole 8-byte groups
  bind(after_init);
  cbnzw(tmp4, big);
  // Short string: less than 8 bytes.
  {
    Label loop, tiny;

    cmpw(len, 4);
    br(LT, tiny);
    // Use SIMD to do 4 bytes.
    ldrs(vtmp2, post(src, 4));
    zip1(vtmp3, T8B, vtmp2, vtmp1);
    subw(len, len, 4);
    strd(vtmp3, post(dst, 8));

    cbzw(len, done);

    // Do the remaining bytes by steam.
    bind(loop);
    ldrb(tmp4, post(src, 1));
    strh(tmp4, post(dst, 2));
    subw(len, len, 1);

    bind(tiny);
    cbnz(len, loop);

    b(done);
  }

  if (SoftwarePrefetchHintDistance >= 0) {
    // Large arrays are handed off to the out-of-line stub, which returns
    // with the remaining work set up so we re-enter at after_init.
    bind(to_stub);
      RuntimeAddress stub =  RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
      assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
      trampoline_call(stub);
      b(after_init);
  }

  // Unpack the bytes 8 at a time.
  bind(big);
  {
    Label loop, around, loop_last, loop_start;

    if (SoftwarePrefetchHintDistance >= 0) {
      const int large_loop_threshold = (64 + 16)/8;
      ldrd(vtmp2, post(src, 8));
      andw(len, len, 7);     // len = tail byte count (< 8)
      cmp(tmp4, (u1)large_loop_threshold);
      br(GE, to_stub);
      b(loop_start);

      // Two 8-byte groups per iteration, with the load for the second
      // group overlapped with the store of the first.
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      bind(loop_start);
      subs(tmp4, tmp4, 1);
      br(EQ, loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      ldrd(vtmp3, post(src, 8));
      st1(vtmp2, T8H, post(dst, 16));
      subs(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp3, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      br(NE, loop);
      b(around);
      bind(loop_last);
      // Odd final group already loaded in vtmp2.
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      st1(vtmp2, T8H, post(dst, 16));
      bind(around);
      cbz(len, done);
    } else {
      andw(len, len, 7);     // len = tail byte count (< 8)
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      sub(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp2, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      cbnz(tmp4, loop);
    }
  }

  // Do the tail of up to 8 bytes.
  // Loads the last 8 source bytes (re-reading bytes already inflated when
  // len < 8) and stores 16 chars ending exactly at the destination end, so
  // the overlap rewrites identical data.
  add(src, src, len);
  ldrd(vtmp3, Address(src, -8));
  add(dst, dst, len, ext::uxtw, 1);
  zip1(vtmp3, T16B, vtmp3, vtmp1);
  strq(vtmp3, Address(dst, -16));

  bind(done);
}

// Compress char[] array to byte[].
5814 void MacroAssembler::char_array_compress(Register src, Register dst, Register len, 5815 FloatRegister tmp1Reg, FloatRegister tmp2Reg, 5816 FloatRegister tmp3Reg, FloatRegister tmp4Reg, 5817 Register result) { 5818 encode_iso_array(src, dst, len, result, 5819 tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg); 5820 cmp(len, zr); 5821 csel(result, result, zr, EQ); 5822 } 5823 5824 // get_thread() can be called anywhere inside generated code so we 5825 // need to save whatever non-callee save context might get clobbered 5826 // by the call to JavaThread::aarch64_get_thread_helper() or, indeed, 5827 // the call setup code. 5828 // 5829 // aarch64_get_thread_helper() clobbers only r0, r1, and flags. 5830 // 5831 void MacroAssembler::get_thread(Register dst) { 5832 RegSet saved_regs = RegSet::range(r0, r1) + lr - dst; 5833 push(saved_regs, sp); 5834 5835 mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper)); 5836 blrt(lr, 1, 0, 1); 5837 if (dst != c_rarg0) { 5838 mov(dst, c_rarg0); 5839 } 5840 5841 pop(saved_regs, sp); 5842 }