1 /* 2 * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 
23 * 24 */ 25 26 #include <sys/types.h> 27 28 #include "precompiled.hpp" 29 #include "jvm.h" 30 #include "asm/assembler.hpp" 31 #include "asm/assembler.inline.hpp" 32 #include "gc/shared/barrierSet.hpp" 33 #include "gc/shared/cardTable.hpp" 34 #include "gc/shared/barrierSetAssembler.hpp" 35 #include "gc/shared/cardTableBarrierSet.hpp" 36 #include "interpreter/interpreter.hpp" 37 #include "compiler/disassembler.hpp" 38 #include "memory/resourceArea.hpp" 39 #include "nativeInst_aarch64.hpp" 40 #include "oops/accessDecorators.hpp" 41 #include "oops/compressedOops.inline.hpp" 42 #include "oops/klass.inline.hpp" 43 #include "runtime/biasedLocking.hpp" 44 #include "runtime/icache.hpp" 45 #include "runtime/interfaceSupport.inline.hpp" 46 #include "runtime/jniHandles.inline.hpp" 47 #include "runtime/sharedRuntime.hpp" 48 #include "runtime/thread.hpp" 49 #ifdef COMPILER1 50 #include "c1/c1_LIRAssembler.hpp" 51 #endif 52 #ifdef COMPILER2 53 #include "oops/oop.hpp" 54 #include "opto/compile.hpp" 55 #include "opto/intrinsicnode.hpp" 56 #include "opto/node.hpp" 57 #endif 58 59 #ifdef PRODUCT 60 #define BLOCK_COMMENT(str) /* nothing */ 61 #define STOP(error) stop(error) 62 #else 63 #define BLOCK_COMMENT(str) block_comment(str) 64 #define STOP(error) block_comment(error); stop(error) 65 #endif 66 67 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 68 69 // Patch any kind of instruction; there may be several instructions. 70 // Return the total length (in bytes) of the instructions. 
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  // Branch-family encodings hold a scaled (instruction-word) offset.
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target-branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      // ADRP: page-granular addressing (4K pages).
      u_int64_t dest = (u_int64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
      //
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
          Instruction_aarch64::extract(insn, 4, 0) ==
          Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    // ADR/ADRP split the immediate: low 2 bits into immlo (bits 30:29),
    // the remaining signed part into immhi (bits 23:5).
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}

// Patch the movz/movk sequence that materializes an oop constant so it
// produces `o`. Returns the number of bytes covered by the sequence.
int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 or 64 bits). We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
  // 64 bit addresses are only enabled with Use64BitLiteralAddresses set.
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = CompressedOops::encode((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;

    if (Use64BitLiteralAddresses) {
      // A fourth movk carries bits 63:48 of the address.
      assert(nativeInstruction_at(insn_addr+12)->is_movk(), "wrong insns in patch");
      Instruction_aarch64::patch(insn_addr+12, 20, 5, (dest >>= 16) & 0xffff);
      instructions = 4;
    }
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

// Decode the destination address encoded by the instruction (sequence)
// at insn_addr — the decoding counterpart of pd_patch_instruction_size.
address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm12<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
          Instruction_aarch64::extract(insn, 4, 0) ==
          Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        // Type 3: a following movk supplies bits 47:32 of the target page.
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
            Instruction_aarch64::extract(insn, 4, 0) ==
            Instruction_aarch64::extract(insn2, 4, 0)) {
          target_page = (target_page & 0xffffffff) |
                        ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    u_int32_t *insns = (u_int32_t *)insn_addr;
    // Move wide constant: movz, movk, movk [, movk]. See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch - 2nd movk missing");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch - 3rd movk missing");
    u_int64_t addr = u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32);

    // With Use64BitLiteralAddresses a fourth movk supplies bits 63:48.
    if (Use64BitLiteralAddresses) {
      assert(nativeInstruction_at(insns+3)->is_movk(), "wrong insns in patch - 4th movk missing.");
      addr += u_int64_t(Instruction_aarch64::extract(insns[3], 20, 5)) << 48;
    }
    return (address) addr;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // Polling-page load: carries no target address.
    return 0;
  } else {
    ShouldNotReachHere();
  }
  return address(((uint64_t)insn_addr + (offset << 2)));
}

// Emit a safepoint check; branches to slow_path when a safepoint is pending.
// Clobbers rscratch1.
void MacroAssembler::safepoint_poll(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    unsigned long offset;
    adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
    ldrw(rscratch1, Address(rscratch1, offset));
    assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
    cbnz(rscratch1, slow_path);
  }
}

// Just like safepoint_poll, but use an acquiring load for thread-
// local polling.
//
// We need an acquire here to ensure that any subsequent load of the
// global SafepointSynchronize::_state flag is ordered after this load
// of the local Thread::_polling page. We don't want this poll to
// return false (i.e. not safepointing) and a later poll of the global
// SafepointSynchronize::_state spuriously to return true.
//
// This is to avoid a race when we're in a native->Java transition
// racing the code which wakes up from a safepoint.
//
void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
    ldar(rscratch1, rscratch1);
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    safepoint_poll(slow_path);
  }
}

// Clear the thread's last-Java-frame anchor (sp, pc, and optionally fp).
void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp & sp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
// Record the last Java frame anchor (sp/fp/pc in registers) in the current
// JavaThread. `scratch` is used when last_java_sp is the real sp.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
      str(last_java_pc, Address(rthread,
                                JavaThread::frame_anchor_offset()
                                + JavaFrameAnchor::last_Java_pc_offset()));
    }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

// Variant taking the last Java pc as an immediate address.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  assert(last_java_pc != NULL, "must provide a valid PC");

  adr(scratch, last_java_pc);
  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

// Variant taking a Label as the pc; if the label is unbound the adr is
// recorded for patching once the label is bound.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, scratch);
  }
}

// Call to a destination anywhere in the code cache; uses adrp+add+blr when
// branches may be out of BL range. Clobbers tmp in the far case.
void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    blr(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}

// Jump (no link) to a destination anywhere in the code cache; see far_call.
void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    br(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}

// If sp has grown into the reserved stack zone, call into the runtime to
// enable the zone and throw a delayed StackOverflowError.
void MacroAssembler::reserved_stack_check() {
    // testing if reserved zone needs to be enabled
    Label no_reserved_zone_enabling;

    ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
    cmp(sp, rscratch1);
    br(Assembler::LO, no_reserved_zone_enabling);

    enter();   // LR and FP are live.
    lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
    mov(c_rarg0, rthread);
    blr(rscratch1);
    leave();

    // We have already removed our own frame.
    // throw_delayed_StackOverflowError will think that it's been
    // called by our caller.
    lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
    br(rscratch1);
    should_not_reach_here();

    bind(no_reserved_zone_enabling);
}

// Attempt to bias-lock obj_reg toward the current thread. On success falls
// through (or branches to done); on contention branches to slow_case.
// Returns the code offset of the mark-word load usable as an implicit
// null check, or -1 if swap_reg already contained the mark.
int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(tmp_reg, (u1)markOopDesc::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1, rscratch2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

// If obj_reg is biased-locked, unlocking is a no-op: branch to done.
void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(temp_reg, (u1)markOopDesc::biased_lock_pattern);
  br(Assembler::EQ, done);
}

// Helpers that move a value into the n-th C argument register, skipping
// the move when it is already there.
static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg ) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg ) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg ) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg ) {
    masm->mov(c_rarg3, arg);
  }
}

// Common tail for all call_VM variants: set up the last Java frame, call the
// runtime entry point with the thread as first argument, then check for
// pending exceptions and fetch the oop result if requested.
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
   // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline. If the code cache is small
// trampolines won't be emitted.

address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  // We need a trampoline if branches are far.
  if (far_branches()) {
    bool in_scratch_emit_size = false;
#ifdef COMPILER2
    // We don't want to emit a trampoline if C2 is generating dummy
    // code during its branch shortening phase.
    CompileTask* task = ciEnv::current()->task();
    in_scratch_emit_size =
      (task != NULL && is_c2_compile(task->comp_level()) &&
       Compile::current()->in_scratch_emit_size());
#endif
    if (!in_scratch_emit_size) {
      address stub = emit_trampoline_stub(offset(), entry.target());
      if (stub == NULL) {
        return NULL; // CodeCache is full
      }
    }
  }

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
  if (!far_branches()) {
    bl(entry.target());
  } else {
    bl(pc());
  }
  // just need to return a non-null address
  return pc();
}


// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)

address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  // Max stub size: alignment nop, TrampolineStub.
  address stub = start_a_stub(NativeInstruction::instruction_size
                   + NativeCallTrampolineStub::instruction_size);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call
  // - call
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}

void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
  tst(x, 0xff);
  cset(x, Assembler::NE);
}

// Emit an inline-cache call: load the non-oop cached-klass sentinel into
// rscratch2 and call via trampoline_call with a virtual-call relocation.
address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  // Fill argument registers last-first so earlier args aren't clobbered.
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result,
last_java_sp, entry_point, 3, check_exceptions); 935 } 936 937 938 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) { 939 ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset())); 940 str(zr, Address(java_thread, JavaThread::vm_result_offset())); 941 verify_oop(oop_result, "broken oop in call_VM_base"); 942 } 943 944 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) { 945 ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset())); 946 str(zr, Address(java_thread, JavaThread::vm_result_2_offset())); 947 } 948 949 void MacroAssembler::align(int modulus) { 950 while (offset() % modulus != 0) nop(); 951 } 952 953 // these are no-ops overridden by InterpreterMacroAssembler 954 955 void MacroAssembler::check_and_handle_earlyret(Register java_thread) { } 956 957 void MacroAssembler::check_and_handle_popframe(Register java_thread) { } 958 959 960 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr, 961 Register tmp, 962 int offset) { 963 intptr_t value = *delayed_value_addr; 964 if (value != 0) 965 return RegisterOrConstant(value + offset); 966 967 // load indirectly to solve generation ordering problem 968 ldr(tmp, ExternalAddress((address) delayed_value_addr)); 969 970 if (offset != 0) 971 add(tmp, tmp, offset); 972 973 return RegisterOrConstant(tmp); 974 } 975 976 977 void MacroAssembler:: notify(int type) { 978 if (type == bytecode_start) { 979 // set_last_Java_frame(esp, rfp, (address)NULL); 980 Assembler:: notify(type); 981 // reset_last_Java_frame(true); 982 } 983 else 984 Assembler:: notify(type); 985 } 986 987 // Look up the method for a megamorphic invokeinterface call. 988 // The target method is determined by <intf_klass, itable_index>. 989 // The receiver klass is in recv_klass. 990 // On success, the result will be in method_result, and execution falls through. 991 // On failure, execution transfers to the given label. 
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  // Loop peeled once: the first iteration falls through to the search loop
  // on a miss; subsequent iterations branch back to 'search'.
  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  if (return_method) {
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result,
        form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
  }
}

// Combined fast/slow subtype check; falls through on success, otherwise
// execution stops at the (bound) failure label.
void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  // A NULL label means "fall through"; at most one may be NULL.
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);
  cmp(super_klass, rscratch1); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    subs(zr, super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// scans count pointer sized words at [addr] for occurence of value,
// generic
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  br(EQ, Lexit);  // exits with flags EQ set on a hit
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

// scans count 4 byte words at [addr] for occurence of value,
// generic
// NOTE(review): loads 4-byte words but post-increments addr by wordSize
// (8 bytes), i.e. it scans the low word of each pointer-sized slot —
// confirm against callers if reusing for densely packed 32-bit arrays.
void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
                                 Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldrw(scratch, post(addr, wordSize));
  cmpw(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  BLOCK_COMMENT("check_klass_subtype_slow_path");

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
  assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)

  RegSet pushed_registers;
  if (!IS_A_TEMP(r2))    pushed_registers += r2;
  if (!IS_A_TEMP(r5))    pushed_registers += r5;

  if (super_klass != r0 || UseCompressedOops) {
    if (!IS_A_TEMP(r0))   pushed_registers += r0;
  }

  push(pushed_registers, sp);

  // Get super_klass value into r0 (even if it was in r5 or r2).
  if (super_klass != r0) {
    mov(r0, super_klass);
  }

#ifndef PRODUCT
  // Bump the partial-subtype-check counter (diagnostics only).
  mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
  Address pst_counter_addr(rscratch2);
  ldr(rscratch1, pst_counter_addr);
  add(rscratch1, rscratch1, 1);
  str(rscratch1, pst_counter_addr);
#endif //PRODUCT

  // We will consult the secondary-super array.
  ldr(r5, secondary_supers_addr);
  // Load the array length.
  ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  add(r5, r5, Array<Klass*>::base_offset_in_bytes());

  cmp(sp, zr); // Clear Z flag; SP is never zero
  // Scan R2 words at [R5] for an occurrence of R0.
  // Set NZ/Z based on last compare.
  repne_scan(r5, r0, r2, rscratch1);

  // Unspill the temp. registers:
  pop(pushed_registers, sp);

  br(Assembler::NE, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  str(super_klass, super_cache_addr);

  if (L_success != &L_fallthrough) {
    b(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}


// Emit a runtime check (when VerifyOops) that reg contains a valid oop.
// Preserves all registers it touches by spilling around the stub call.
void MacroAssembler::verify_oop(Register reg, const char* s) {
  if (!VerifyOops) return;

  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s", reg->name(), s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  mov(r0, reg);
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop");
}

// Like verify_oop, but the oop is loaded from a memory address.
void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops) return;

  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s", s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop_addr {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  // addr may contain sp so we will have to adjust it based on the
  // pushes that we just did.
  if (addr.uses(sp)) {
    lea(r0, addr);
    ldr(r0, Address(r0, 4 * wordSize));  // skip the 4 words pushed above
  } else {
    ldr(r0, addr);
  }
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop_addr");
}

// Address of an interpreter expression-stack argument slot.
// Returns an Address either directly off esp (constant slot) or via
// rscratch1 (register slot).
Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
                                         int extra_slot_offset) {
  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
  int stackElementSize = Interpreter::stackElementSize;
  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
#ifdef ASSERT
  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
  assert(offset1 - offset == stackElementSize, "correct arithmetic");
#endif
  if (arg_slot.is_constant()) {
    return Address(esp, arg_slot.as_constant() * stackElementSize
                   + offset);
  } else {
    add(rscratch1, esp, arg_slot.as_register(),
        ext::uxtx, exact_log2(stackElementSize));
    return Address(rscratch1, offset);
  }
}

void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments,
                                       Label *retaddr) {
  call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr);
}

// Core leaf-call sequence: spill rscratch1/rmethod, call through blrt,
// optionally bind retaddr at the return point, then restore.
void MacroAssembler::call_VM_leaf_base1(address entry_point,
                                        int number_of_gp_arguments,
                                        int number_of_fp_arguments,
                                        ret_type type,
                                        Label *retaddr) {
  Label E, L;

  stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));

  // We add 1 to number_of_arguments because the thread in arg0 is
  // not counted
  mov(rscratch1, entry_point);
  blrt(rscratch1, number_of_gp_arguments + 1, number_of_fp_arguments, type);
  if (retaddr)
    bind(*retaddr);

  ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
  maybe_isb();
}

void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  call_VM_leaf_base(entry_point, number_of_arguments);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
                                  Register arg_1, Register arg_2) {
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  pass_arg2(this, arg_2);
  call_VM_leaf_base(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  // Pass args highest-first so an earlier arg can't clobber a later one.
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
  assert(arg_0 != c_rarg3, "smashed arg");
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 4);
}

void MacroAssembler::null_check(Register reg, int offset) {
  if (needs_explicit_null_check(offset)) {
    // provoke OS NULL exception if reg = NULL by
    // accessing M[reg] w/o changing any registers
    // NOTE: this is plenty to provoke a segv
    ldr(zr, Address(reg));
  } else {
    // nothing to do, (later) access of M[reg + offset]
    // will provoke OS NULL exception if reg = NULL
  }
}

// MacroAssembler protected routines needed to implement
// public methods

// Materialize a relocatable address constant into r.
void MacroAssembler::mov(Register r, Address dest) {
  code_section()->relocate(pc(), dest.rspec());
  u_int64_t imm64 = (u_int64_t)dest.target();
  movptr(r, imm64);
}

// Move a constant pointer into r.  In AArch64 mode the virtual address space
// is 48 bits in size or 52 bits. We need three or four instructions to create
// a patchable instruction sequence that can reach anywhere.
// Emit the fixed-length, patchable movz/movk sequence for an address
// constant (3 instructions for 48-bit, 4 when Use64BitLiteralAddresses).
void MacroAssembler::movptr(Register r, uintptr_t imm64) {
#ifndef PRODUCT
  {
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
    block_comment(buffer);
  }
#endif
  if (!Use64BitLiteralAddresses) {
    assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
  }
  movz(r, imm64 & 0xffff);
  imm64 >>= 16;
  movk(r, imm64 & 0xffff, 16);
  imm64 >>= 16;
  movk(r, imm64 & 0xffff, 32);

  if (Use64BitLiteralAddresses) {
    imm64 >>= 16;
    movk(r, imm64 & 0xffff, 48);
  }
}

// Macro to mov replicated immediate to vector register.
//  Vd will get the following values for different arrangements in T
//   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
//   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
//   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
//   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
//   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
//   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
//   T1D/T2D: invalid
void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
  assert(T != T1D && T != T2D, "invalid arrangement");
  if (T == T8B || T == T16B) {
    assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
    movi(Vd, T, imm32 & 0xff, 0);
    return;
  }
  u_int32_t nimm32 = ~imm32;
  if (T == T4H || T == T8H) {
    assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
    imm32 &= 0xffff;
    nimm32 &= 0xffff;
  }
  // Count non-zero bytes of the value and of its complement; whichever
  // needs fewer inserts decides between a movi/orri or mvni/bici sequence.
  u_int32_t x = imm32;
  int movi_cnt = 0;
  int movn_cnt = 0;
  while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
  x = nimm32;
  while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
  if (movn_cnt < movi_cnt) imm32 = nimm32;
  unsigned lsl = 0;
  while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
  if (movn_cnt < movi_cnt)
    mvni(Vd, T, imm32 & 0xff, lsl);
  else
    movi(Vd, T, imm32 & 0xff, lsl);
  imm32 >>= 8; lsl += 8;
  while (imm32) {
    while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
    if (movn_cnt < movi_cnt)
      bici(Vd, T, imm32 & 0xff, lsl);
    else
      orri(Vd, T, imm32 & 0xff, lsl);
    lsl += 8; imm32 >>= 8;
  }
}

// Materialize an arbitrary 64-bit immediate in the fewest instructions:
// a single ORR when it is a valid logical immediate, otherwise a
// MOVZ/MOVN seed plus as few MOVKs as the non-trivial halfwords require.
void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
{
#ifndef PRODUCT
  {
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
    block_comment(buffer);
  }
#endif
  if (operand_valid_for_logical_immediate(false, imm64)) {
    orr(dst, zr, imm64);
  } else {
    // we can use a combination of MOVZ or MOVN with
    // MOVK to build up the constant
    u_int64_t imm_h[4];
    int zero_count = 0;
    int neg_count = 0;
    int i;
    for (i = 0; i < 4; i++) {
      imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
      if (imm_h[i] == 0) {
        zero_count++;
      } else if (imm_h[i] == 0xffffL) {
        neg_count++;
      }
    }
    if (zero_count == 4) {
      // one MOVZ will do
      movz(dst, 0);
    } else if (neg_count == 4) {
      // one MOVN will do
      movn(dst, 0);
    } else if (zero_count == 3) {
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0L) {
          movz(dst, (u_int32_t)imm_h[i], (i << 4));
          break;
        }
      }
    } else if (neg_count == 3) {
      // one MOVN will do
      for (int i = 0; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
          break;
        }
      }
    } else if (zero_count == 2) {
      // one MOVZ and one MOVK will do
      for (i = 0; i < 3; i++) {
        if (imm_h[i] != 0L) {
          movz(dst, (u_int32_t)imm_h[i], (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0L) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else if (neg_count == 2) {
      // one MOVN and one MOVK will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else if (zero_count == 1) {
      // one MOVZ and two MOVKs will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0L) {
          movz(dst, (u_int32_t)imm_h[i], (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0x0L) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else if (neg_count == 1) {
      // one MOVN and two MOVKs will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else {
      // use a MOVZ and 3 MOVKs (makes it easier to debug)
      movz(dst, (u_int32_t)imm_h[0], 0);
      for (i = 1; i < 4; i++) {
        movk(dst, (u_int32_t)imm_h[i], (i << 4));
      }
    }
  }
}

// 32-bit analogue of mov_immediate64: ORRW for logical immediates,
// otherwise MOVZW/MOVNW with at most one MOVKW.
void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
{
#ifndef PRODUCT
  {
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
    block_comment(buffer);
  }
#endif
  if (operand_valid_for_logical_immediate(true, imm32)) {
    orrw(dst, zr, imm32);
  } else {
    // we can use MOVZ, MOVN or two calls to MOVK to build up the
    // constant
    u_int32_t imm_h[2];
    imm_h[0] = imm32 & 0xffff;
    imm_h[1] = ((imm32 >> 16) & 0xffff);
    if (imm_h[0] == 0) {
      movzw(dst, imm_h[1], 16);
    } else if (imm_h[0] == 0xffff) {
      movnw(dst, imm_h[1] ^ 0xffff, 16);
    } else if (imm_h[1] == 0) {
      movzw(dst, imm_h[0], 0);
    } else if (imm_h[1] == 0xffff) {
      movnw(dst, imm_h[0] ^ 0xffff, 0);
    } else {
      // use a MOVZ and MOVK (makes it easier to debug)
      movzw(dst, imm_h[0], 0);
      movkw(dst, imm_h[1], 16);
    }
  }
}

// Form an address from base + offset in Rd.  Rd may or may
// not actually be used: you must use the Address that is returned.
// It is up to you to ensure that the shift provided matches the size
// of your data.
Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
  if (Address::offset_ok_for_immed(byte_offset, shift))
    // It fits; no need for any heroics
    return Address(base, byte_offset);

  // Don't do anything clever with negative or misaligned offsets
  unsigned mask = (1 << shift) - 1;
  if (byte_offset < 0 || byte_offset & mask) {
    mov(Rd, byte_offset);
    add(Rd, base, Rd);
    return Address(Rd);
  }

  // See if we can do this with two 12-bit offsets
  {
    unsigned long word_offset = byte_offset >> shift;
    unsigned long masked_offset = word_offset & 0xfff000;
    if (Address::offset_ok_for_immed(word_offset - masked_offset)
        && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
      add(Rd, base, masked_offset << shift);
      word_offset -= masked_offset;
      return Address(Rd, word_offset << shift);
    }
  }

  // Do it the hard way
  mov(Rd, byte_offset);
  add(Rd, base, Rd);
  return Address(Rd);
}

// Atomically increment the 32-bit counter at counter_addr.
// Uses LSE ldadd when available, else an ldxr/stxr retry loop.
void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
  if (UseLSE) {
    mov(tmp, 1);
    ldadd(Assembler::word, tmp, zr, counter_addr);
    return;
  }
  Label retry_load;
  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
    prfm(Address(counter_addr), PSTL1STRM);
  bind(retry_load);
  // flush and load exclusive from the memory location
  ldxrw(tmp, counter_addr);
  addw(tmp, tmp, 1);
  // if we store+flush with no intervening write tmp wil be zero
  stxrw(tmp2, tmp, counter_addr);
  cbnzw(tmp2, retry_load);
}


int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
                                    bool want_remainder, Register scratch)
{
  // Full implementation of Java idiv and irem.  The function
  // returns the (pc) offset of the div instruction - may be needed
  // for implicit exceptions.
  //
  // constraint : ra/rb =/= scratch
  //         normal case
  //
  // input : ra: dividend
  //         rb: divisor
  //
  // result: either
  //         quotient  (= ra idiv rb)
  //         remainder (= ra irem rb)

  assert(ra != scratch && rb != scratch, "reg cannot be scratch");

  int idivl_offset = offset();
  if (! want_remainder) {
    sdivw(result, ra, rb);
  } else {
    sdivw(scratch, ra, rb);
    // remainder = dividend - quotient * divisor
    Assembler::msubw(result, scratch, rb, ra);
  }

  return idivl_offset;
}

int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
                                    bool want_remainder, Register scratch)
{
  // Full implementation of Java ldiv and lrem.  The function
  // returns the (pc) offset of the div instruction - may be needed
  // for implicit exceptions.
  //
  // constraint : ra/rb =/= scratch
  //         normal case
  //
  // input : ra: dividend
  //         rb: divisor
  //
  // result: either
  //         quotient  (= ra idiv rb)
  //         remainder (= ra irem rb)

  assert(ra != scratch && rb != scratch, "reg cannot be scratch");

  int idivq_offset = offset();
  if (! want_remainder) {
    sdiv(result, ra, rb);
  } else {
    sdiv(scratch, ra, rb);
    // remainder = dividend - quotient * divisor
    Assembler::msub(result, scratch, rb, ra);
  }

  return idivq_offset;
}

// Emit a DMB; if the immediately preceding instruction was also a DMB,
// merge the two by ORing the barrier kinds into the existing instruction.
void MacroAssembler::membar(Membar_mask_bits order_constraint) {
  address prev = pc() - NativeMembar::instruction_size;
  address last = code()->last_insn();
  if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
    NativeMembar *bar = NativeMembar_at(prev);
    // We are merging two memory barrier instructions.  On AArch64 we
    // can do this simply by ORing them together.
    bar->set_kind(bar->get_kind() | order_constraint);
    BLOCK_COMMENT("merged membar");
  } else {
    code()->set_last_insn(pc());
    dmb(Assembler::barrier(order_constraint));
  }
}

// Try to merge this load/store with the previous one into an ldp/stp.
// Returns true if merged; otherwise records this instruction as a
// candidate for merging with the next one (aligned base+offset only).
bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
  if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
    merge_ldst(rt, adr, size_in_bytes, is_store);
    code()->clear_last_insn();
    return true;
  } else {
    assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported.");
    const unsigned mask = size_in_bytes - 1;
    if (adr.getMode() == Address::base_plus_offset &&
        (adr.offset() & mask) == 0) { // only supports base_plus_offset.
      code()->set_last_insn(pc());
    }
    return false;
  }
}

void MacroAssembler::ldr(Register Rx, const Address &adr) {
  // We always try to merge two adjacent loads into one ldp.
  if (!try_merge_ldst(Rx, adr, 8, false)) {
    Assembler::ldr(Rx, adr);
  }
}

void MacroAssembler::ldrw(Register Rw, const Address &adr) {
  // We always try to merge two adjacent loads into one ldp.
  if (!try_merge_ldst(Rw, adr, 4, false)) {
    Assembler::ldrw(Rw, adr);
  }
}

void MacroAssembler::str(Register Rx, const Address &adr) {
  // We always try to merge two adjacent stores into one stp.
  if (!try_merge_ldst(Rx, adr, 8, true)) {
    Assembler::str(Rx, adr);
  }
}

void MacroAssembler::strw(Register Rw, const Address &adr) {
  // We always try to merge two adjacent stores into one stp.
  if (!try_merge_ldst(Rw, adr, 4, true)) {
    Assembler::strw(Rw, adr);
  }
}

// MacroAssembler routines found actually to be needed

// Push a single register on the expression stack (esp), pre-decrementing.
void MacroAssembler::push(Register src)
{
  str(src, Address(pre(esp, -1 * wordSize)));
}

// Pop a single register from the expression stack (esp), post-incrementing.
void MacroAssembler::pop(Register dst)
{
  ldr(dst, Address(post(esp, 1 * wordSize)));
}

// Each of the load_* helpers below returns the code offset of the load
// instruction, which callers may record for implicit-null-check purposes.

// Note: load_unsigned_short used to be called load_unsigned_word.
int MacroAssembler::load_unsigned_short(Register dst, Address src) {
  int off = offset();
  ldrh(dst, src);
  return off;
}

int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
  int off = offset();
  ldrb(dst, src);
  return off;
}

int MacroAssembler::load_signed_short(Register dst, Address src) {
  int off = offset();
  ldrsh(dst, src);
  return off;
}

int MacroAssembler::load_signed_byte(Register dst, Address src) {
  int off = offset();
  ldrsb(dst, src);
  return off;
}

int MacroAssembler::load_signed_short32(Register dst, Address src) {
  int off = offset();
  ldrshw(dst, src);
  return off;
}

int MacroAssembler::load_signed_byte32(Register dst, Address src) {
  int off = offset();
  ldrsbw(dst, src);
  return off;
}

// Load a value of the given byte size from src into dst, sign- or
// zero-extending as requested.  dst2 is unused on AArch64 (it exists
// for 32-bit ports that need a register pair for 8-byte values).
void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
  switch (size_in_bytes) {
  case  8:  ldr(dst, src); break;
  case  4:  ldrw(dst, src); break;
  case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
  case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
  default:  ShouldNotReachHere();
  }
}

// Store the low size_in_bytes bytes of src to dst.  src2 is unused on
// AArch64 (see load_sized_value).
void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
  switch (size_in_bytes) {
  case  8:  str(src, dst); break;
  case  4:  strw(src, dst); break;
  case  2:  strh(src, dst); break;
  case  1:  strb(src, dst); break;
  default:  ShouldNotReachHere();
  }
}

// Subtract a 32-bit immediate from a register.  Values that do not fit
// an add/sub immediate are materialized in rscratch2.
// NOTE(review): this variant uses guarantee() where the others use
// assert() — presumably historical; confirm before unifying.
void MacroAssembler::decrementw(Register reg, int value)
{
  if (value < 0)  { incrementw(reg, -value);      return; }
  if (value == 0) {                               return; }
  if (value < (1 << 12)) { subw(reg, reg, value); return; }
  /* else */ {
    guarantee(reg != rscratch2, "invalid dst for register decrement");
    movw(rscratch2, (unsigned)value);
    subw(reg, reg, rscratch2);
  }
}

// 64-bit variant of decrementw.
void MacroAssembler::decrement(Register reg, int value)
{
  if (value < 0)  { increment(reg, -value);      return; }
  if (value == 0) {                              return; }
  if (value < (1 << 12)) { sub(reg, reg, value); return; }
  /* else */ {
    assert(reg != rscratch2, "invalid dst for register decrement");
    mov(rscratch2, (unsigned long)value);
    sub(reg, reg, rscratch2);
  }
}

// Read-modify-write decrement of a 32-bit memory word.  Clobbers
// rscratch1 (the loaded value) and, for literal addresses, rscratch2.
void MacroAssembler::decrementw(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid dst for address decrement");
  if (dst.getMode() == Address::literal) {
    assert(abs(value) < (1 << 12), "invalid value and address mode combination");
    lea(rscratch2, dst);
    dst = Address(rscratch2);
  }
  ldrw(rscratch1, dst);
  decrementw(rscratch1, value);
  strw(rscratch1, dst);
}

// 64-bit variant of decrementw(Address).
void MacroAssembler::decrement(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid address for decrement");
  if (dst.getMode() == Address::literal) {
    assert(abs(value) < (1 << 12), "invalid value and address mode combination");
    lea(rscratch2, dst);
    dst = Address(rscratch2);
  }
  ldr(rscratch1, dst);
  decrement(rscratch1, value);
  str(rscratch1, dst);
}

// Add a 32-bit immediate to a register; mirror of decrementw.
void MacroAssembler::incrementw(Register reg, int value)
{
  if (value < 0)  { decrementw(reg, -value);      return; }
  if (value == 0) {                               return; }
  if (value < (1 << 12)) { addw(reg, reg, value); return; }
  /* else */ {
    assert(reg != rscratch2, "invalid dst for register increment");
    movw(rscratch2, (unsigned)value);
    addw(reg, reg, rscratch2);
  }
}

// 64-bit variant of incrementw.  movw is sufficient here: value is a
// positive int, so the zero-extended 32-bit move gives the right
// 64-bit operand for add.
void MacroAssembler::increment(Register reg, int value)
{
  if (value < 0)  { decrement(reg, -value);      return; }
  if (value == 0) {                              return; }
  if (value < (1 << 12)) { add(reg, reg, value); return; }
  /* else */ {
    assert(reg != rscratch2, "invalid dst for register increment");
    movw(rscratch2, (unsigned)value);
    add(reg, reg, rscratch2);
  }
}

// Read-modify-write increment of a 32-bit memory word.  Clobbers
// rscratch1 and, for literal addresses, rscratch2.
void MacroAssembler::incrementw(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid dst for address increment");
  if (dst.getMode() == Address::literal) {
    assert(abs(value) < (1 << 12), "invalid value and address mode combination");
    lea(rscratch2, dst);
    dst = Address(rscratch2);
  }
  ldrw(rscratch1, dst);
  incrementw(rscratch1, value);
  strw(rscratch1, dst);
}

// 64-bit variant of incrementw(Address).
void MacroAssembler::increment(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid dst for address increment");
  if (dst.getMode() == Address::literal) {
    assert(abs(value) < (1 << 12), "invalid value and address mode combination");
    lea(rscratch2, dst);
    dst = Address(rscratch2);
  }
  ldr(rscratch1, dst);
  increment(rscratch1, value);
  str(rscratch1, dst);
}


void MacroAssembler::pusha() {
push(0x7fffffff, sp); 2067 } 2068 2069 void MacroAssembler::popa() { 2070 pop(0x7fffffff, sp); 2071 } 2072 2073 // Push lots of registers in the bit set supplied. Don't push sp. 2074 // Return the number of words pushed 2075 int MacroAssembler::push(unsigned int bitset, Register stack) { 2076 int words_pushed = 0; 2077 2078 // Scan bitset to accumulate register pairs 2079 unsigned char regs[32]; 2080 int count = 0; 2081 for (int reg = 0; reg <= 30; reg++) { 2082 if (1 & bitset) 2083 regs[count++] = reg; 2084 bitset >>= 1; 2085 } 2086 regs[count++] = zr->encoding_nocheck(); 2087 count &= ~1; // Only push an even nuber of regs 2088 2089 if (count) { 2090 stp(as_Register(regs[0]), as_Register(regs[1]), 2091 Address(pre(stack, -count * wordSize))); 2092 words_pushed += 2; 2093 } 2094 for (int i = 2; i < count; i += 2) { 2095 stp(as_Register(regs[i]), as_Register(regs[i+1]), 2096 Address(stack, i * wordSize)); 2097 words_pushed += 2; 2098 } 2099 2100 assert(words_pushed == count, "oops, pushed != count"); 2101 2102 return count; 2103 } 2104 2105 int MacroAssembler::pop(unsigned int bitset, Register stack) { 2106 int words_pushed = 0; 2107 2108 // Scan bitset to accumulate register pairs 2109 unsigned char regs[32]; 2110 int count = 0; 2111 for (int reg = 0; reg <= 30; reg++) { 2112 if (1 & bitset) 2113 regs[count++] = reg; 2114 bitset >>= 1; 2115 } 2116 regs[count++] = zr->encoding_nocheck(); 2117 count &= ~1; 2118 2119 for (int i = 2; i < count; i += 2) { 2120 ldp(as_Register(regs[i]), as_Register(regs[i+1]), 2121 Address(stack, i * wordSize)); 2122 words_pushed += 2; 2123 } 2124 if (count) { 2125 ldp(as_Register(regs[0]), as_Register(regs[1]), 2126 Address(post(stack, count * wordSize))); 2127 words_pushed += 2; 2128 } 2129 2130 assert(words_pushed == count, "oops, pushed != count"); 2131 2132 return count; 2133 } 2134 #ifdef ASSERT 2135 void MacroAssembler::verify_heapbase(const char* msg) { 2136 #if 0 2137 assert (UseCompressedOops || UseCompressedClassPointers, 
"should be compressed"); 2138 assert (Universe::heap() != NULL, "java heap should be initialized"); 2139 if (CheckCompressedOops) { 2140 Label ok; 2141 push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1 2142 cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr())); 2143 br(Assembler::EQ, ok); 2144 stop(msg); 2145 bind(ok); 2146 pop(1 << rscratch1->encoding(), sp); 2147 } 2148 #endif 2149 } 2150 #endif 2151 2152 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) { 2153 Label done, not_weak; 2154 cbz(value, done); // Use NULL as-is. 2155 2156 STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u); 2157 tbz(r0, 0, not_weak); // Test for jweak tag. 2158 2159 // Resolve jweak. 2160 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value, 2161 Address(value, -JNIHandles::weak_tag_value), tmp, thread); 2162 verify_oop(value); 2163 b(done); 2164 2165 bind(not_weak); 2166 // Resolve (untagged) jobject. 2167 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread); 2168 verify_oop(value); 2169 bind(done); 2170 } 2171 2172 void MacroAssembler::stop(const char* msg) { 2173 address ip = pc(); 2174 pusha(); 2175 mov(c_rarg0, (address)msg); 2176 mov(c_rarg1, (address)ip); 2177 mov(c_rarg2, sp); 2178 mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 2179 // call(c_rarg3); 2180 blrt(c_rarg3, 3, 0, 1); 2181 hlt(0); 2182 } 2183 2184 void MacroAssembler::warn(const char* msg) { 2185 pusha(); 2186 mov(c_rarg0, (address)msg); 2187 mov(lr, CAST_FROM_FN_PTR(address, warning)); 2188 blrt(lr, 1, 0, MacroAssembler::ret_type_void); 2189 popa(); 2190 } 2191 2192 void MacroAssembler::unimplemented(const char* what) { 2193 const char* buf = NULL; 2194 { 2195 ResourceMark rm; 2196 stringStream ss; 2197 ss.print("unimplemented: %s", what); 2198 buf = code_string(ss.as_string()); 2199 } 2200 stop(buf); 2201 } 2202 2203 // If a constant does not fit in an immediate field, generate some 2204 // 
// number of MOV instructions and then perform the operation.
void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
                                           add_sub_imm_insn insn1,
                                           add_sub_reg_insn insn2) {
  assert(Rd != zr, "Rd = zr and not setting flags?");
  if (operand_valid_for_add_sub_immediate((int)imm)) {
    (this->*insn1)(Rd, Rn, imm);
  } else {
    if (uabs(imm) < (1 << 24)) {
      // Split a 24-bit immediate into two 12-bit halves, each of which
      // is encodable (the high half via the shifted-immediate form).
      (this->*insn1)(Rd, Rn, imm & -(1 << 12));
      (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
    } else {
      // Materialize the constant in Rd, then use the register form.
      assert_different_registers(Rd, Rn);
      mov(Rd, (uint64_t)imm);
      (this->*insn2)(Rd, Rn, Rd, LSL, 0);
    }
  }
}

// Separate version which sets the flags.  Optimisations are more
// restricted because we must set the flags correctly (the two-step
// split above would clobber them).
void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
                                             add_sub_imm_insn insn1,
                                             add_sub_reg_insn insn2) {
  if (operand_valid_for_add_sub_immediate((int)imm)) {
    (this->*insn1)(Rd, Rn, imm);
  } else {
    assert_different_registers(Rd, Rn);
    assert(Rd != zr, "overflow in immediate operand");
    mov(Rd, (uint64_t)imm);
    (this->*insn2)(Rd, Rn, Rd, LSL, 0);
  }
}


// add/sub wrappers that accept either a register or a constant operand.

void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
  if (increment.is_register()) {
    add(Rd, Rn, increment.as_register());
  } else {
    add(Rd, Rn, increment.as_constant());
  }
}

void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
  if (increment.is_register()) {
    addw(Rd, Rn, increment.as_register());
  } else {
    addw(Rd, Rn, increment.as_constant());
  }
}

void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
  if (decrement.is_register()) {
    sub(Rd, Rn, decrement.as_register());
  } else {
    sub(Rd, Rn, decrement.as_constant());
  }
}

void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
  if (decrement.is_register()) {
    subw(Rd, Rn, decrement.as_register());
  } else {
    subw(Rd, Rn, decrement.as_constant());
  }
}

// Reload rheapbase with the compressed-oops base.  Before the heap is
// fully initialized the base must be loaded indirectly.
void MacroAssembler::reinit_heapbase()
{
  if (UseCompressedOops) {
    if (Universe::is_fully_initialized()) {
      mov(rheapbase, Universe::narrow_ptrs_base());
    } else {
      lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
      ldr(rheapbase, Address(rheapbase));
    }
  }
}

// this simulates the behaviour of the x86 cmpxchg instruction using a
// load linked/store conditional pair. we use the acquire/release
// versions of these instructions so that we flush pending writes as
// per Java semantics.

// n.b the x86 version assumes the old value to be compared against is
// in rax and updates rax with the value located in memory if the
// cmpxchg fails. we supply a register for the old value explicitly

// the aarch64 load linked/store conditional instructions do not
// accept an offset. so, unlike x86, we must provide a plain register
// to identify the memory word to be compared/exchanged rather than a
// register+offset Address.

// 64-bit compare-and-exchange of [addr]: if it equals oldv, store newv
// and branch to `succeed`; otherwise return the memory value in oldv
// and branch to `fail` (if given) or fall through.  tmp is clobbered.
void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
                                Label &succeed, Label *fail) {
  // oldv holds comparison value
  // newv holds value to write in exchange
  // addr identifies memory word to compare against/update
  if (UseLSE) {
    mov(tmp, oldv);
    casal(Assembler::xword, oldv, newv, addr);
    cmp(tmp, oldv);
    br(Assembler::EQ, succeed);
    membar(AnyAny);
  } else {
    Label retry_load, nope;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    // flush and load exclusive from the memory location
    // and fail if it is not what we expect
    ldaxr(tmp, addr);
    cmp(tmp, oldv);
    br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
    stlxr(tmp, newv, addr);
    cbzw(tmp, succeed);
    // retry so we only ever return after a load fails to compare
    // ensures we don't return a stale value after a failed write.
    b(retry_load);
    // if the memory word differs we return it in oldv and signal a fail
    bind(nope);
    membar(AnyAny);
    mov(oldv, tmp);
  }
  if (fail)
    b(*fail);
}

// CAS on an object's mark word (always at offset 0 on AArch64).
void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
                                        Label &succeed, Label *fail) {
  assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
  cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
}

// 32-bit variant of cmpxchgptr; see comments there.
void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
                              Label &succeed, Label *fail) {
  // oldv holds comparison value
  // newv holds value to write in exchange
  // addr identifies memory word to compare against/update
  // tmp returns 0/1 for success/failure
  if (UseLSE) {
    mov(tmp, oldv);
    casal(Assembler::word, oldv, newv, addr);
    cmp(tmp, oldv);
    br(Assembler::EQ, succeed);
    membar(AnyAny);
  } else {
    Label retry_load, nope;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    // flush and load exclusive from the memory location
    // and fail if it is not what we expect
    ldaxrw(tmp, addr);
    cmp(tmp, oldv);
    br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
    stlxrw(tmp, newv, addr);
    cbzw(tmp, succeed);
    // retry so we only ever return after a load fails to compare
    // ensures we don't return a stale value after a failed write.
    b(retry_load);
    // if the memory word differs we return it in oldv and signal a fail
    bind(nope);
    membar(AnyAny);
    mov(oldv, tmp);
  }
  if (fail)
    b(*fail);
}

// A generic CAS; success or failure is in the EQ flag.  A weak CAS
// doesn't retry and may fail spuriously.  If the oldval is wanted,
// Pass a register for the result, otherwise pass noreg.

// Clobbers rscratch1
void MacroAssembler::cmpxchg(Register addr, Register expected,
                             Register new_val,
                             enum operand_size size,
                             bool acquire, bool release,
                             bool weak,
                             Register result) {
  if (result == noreg)  result = rscratch1;
  BLOCK_COMMENT("cmpxchg {");
  if (UseLSE) {
    mov(result, expected);
    lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
    compare_eq(result, expected, size);
  } else {
    Label retry_load, done;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    load_exclusive(result, addr, size, acquire);
    compare_eq(result, expected, size);
    br(Assembler::NE, done);
    store_exclusive(rscratch1, new_val, addr, size, release);
    if (weak) {
      cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
    } else {
      cbnzw(rscratch1, retry_load);
    }
    bind(done);
  }
  BLOCK_COMMENT("} cmpxchg");
}

// A generic comparison. Only compares for equality, clobbers rscratch1.
// Sub-word sizes have no cmp instruction, so mask the XOR of the two
// registers instead.
void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) {
  if (size == xword) {
    cmp(rm, rn);
  } else if (size == word) {
    cmpw(rm, rn);
  } else if (size == halfword) {
    eorw(rscratch1, rm, rn);
    ands(zr, rscratch1, 0xffff);
  } else if (size == byte) {
    eorw(rscratch1, rm, rn);
    ands(zr, rscratch1, 0xff);
  } else {
    ShouldNotReachHere();
  }
}


// True iff `a` can safely serve as the result register: it must not
// alias the increment register (if any) or the address register.
static bool different(Register a, RegisterOrConstant b, Register c) {
  if (b.is_constant())
    return a != c;
  else
    return a != b.as_register() && a != c && b.as_register() != c;
}

// Atomic fetch-and-add of `incr` to [addr], optionally returning the
// old value in `prev` (pass a non-valid register to discard it).
// LSE path uses a single AOP; otherwise an LDXR/OP/STXR retry loop.
// Clobbers rscratch1 and rscratch2.  (No comments inside the macro:
// they would break the line continuations.)
#define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
  if (UseLSE) {                                                         \
    prev = prev->is_valid() ? prev : zr;                                \
    if (incr.is_register()) {                                           \
      AOP(sz, incr.as_register(), prev, addr);                          \
    } else {                                                            \
      mov(rscratch2, incr.as_constant());                               \
      AOP(sz, rscratch2, prev, addr);                                   \
    }                                                                   \
    return;                                                             \
  }                                                                     \
  Register result = rscratch2;                                          \
  if (prev->is_valid())                                                 \
    result = different(prev, incr, addr) ? prev : rscratch2;            \
                                                                        \
  Label retry_load;                                                     \
  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
    prfm(Address(addr), PSTL1STRM);                                     \
  bind(retry_load);                                                     \
  LDXR(result, addr);                                                   \
  OP(rscratch1, result, incr);                                          \
  STXR(rscratch2, rscratch1, addr);                                     \
  cbnzw(rscratch2, retry_load);                                         \
  if (prev->is_valid() && prev != result) {                             \
    IOP(prev, rscratch1, incr);                                         \
  }                                                                     \
}

ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)

#undef ATOMIC_OP

// Atomic exchange: store newv to [addr], optionally returning the old
// value in `prev`.  Clobbers rscratch1 and rscratch2.
#define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
  if (UseLSE) {                                                         \
    prev = prev->is_valid() ? prev : zr;                                \
    AOP(sz, newv, prev, addr);                                          \
    return;                                                             \
  }                                                                     \
  Register result = rscratch2;                                          \
  if (prev->is_valid())                                                 \
    result = different(prev, newv, addr) ? prev : rscratch2;            \
                                                                        \
  Label retry_load;                                                     \
  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
    prfm(Address(addr), PSTL1STRM);                                     \
  bind(retry_load);                                                     \
  LDXR(result, addr);                                                   \
  STXR(rscratch1, newv, addr);                                          \
  cbnzw(rscratch1, retry_load);                                         \
  if (prev->is_valid() && prev != result)                               \
    mov(prev, result);                                                  \
}

ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)

#undef ATOMIC_XCHG

#ifndef PRODUCT
extern "C" void findpc(intptr_t x);
#endif

// Runtime target of MacroAssembler::stop(): optionally show a message
// box and dump the register state that pusha() saved on the stack.
void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
{
  // In order to get locks to work, we need to fake a in_VM state
  if (ShowMessageBoxOnError ) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
#ifndef PRODUCT
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
#endif
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      ttyLocker ttyl;
      tty->print_cr(" pc = 0x%016lx", pc);
#ifndef PRODUCT
      tty->cr();
      findpc(pc);
      tty->cr();
#endif
      // NOTE(review): r29 (fp, regs[29]) is never printed, and the
      // "r31" line prints regs[31], which pusha() filled with zr
      // padding — confirm this is intentional.
      tty->print_cr(" r0 = 0x%016lx", regs[0]);
      tty->print_cr(" r1 = 0x%016lx", regs[1]);
      tty->print_cr(" r2 = 0x%016lx", regs[2]);
      tty->print_cr(" r3 = 0x%016lx", regs[3]);
      tty->print_cr(" r4 = 0x%016lx", regs[4]);
      tty->print_cr(" r5 = 0x%016lx", regs[5]);
      tty->print_cr(" r6 = 0x%016lx", regs[6]);
      tty->print_cr(" r7 = 0x%016lx", regs[7]);
      tty->print_cr(" r8 = 0x%016lx", regs[8]);
      tty->print_cr(" r9 = 0x%016lx", regs[9]);
      tty->print_cr("r10 = 0x%016lx", regs[10]);
      tty->print_cr("r11 = 0x%016lx", regs[11]);
      tty->print_cr("r12 = 0x%016lx", regs[12]);
      tty->print_cr("r13 = 0x%016lx", regs[13]);
      tty->print_cr("r14 = 0x%016lx", regs[14]);
      tty->print_cr("r15 = 0x%016lx", regs[15]);
      tty->print_cr("r16 = 0x%016lx", regs[16]);
      tty->print_cr("r17 = 0x%016lx", regs[17]);
      tty->print_cr("r18 = 0x%016lx", regs[18]);
      tty->print_cr("r19 = 0x%016lx", regs[19]);
      tty->print_cr("r20 = 0x%016lx", regs[20]);
      tty->print_cr("r21 = 0x%016lx", regs[21]);
      tty->print_cr("r22 = 0x%016lx", regs[22]);
      tty->print_cr("r23 = 0x%016lx", regs[23]);
      tty->print_cr("r24 = 0x%016lx", regs[24]);
      tty->print_cr("r25 = 0x%016lx", regs[25]);
      tty->print_cr("r26 = 0x%016lx", regs[26]);
      tty->print_cr("r27 = 0x%016lx", regs[27]);
      tty->print_cr("r28 = 0x%016lx", regs[28]);
      tty->print_cr("r30 = 0x%016lx", regs[30]);
      tty->print_cr("r31 = 0x%016lx", regs[31]);
      BREAKPOINT;
    }
    ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
  } else {
    ttyLocker ttyl;
    ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
                    msg);
    assert(false, "DEBUG MESSAGE: %s", msg);
  }
}

#ifdef BUILTIN_SIM
// routine to generate an x86 prolog for a stub function which
// bootstraps into the generated ARM code which directly follows the
// stub
//
// the argument encodes the number of general and fp registers
// passed by the caller and the calling convention (currently just
// the number of general registers and assumes C argument passing)

extern "C" {
  int aarch64_stub_prolog_size();
  void aarch64_stub_prolog();
  void aarch64_prolog();
}

void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type,
                                   address *prolog_ptr)
{
  int calltype = (((ret_type & 0x3) << 8) |
                  ((fp_arg_count & 0xf) << 4) |
                  (gp_arg_count & 0xf));

  // the
  // addresses for the x86 to ARM entry code we need to use
  address start = pc();
  // printf("start = %lx\n", start);
  int byteCount =  aarch64_stub_prolog_size();
  // printf("byteCount = %x\n", byteCount);
  int instructionCount = (byteCount + 3)/ 4;
  // printf("instructionCount = %x\n", instructionCount);
  // Reserve space with nops, then overwrite it with the prolog code.
  for (int i = 0; i < instructionCount; i++) {
    nop();
  }

  memcpy(start, (void*)aarch64_stub_prolog, byteCount);

  // write the address of the setup routine and the call format at the
  // end of the copied code
  u_int64_t *patch_end = (u_int64_t *)(start + byteCount);
  if (prolog_ptr)
    patch_end[-2] = (u_int64_t)prolog_ptr;
  patch_end[-1] = calltype;
}
#endif

// Save all registers that the AArch64 C calling convention allows a
// callee to clobber: r0-r18 (minus the scratch registers) and the
// caller-saved FP registers v0-v7 and v16-v31 (low 64 bits only).
void MacroAssembler::push_call_clobbered_registers() {
  int step = 4 * wordSize;
  push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
  sub(sp, sp, step);
  mov(rscratch1, -step);
  // Push v0-v7, v16-v31.
  for (int i = 31; i>= 4; i -= 4) {
    if (i <= v7->encoding() || i >= v16->encoding())
      st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
          as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
  }
  st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
      as_FloatRegister(3), T1D, Address(sp));
}

// Inverse of push_call_clobbered_registers(); restore in opposite order.
void MacroAssembler::pop_call_clobbered_registers() {
  for (int i = 0; i < 32; i += 4) {
    if (i <= v7->encoding() || i >= v16->encoding())
      ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
          as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
  }

  pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
}

// Save the full CPU state: all integer registers (except lr and sp)
// plus v0-v31; full 128-bit vectors when save_vectors is set,
// otherwise only the low 64 bits.
void MacroAssembler::push_CPU_state(bool save_vectors) {
  int step = (save_vectors ? 8 : 4) * wordSize;
  push(0x3fffffff, sp);         // integer registers except lr & sp
  mov(rscratch1, -step);
  sub(sp, sp, step);
  for (int i = 28; i >= 4; i -= 4) {
    st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
        as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
  }
  st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
}

// Inverse of push_CPU_state().
void MacroAssembler::pop_CPU_state(bool restore_vectors) {
  int step = (restore_vectors ? 8 : 4) * wordSize;
  for (int i = 0; i <= 28; i += 4)
    ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
        as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
  pop(0x3fffffff, sp);          // integer registers except lr & sp
}

/**
 * Helpers for multiply_to_len().
 */
void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
                                     Register src1, Register src2) {
  adds(dest_lo, dest_lo, src1);
  adc(dest_hi, dest_hi, zr);
  adds(dest_lo, dest_lo, src2);
  adc(final_dest_hi, dest_hi, zr);
}

// Generate an address from (r + r1 extend offset).  "size" is the
// size of the operand.  The result may be in rscratch2.
Address MacroAssembler::offsetted_address(Register r, Register r1,
                                          Address::extend ext, int offset, int size) {
  if (offset || (ext.shift() % size != 0)) {
    lea(rscratch2, Address(r, r1, ext));
    return Address(rscratch2, offset);
  } else {
    return Address(r, r1, ext);
  }
}

// Produce an sp-relative Address for a spill slot, emitting adds into
// tmp when the raw offset is out of the load/store immediate range.
Address MacroAssembler::spill_address(int size, int offset, Register tmp)
{
  assert(offset >= 0, "spill to negative address?");
  // Offset reachable ?
  //   Not aligned - 9 bits signed offset
  //   Aligned - 12 bits unsigned offset shifted
  Register base = sp;
  if ((offset & (size-1)) && offset >= (1<<8)) {
    // Misaligned and out of the 9-bit signed range: peel off the low
    // 12 bits into tmp.
    add(tmp, base, offset & ((1<<12)-1));
    base = tmp;
    offset &= -1<<12;
  }

  if (offset >= (1<<12) * size) {
    // Still out of the scaled 12-bit range: peel off the next 12 bits.
    add(tmp, base, offset & (((1<<12)-1)<<12));
    base = tmp;
    offset &= ~(((1<<12)-1)<<12);
  }

  return Address(base, offset);
}

// Checks whether offset is aligned.
// Returns true if it is, else false.
bool MacroAssembler::merge_alignment_check(Register base,
                                           size_t size,
                                           long cur_offset,
                                           long prev_offset) const {
  if (AvoidUnalignedAccesses) {
    if (base == sp) {
      // Checks whether low offset if aligned to pair of registers.
      long pair_mask = size * 2 - 1;
      long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
      return (offset & pair_mask) == 0;
    } else { // If base is not sp, we can't guarantee the access is aligned.
      return false;
    }
  } else {
    long mask = size - 1;
    // Load/store pair instruction only supports element size aligned offset.
    return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
  }
}

// Checks whether current and previous loads/stores can be merged.
// Returns true if it can be merged, else false.
bool MacroAssembler::ldst_can_merge(Register rt,
                                    const Address &adr,
                                    size_t cur_size_in_bytes,
                                    bool is_store) const {
  // The candidate for merging is the instruction emitted immediately
  // before the current pc; it must also be the last one recorded by the
  // code buffer, otherwise something else was emitted in between.
  address prev = pc() - NativeInstruction::instruction_size;
  address last = code()->last_insn();

  if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
    return false;
  }

  if (adr.getMode() != Address::base_plus_offset || prev != last) {
    return false;
  }

  NativeLdSt* prev_ldst = NativeLdSt_at(prev);
  size_t prev_size_in_bytes = prev_ldst->size_in_bytes();

  assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
  assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");

  if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
    return false;
  }

  // ldp/stp take a 7-bit signed immediate scaled by the element size.
  long max_offset = 63 * prev_size_in_bytes;
  long min_offset = -64 * prev_size_in_bytes;

  assert(prev_ldst->is_not_pre_post_index(), "pre-index or post-index is not supported to be merged.");

  // Only same base can be merged.
  if (adr.base() != prev_ldst->base()) {
    return false;
  }

  long cur_offset = adr.offset();
  long prev_offset = prev_ldst->offset();
  // NOTE(review): abs() here takes the int overload while the offsets are
  // long — confirm offsets can never exceed the int range at this point.
  size_t diff = abs(cur_offset - prev_offset);
  if (diff != prev_size_in_bytes) {
    return false;
  }

  // Following cases can not be merged:
  // ldr x2, [x2, #8]
  // ldr x3, [x2, #16]
  // or:
  // ldr x2, [x3, #8]
  // ldr x2, [x3, #16]
  // If t1 and t2 is the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
  if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
    return false;
  }

  long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
  // Offset range must be in ldp/stp instruction's range.
  if (low_offset > max_offset || low_offset < min_offset) {
    return false;
  }

  if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
    return true;
  }

  return false;
}

// Merge current load/store with previous load/store into ldp/stp.
void MacroAssembler::merge_ldst(Register rt,
                                const Address &adr,
                                size_t cur_size_in_bytes,
                                bool is_store) {

  assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store) == true, "cur and prev must be able to be merged.");

  Register rt_low, rt_high;
  address prev = pc() - NativeInstruction::instruction_size;
  NativeLdSt* prev_ldst = NativeLdSt_at(prev);

  long offset;

  // The pair instruction addresses the lower of the two slots; order the
  // target registers accordingly.
  if (adr.offset() < prev_ldst->offset()) {
    offset = adr.offset();
    rt_low = rt;
    rt_high = prev_ldst->target();
  } else {
    offset = prev_ldst->offset();
    rt_low = prev_ldst->target();
    rt_high = rt;
  }

  Address adr_p = Address(prev_ldst->base(), offset);
  // Overwrite previous generated binary.
  code_section()->set_end(prev);

  const int sz = prev_ldst->size_in_bytes();
  assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
  if (!is_store) {
    BLOCK_COMMENT("merged ldr pair");
    if (sz == 8) {
      ldp(rt_low, rt_high, adr_p);
    } else {
      ldpw(rt_low, rt_high, adr_p);
    }
  } else {
    BLOCK_COMMENT("merged str pair");
    if (sz == 8) {
      stp(rt_low, rt_high, adr_p);
    } else {
      stpw(rt_low, rt_high, adr_p);
    }
  }
}

/**
 * Multiply 64 bit by 64 bit first loop.
 */
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
                                           Register y, Register y_idx, Register z,
                                           Register carry, Register product,
                                           Register idx, Register kdx) {
  //
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;
  //

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_one_x);   // only a single (32-bit) x word left

  lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
  ldr(x_xstart, Address(rscratch1));
  ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian

  bind(L_first_loop);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_first_loop_exit);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_one_y);   // only a single (32-bit) y word left
  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(y_idx, Address(rscratch1));
  ror(y_idx, y_idx, 32); // convert big-endian to little-endian
  bind(L_multiply);

  // AArch64 has a multiply-accumulate instruction that we can't use
  // here because it has no way to process carries, so we have to use
  // separate add and adc instructions.  Bah.
  umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
  mul(product, x_xstart, y_idx);
  adds(product, product, carry);
  adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product

  subw(kdx, kdx, 2);
  ror(product, product, 32); // back to big-endian
  str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));

  b(L_first_loop);

  bind(L_one_y);
  ldrw(y_idx, Address(y, 0));
  b(L_multiply);

  bind(L_one_x);
  ldrw(x_xstart, Address(x, 0));
  b(L_first_loop);

  bind(L_first_loop_exit);
}

/**
 * Multiply 128 bit by 128. Unrolled inner loop.
 *
 */
void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
                                             Register carry, Register carry2,
                                             Register idx, Register jdx,
                                             Register yz_idx1, Register yz_idx2,
                                             Register tmp, Register tmp3, Register tmp4,
                                             Register tmp6, Register product_hi) {

  //   jlong carry, x[], y[], z[];
  //   int kdx = ystart+1;
  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
  //     jlong carry2  = (jlong)(tmp3 >>> 64);
  //     huge_128 tmp4 = (y[idx] * product_hi) + z[kdx+idx] + carry2;
  //     carry  = (jlong)(tmp4 >>> 64);
  //     z[kdx+idx+1] = (jlong)tmp3;
  //     z[kdx+idx] = (jlong)tmp4;
  //   }
  //   idx += 2;
  //   if (idx > 0) {
  //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
  //     z[kdx+idx] = (jlong)yz_idx1;
  //     carry  = (jlong)(yz_idx1 >>> 64);
  //   }
  //

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;

  lsrw(jdx, idx, 2);            // jdx = number of 4-word groups

  bind(L_third_loop);

  subsw(jdx, jdx, 1);
  br(Assembler::MI, L_third_loop_exit);
  subw(idx, idx, 4);

  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));

  ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
  lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
  ror(yz_idx2, yz_idx2, 32);

  ldp(rscratch2, rscratch1, Address(tmp6, 0));

  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
  umulh(tmp4, product_hi, yz_idx1);

  ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
  ror(rscratch2, rscratch2, 32);

  mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
  umulh(carry2, product_hi, yz_idx2);

  // propagate sum of both multiplications into carry:tmp4:tmp3
  adds(tmp3, tmp3, carry);
  adc(tmp4, tmp4, zr);
  adds(tmp3, tmp3, rscratch1);
  adcs(tmp4, tmp4, tmp);
  adc(carry, carry2, zr);
  adds(tmp4, tmp4, rscratch2);
  adc(carry, carry, zr);

  ror(tmp3, tmp3, 32); // convert little-endian to big-endian
  ror(tmp4, tmp4, 32);
  stp(tmp4, tmp3, Address(tmp6, 0));

  b(L_third_loop);
  bind (L_third_loop_exit);

  // Handle the 0-3 remaining 32-bit words.
  andw (idx, idx, 0x3);
  cbz(idx, L_post_third_loop_done);

  Label L_check_1;
  subsw(idx, idx, 2);
  br(Assembler::MI, L_check_1);

  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(yz_idx1, Address(rscratch1, 0));
  ror(yz_idx1, yz_idx1, 32);
  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
  umulh(tmp4, product_hi, yz_idx1);
  lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
  ldr(yz_idx2, Address(rscratch1, 0));
  ror(yz_idx2, yz_idx2, 32);

  add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);

  ror(tmp3, tmp3, 32);
  str(tmp3, Address(rscratch1, 0));

  bind (L_check_1);

  // At most one 32-bit word remains.
  andw (idx, idx, 0x1);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_post_third_loop_done);
  ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
  umulh(carry2, tmp4, product_hi);
  ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  // NOTE(review): this call passes 4 arguments while the add2_with_carry
  // definition earlier in this file takes 5 — verify against the header
  // declaration (default argument/overload) that this is intentional.
  add2_with_carry(carry2, tmp3, tmp4, carry);

  strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
  extr(carry, carry2, tmp3, 32);

  bind(L_post_third_loop_done);
}

/**
 * Code for BigInteger::multiplyToLen() intrinsic.
 *
 * r0: x
 * r1: xlen
 * r2: y
 * r3: ylen
 * r4: z
 * r5: zlen
 * r10: tmp1
 * r11: tmp2
 * r12: tmp3
 * r13: tmp4
 * r14: tmp5
 * r15: tmp6
 * r16: tmp7
 *
 */
void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6, Register product_hi) {

  assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product = xlen;
  const Register x_xstart = zlen;  // reuse register

  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;
  //

  movw(idx, ylen);      // idx = ylen;
  movw(kdx, zlen);      // kdx = xlen+ylen;
  mov(carry, zr);       // carry = 0;

  Label L_done;

  movw(xstart, xlen);
  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_done);    // xlen == 0: nothing to do

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);

  Label L_second_loop;
  cbzw(kdx, L_second_loop);

  // Store the final carry (one or two remaining 32-bit slots).
  Label L_carry;
  subw(kdx, kdx, 1);
  cbzw(kdx, L_carry);

  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
  lsr(carry, carry, 32);
  subw(kdx, kdx, 1);

  bind(L_carry);
  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));

  // Second and third (nested) loops.
  //
  // for (int i = xstart-1; i >= 0; i--) { // Second loop
  //   carry = 0;
  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                    (z[k] & LONG_MASK) + carry;
  //     z[k] = (int)product;
  //     carry = product >>> 32;
  //   }
  //   z[i] = (int)carry;
  // }
  //
  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi

  const Register jdx = tmp1;

  bind(L_second_loop);
  mov(carry, zr);                // carry = 0;
  movw(jdx, ylen);               // j = ystart+1

  subsw(xstart, xstart, 1);      // i = xstart-1;
  br(Assembler::MI, L_done);

  // Spill z; the inner loop needs the registers.
  str(z, Address(pre(sp, -4 * wordSize)));

  Label L_last_x;
  lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
  subsw(xstart, xstart, 1);       // i = xstart-1;
  br(Assembler::MI, L_last_x);

  lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
  ldr(product_hi, Address(rscratch1));
  ror(product_hi, product_hi, 32);  // convert big-endian to little-endian

  Label L_third_loop_prologue;
  bind(L_third_loop_prologue);

  // Save loop state around the inner loop call, restore it after.
  str(ylen, Address(sp, wordSize));
  stp(x, xstart, Address(sp, 2 * wordSize));
  multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
                          tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
  ldp(z, ylen, Address(post(sp, 2 * wordSize)));
  ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen

  addw(tmp3, xlen, 1);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  subsw(tmp3, tmp3, 1);
  br(Assembler::MI, L_done);

  lsr(carry, carry, 32);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  b(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  ldrw(product_hi, Address(x, 0));
  b(L_third_loop_prologue);

  bind(L_done);
}

// Code for BigInteger::mulAdd intrinsic
// out     = r0
// in      = r1
// offset  = r2  (already out.length-offset)
// len     = r3
// k       = r4
//
// pseudo code from java implementation:
// carry = 0;
// offset = out.length-offset - 1;
// for (int j=len-1; j >= 0; j--) {
//     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
//     out[offset--] = (int)product;
//     carry = product >>> 32;
// }
// return (int)carry;
void MacroAssembler::mul_add(Register out, Register in, Register offset,
      Register len, Register k) {
    Label LOOP, END;
    // pre-loop
    cmp(len, zr); // cmp, not cbz/cbnz: to use condition twice => less branches
    csel(out, zr, out, Assembler::EQ);    // len == 0: return carry of 0
    br(Assembler::EQ, END);
    add(in, in, len, LSL, 2); // in[j+1] address
    add(offset, out, offset, LSL, 2); // out[offset + 1] address
    mov(out, zr); // used to keep carry now
    BIND(LOOP);
    ldrw(rscratch1, Address(pre(in, -4)));
    madd(rscratch1, rscratch1, k, out);   // in[j]*k + carry
    ldrw(rscratch2, Address(pre(offset, -4)));
    add(rscratch1, rscratch1, rscratch2); // + out[offset]
    strw(rscratch1, Address(offset));
    lsr(out, rscratch1, 32);              // carry = product >>> 32
    subs(len, len, 1);
    br(Assembler::NE, LOOP);
    BIND(END);
}

/**
 * Emits code to update CRC-32 with a byte value according to constants in table
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]val       Register containing the byte to fold into the CRC.
 * @param [in]table     Register containing the table of crc constants.
 *
 * uint32_t crc;
 * val = crc_table[(val ^ crc) & 0xFF];
 * crc = val ^ (crc >> 8);
 *
 */
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  eor(val, val, crc);
  andr(val, val, 0xff);
  ldrw(val, Address(table, val, Address::lsl(2)));
  eor(crc, val, crc, Assembler::LSR, 8);
}

/**
 * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]v         Register containing the 32-bit to fold into the CRC.
 * @param [in]table0    Register containing table 0 of crc constants.
 * @param [in]table1    Register containing table 1 of crc constants.
 * @param [in]table2    Register containing table 2 of crc constants.
 * @param [in]table3    Register containing table 3 of crc constants.
 *
 * uint32_t crc;
 *   v = crc ^ v
 *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
 *
 */
void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
        Register table0, Register table1, Register table2, Register table3,
        bool upper) {
  // When processing the upper half of a 64-bit word, shift it down first;
  // the ?: keeps a single eor for both halves.
  eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
  uxtb(tmp, v);
  ldrw(crc, Address(table3, tmp, Address::lsl(2)));
  ubfx(tmp, v, 8, 8);
  ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
  ubfx(tmp, v, 16, 8);
  ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
  ubfx(tmp, v, 24, 8);
  ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
}

// CRC-32 kernel using the hardware crc32 instructions.  Processes the
// buffer in 64-byte software-pipelined chunks, then 32-, 4- and 1-byte
// tails.  The crc is inverted on entry and exit (mvnw) as required by
// the CRC-32 convention used by java.util.zip.
void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
        Register len, Register tmp0, Register tmp1, Register tmp2,
        Register tmp3) {
    Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
    assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);

    mvnw(crc, crc);

    subs(len, len, 128);
    br(Assembler::GE, CRC_by64_pre);
  BIND(CRC_less64);
    adds(len, len, 128-32);
    br(Assembler::GE, CRC_by32_loop);
  BIND(CRC_less32);
    adds(len, len, 32-4);
    br(Assembler::GE, CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, CRC_by1_loop);
    b(L_exit);

  BIND(CRC_by32_loop);
    ldp(tmp0, tmp1, Address(post(buf, 16)));
    subs(len, len, 32);
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(post(buf, 8)));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(post(buf, 8)));
    crc32x(crc, crc, tmp2);
    crc32x(crc, crc, tmp3);
    br(Assembler::GE, CRC_by32_loop);
    cmn(len, 32);
    br(Assembler::NE, CRC_less32);
    b(L_exit);

  BIND(CRC_by4_loop);
    ldrw(tmp0, Address(post(buf, 4)));
    subs(len, len, 4);
    crc32w(crc, crc, tmp0);
    br(Assembler::GE, CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::LE, L_exit);
  BIND(CRC_by1_loop);
    ldrb(tmp0, Address(post(buf, 1)));
    subs(len, len, 1);
    crc32b(crc, crc, tmp0);
    br(Assembler::GT, CRC_by1_loop);
    b(L_exit);

  BIND(CRC_by64_pre);
    // Prime the pipeline: loads are issued one iteration ahead of the
    // crc32x instructions that consume them.
    sub(buf, buf, 8);
    ldp(tmp0, tmp1, Address(buf, 8));
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 24));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(buf, 32));
    crc32x(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 40));
    crc32x(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 48));
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 56));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(pre(buf, 64)));

    b(CRC_by64_loop);

    align(CodeEntryAlignment);
  BIND(CRC_by64_loop);
    subs(len, len, 64);
    crc32x(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 8));
    crc32x(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 16));
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 24));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(buf, 32));
    crc32x(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 40));
    crc32x(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 48));
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 56));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(pre(buf, 64)));
    br(Assembler::GE, CRC_by64_loop);

    // post-loop
    crc32x(crc, crc, tmp2);
    crc32x(crc, crc, tmp3);

    sub(len, len, 64);
    add(buf, buf, 8);
    cmn(len, 128);
    br(Assembler::NE, CRC_less64);
  BIND(L_exit);
    mvnw(crc, crc);
}

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register that will contain address of CRC table
 * @param tmp   scratch register
 */
void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
        Register table0, Register table1, Register table2, Register table3,
        Register tmp, Register tmp2, Register tmp3) {
  Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
  unsigned long offset;

  // Hardware crc32 beats both the table-driven and Neon paths when available.
  if (UseCRC32) {
      kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2,
table3);
      return;
  }

  mvnw(crc, crc);

  adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
  if (offset) add(table0, table0, offset);
  add(table1, table0, 1*256*sizeof(juint));
  add(table2, table0, 2*256*sizeof(juint));
  add(table3, table0, 3*256*sizeof(juint));

  if (UseNeon) {
      cmp(len, (u1)64);
      br(Assembler::LT, L_by16);
      eor(v16, T16B, v16, v16);

    Label L_fold;

      add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants

      ld1(v0, v1, T2D, post(buf, 32));
      ld1r(v4, T2D, post(tmp, 8));
      ld1r(v5, T2D, post(tmp, 8));
      ld1r(v6, T2D, post(tmp, 8));
      ld1r(v7, T2D, post(tmp, 8));
      mov(v16, T4S, 0, crc);

      eor(v0, T16B, v0, v16);
      sub(len, len, 64);

    BIND(L_fold);
      // Carry-less multiply (pmull/pmull2) folds 32 bytes per iteration;
      // the uzp/ushll/eor sequences recombine the partial products.
      pmull(v22, T8H, v0, v5, T8B);
      pmull(v20, T8H, v0, v7, T8B);
      pmull(v23, T8H, v0, v4, T8B);
      pmull(v21, T8H, v0, v6, T8B);

      pmull2(v18, T8H, v0, v5, T16B);
      pmull2(v16, T8H, v0, v7, T16B);
      pmull2(v19, T8H, v0, v4, T16B);
      pmull2(v17, T8H, v0, v6, T16B);

      uzp1(v24, T8H, v20, v22);
      uzp2(v25, T8H, v20, v22);
      eor(v20, T16B, v24, v25);

      uzp1(v26, T8H, v16, v18);
      uzp2(v27, T8H, v16, v18);
      eor(v16, T16B, v26, v27);

      ushll2(v22, T4S, v20, T8H, 8);
      ushll(v20, T4S, v20, T4H, 8);

      ushll2(v18, T4S, v16, T8H, 8);
      ushll(v16, T4S, v16, T4H, 8);

      eor(v22, T16B, v23, v22);
      eor(v18, T16B, v19, v18);
      eor(v20, T16B, v21, v20);
      eor(v16, T16B, v17, v16);

      uzp1(v17, T2D, v16, v20);
      uzp2(v21, T2D, v16, v20);
      eor(v17, T16B, v17, v21);

      ushll2(v20, T2D, v17, T4S, 16);
      ushll(v16, T2D, v17, T2S, 16);

      eor(v20, T16B, v20, v22);
      eor(v16, T16B, v16, v18);

      uzp1(v17, T2D, v20, v16);
      uzp2(v21, T2D, v20, v16);
      eor(v28, T16B, v17, v21);

      // Same fold for the second 16-byte lane (v1).
      pmull(v22, T8H, v1, v5, T8B);
      pmull(v20, T8H, v1, v7, T8B);
      pmull(v23, T8H, v1, v4, T8B);
      pmull(v21, T8H, v1, v6, T8B);

      pmull2(v18, T8H, v1, v5, T16B);
      pmull2(v16, T8H, v1, v7, T16B);
      pmull2(v19, T8H, v1, v4, T16B);
      pmull2(v17, T8H, v1, v6, T16B);

      ld1(v0, v1, T2D, post(buf, 32));

      uzp1(v24, T8H, v20, v22);
      uzp2(v25, T8H, v20, v22);
      eor(v20, T16B, v24, v25);

      uzp1(v26, T8H, v16, v18);
      uzp2(v27, T8H, v16, v18);
      eor(v16, T16B, v26, v27);

      ushll2(v22, T4S, v20, T8H, 8);
      ushll(v20, T4S, v20, T4H, 8);

      ushll2(v18, T4S, v16, T8H, 8);
      ushll(v16, T4S, v16, T4H, 8);

      eor(v22, T16B, v23, v22);
      eor(v18, T16B, v19, v18);
      eor(v20, T16B, v21, v20);
      eor(v16, T16B, v17, v16);

      uzp1(v17, T2D, v16, v20);
      uzp2(v21, T2D, v16, v20);
      eor(v16, T16B, v17, v21);

      ushll2(v20, T2D, v16, T4S, 16);
      ushll(v16, T2D, v16, T2S, 16);

      eor(v20, T16B, v22, v20);
      eor(v16, T16B, v16, v18);

      uzp1(v17, T2D, v20, v16);
      uzp2(v21, T2D, v20, v16);
      eor(v20, T16B, v17, v21);

      shl(v16, T2D, v28, 1);
      shl(v17, T2D, v20, 1);

      eor(v0, T16B, v0, v16);
      eor(v1, T16B, v1, v17);

      subs(len, len, 32);
      br(Assembler::GE, L_fold);

      // Reduce the 128-bit folded state back to a 32-bit crc, one 64-bit
      // lane at a time, through the lookup tables.
      mov(crc, 0);
      mov(tmp, v0, T1D, 0);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
      mov(tmp, v0, T1D, 1);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
      mov(tmp, v1, T1D, 0);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
      mov(tmp, v1, T1D, 1);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);

      add(len, len, 32);
  }

  BIND(L_by16);
    subs(len, len, 16);
    br(Assembler::GE, L_by16_loop);
    adds(len, len, 16-4);
    br(Assembler::GE, L_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, L_by1_loop);
    b(L_exit);

  BIND(L_by4_loop);
    ldrw(tmp, Address(post(buf, 4)));
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
    subs(len, len, 4);
    br(Assembler::GE, L_by4_loop);
    adds(len, len, 4);
    br(Assembler::LE, L_exit);
  BIND(L_by1_loop);
    subs(len, len, 1);
    ldrb(tmp, Address(post(buf, 1)));
    update_byte_crc32(crc, tmp, table0);
    br(Assembler::GT, L_by1_loop);
    b(L_exit);

    align(CodeEntryAlignment);
  BIND(L_by16_loop);
    subs(len, len, 16);
    ldp(tmp, tmp3, Address(post(buf, 16)));
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
    update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
    br(Assembler::GE, L_by16_loop);
    adds(len, len, 16-4);
    br(Assembler::GE, L_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, L_by1_loop);
  BIND(L_exit);
    mvnw(crc, crc);
}

// CRC-32C kernel using the hardware crc32c instructions.  Same loop
// structure as kernel_crc32_using_crc32, but note there is no mvnw
// pre/post inversion here.
void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
        Register len, Register tmp0, Register tmp1, Register tmp2,
        Register tmp3) {
    Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
    assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);

    subs(len, len, 128);
    br(Assembler::GE, CRC_by64_pre);
  BIND(CRC_less64);
    adds(len, len, 128-32);
    br(Assembler::GE, CRC_by32_loop);
  BIND(CRC_less32);
    adds(len, len, 32-4);
    br(Assembler::GE,
CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, CRC_by1_loop);
    b(L_exit);

  BIND(CRC_by32_loop);
    ldp(tmp0, tmp1, Address(post(buf, 16)));
    subs(len, len, 32);
    crc32cx(crc, crc, tmp0);
    ldr(tmp2, Address(post(buf, 8)));
    crc32cx(crc, crc, tmp1);
    ldr(tmp3, Address(post(buf, 8)));
    crc32cx(crc, crc, tmp2);
    crc32cx(crc, crc, tmp3);
    br(Assembler::GE, CRC_by32_loop);
    cmn(len, 32);
    br(Assembler::NE, CRC_less32);
    b(L_exit);

  BIND(CRC_by4_loop);
    ldrw(tmp0, Address(post(buf, 4)));
    subs(len, len, 4);
    crc32cw(crc, crc, tmp0);
    br(Assembler::GE, CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::LE, L_exit);
  BIND(CRC_by1_loop);
    ldrb(tmp0, Address(post(buf, 1)));
    subs(len, len, 1);
    crc32cb(crc, crc, tmp0);
    br(Assembler::GT, CRC_by1_loop);
    b(L_exit);

  BIND(CRC_by64_pre);
    // Prime the software pipeline: loads run one step ahead of the
    // crc32cx instructions that consume them.
    sub(buf, buf, 8);
    ldp(tmp0, tmp1, Address(buf, 8));
    crc32cx(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 24));
    crc32cx(crc, crc, tmp1);
    ldr(tmp3, Address(buf, 32));
    crc32cx(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 40));
    crc32cx(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 48));
    crc32cx(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 56));
    crc32cx(crc, crc, tmp1);
    ldr(tmp3, Address(pre(buf, 64)));

    b(CRC_by64_loop);

    align(CodeEntryAlignment);
  BIND(CRC_by64_loop);
    subs(len, len, 64);
    crc32cx(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 8));
    crc32cx(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 16));
    crc32cx(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 24));
    crc32cx(crc, crc, tmp1);
    ldr(tmp3, Address(buf, 32));
    crc32cx(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 40));
    crc32cx(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 48));
    crc32cx(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 56));
    crc32cx(crc, crc, tmp1);
    ldr(tmp3, Address(pre(buf, 64)));
    br(Assembler::GE, CRC_by64_loop);

    // post-loop
    crc32cx(crc, crc, tmp2);
    crc32cx(crc, crc, tmp3);

    sub(len, len, 64);
    add(buf, buf, 8);
    cmn(len, 128);
    br(Assembler::NE, CRC_less64);
  BIND(L_exit);
}

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register that will contain address of CRC table
 * @param tmp   scratch register
 */
void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
        Register table0, Register table1, Register table2, Register table3,
        Register tmp, Register tmp2, Register tmp3) {
  kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
}


// Emits a test of *flag_addr and opens a skipped region; the matching
// bind happens in the destructor.
// NOTE(review): the 'value' parameter is not referenced — the emitted
// cbzw always skips when the flag byte is zero.  Verify callers only
// rely on that polarity.
SkipIfEqual::SkipIfEqual(
    MacroAssembler* masm, const bool* flag_addr, bool value) {
  _masm = masm;
  unsigned long offset;
  _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
  _masm->ldrb(rscratch1, Address(rscratch1, offset));
  _masm->cbzw(rscratch1, _label);
}

// Close the skipped region opened by the constructor.
SkipIfEqual::~SkipIfEqual() {
  _masm->bind(_label);
}

// *dst += src, where dst is a word-sized memory location.
// Clobbers rscratch1 and rscratch2.
void MacroAssembler::addptr(const Address &dst, int32_t src) {
  Address adr;
  switch(dst.getMode()) {
  case Address::base_plus_offset:
    // This is the expected mode, although we allow all the other
    // forms below.
    adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
    break;
  default:
    lea(rscratch2, dst);
    adr = Address(rscratch2);
    break;
  }
  ldr(rscratch1, adr);
  add(rscratch1, rscratch1, src);
  str(rscratch1, adr);
}

// Compare src1 against the word at the (adrp-reachable) address src2.
// Clobbers rscratch1.
void MacroAssembler::cmpptr(Register src1, Address src2) {
  unsigned long offset;
  adrp(rscratch1, src2, offset);
  ldr(rscratch1, Address(rscratch1, offset));
  cmp(src1, rscratch1);
}

// Oop equality check, delegated to the GC barrier set so collectors
// that need it can intercept the comparison.
void MacroAssembler::cmpoop(Register obj1, Register obj2) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->obj_equals(this, obj1, obj2);
}

// Load the klass pointer of the object in src into dst, decompressing
// it when compressed class pointers are in use.
void MacroAssembler::load_klass(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
    decode_klass_not_null(dst);
  } else {
    ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
  }
}

// ((OopHandle)result).resolve();
void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
  // OopHandle::resolve is an indirection.
  access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
}

// Load the java mirror of the method's holder class into dst:
// method -> ConstMethod -> ConstantPool -> pool holder Klass -> mirror.
// NOTE(review): the 'method' parameter is not used — the first load is
// hardcoded to rmethod, so callers must have the method in rmethod.
// Confirm whether 'method' should be read instead.
void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
  const int mirror_offset = in_bytes(Klass::java_mirror_offset());
  ldr(dst, Address(rmethod, Method::const_offset()));
  ldr(dst, Address(dst, ConstMethod::constants_offset()));
  ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
  ldr(dst, Address(dst, mirror_offset));
  resolve_oop_handle(dst, tmp);
}

// Compare the klass of oop against trial_klass, using the cheapest
// comparison the compressed-klass encoding allows.  Clobbers tmp.
void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
  if (UseCompressedClassPointers) {
    ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
    if (Universe::narrow_klass_base() == NULL) {
      // Zero base: compare trial_klass against the shifted narrow klass.
      cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift());
      return;
    } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
               && Universe::narrow_klass_shift() == 0) {
      // Only the bottom 32 bits matter
      cmpw(trial_klass, tmp);
      return;
    }
    decode_klass_not_null(tmp);
  } else {
    ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
  }
  cmp(trial_klass, tmp);
}

// dst = src->klass()->prototype_header(), used for biased locking.
void MacroAssembler::load_prototype_header(Register dst, Register src) {
  load_klass(dst, src);
  ldr(dst, Address(dst, Klass::prototype_header_offset()));
}

// Store the klass pointer src into the object dst, compressing it when
// compressed class pointers are in use.  Clobbers src in that case.
void MacroAssembler::store_klass(Register dst, Register src) {
  // FIXME: Should this be a store release?  concurrent gcs assumes
  // klass length is valid if klass field is not null.
3744 if (UseCompressedClassPointers) { 3745 encode_klass_not_null(src); 3746 strw(src, Address(dst, oopDesc::klass_offset_in_bytes())); 3747 } else { 3748 str(src, Address(dst, oopDesc::klass_offset_in_bytes())); 3749 } 3750 } 3751 3752 void MacroAssembler::store_klass_gap(Register dst, Register src) { 3753 if (UseCompressedClassPointers) { 3754 // Store to klass gap in destination 3755 strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes())); 3756 } 3757 } 3758 3759 // Algorithm must match CompressedOops::encode. 3760 void MacroAssembler::encode_heap_oop(Register d, Register s) { 3761 #ifdef ASSERT 3762 verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?"); 3763 #endif 3764 verify_oop(s, "broken oop in encode_heap_oop"); 3765 if (Universe::narrow_oop_base() == NULL) { 3766 if (Universe::narrow_oop_shift() != 0) { 3767 assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3768 lsr(d, s, LogMinObjAlignmentInBytes); 3769 } else { 3770 mov(d, s); 3771 } 3772 } else { 3773 subs(d, s, rheapbase); 3774 csel(d, d, zr, Assembler::HS); 3775 lsr(d, d, LogMinObjAlignmentInBytes); 3776 3777 /* Old algorithm: is this any worse? 
3778 Label nonnull; 3779 cbnz(r, nonnull); 3780 sub(r, r, rheapbase); 3781 bind(nonnull); 3782 lsr(r, r, LogMinObjAlignmentInBytes); 3783 */ 3784 } 3785 } 3786 3787 void MacroAssembler::encode_heap_oop_not_null(Register r) { 3788 #ifdef ASSERT 3789 verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?"); 3790 if (CheckCompressedOops) { 3791 Label ok; 3792 cbnz(r, ok); 3793 stop("null oop passed to encode_heap_oop_not_null"); 3794 bind(ok); 3795 } 3796 #endif 3797 verify_oop(r, "broken oop in encode_heap_oop_not_null"); 3798 if (Universe::narrow_oop_base() != NULL) { 3799 sub(r, r, rheapbase); 3800 } 3801 if (Universe::narrow_oop_shift() != 0) { 3802 assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3803 lsr(r, r, LogMinObjAlignmentInBytes); 3804 } 3805 } 3806 3807 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) { 3808 #ifdef ASSERT 3809 verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?"); 3810 if (CheckCompressedOops) { 3811 Label ok; 3812 cbnz(src, ok); 3813 stop("null oop passed to encode_heap_oop_not_null2"); 3814 bind(ok); 3815 } 3816 #endif 3817 verify_oop(src, "broken oop in encode_heap_oop_not_null2"); 3818 3819 Register data = src; 3820 if (Universe::narrow_oop_base() != NULL) { 3821 sub(dst, src, rheapbase); 3822 data = dst; 3823 } 3824 if (Universe::narrow_oop_shift() != 0) { 3825 assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3826 lsr(dst, data, LogMinObjAlignmentInBytes); 3827 data = dst; 3828 } 3829 if (data == src) 3830 mov(dst, src); 3831 } 3832 3833 void MacroAssembler::decode_heap_oop(Register d, Register s) { 3834 #ifdef ASSERT 3835 verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?"); 3836 #endif 3837 if (Universe::narrow_oop_base() == NULL) { 3838 if (Universe::narrow_oop_shift() != 0 || d != s) { 3839 lsl(d, s, Universe::narrow_oop_shift()); 3840 } 3841 } else { 
3842 Label done; 3843 if (d != s) 3844 mov(d, s); 3845 cbz(s, done); 3846 add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes); 3847 bind(done); 3848 } 3849 verify_oop(d, "broken oop in decode_heap_oop"); 3850 } 3851 3852 void MacroAssembler::decode_heap_oop_not_null(Register r) { 3853 assert (UseCompressedOops, "should only be used for compressed headers"); 3854 assert (Universe::heap() != NULL, "java heap should be initialized"); 3855 // Cannot assert, unverified entry point counts instructions (see .ad file) 3856 // vtableStubs also counts instructions in pd_code_size_limit. 3857 // Also do not verify_oop as this is called by verify_oop. 3858 if (Universe::narrow_oop_shift() != 0) { 3859 assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3860 if (Universe::narrow_oop_base() != NULL) { 3861 add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes); 3862 } else { 3863 add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes); 3864 } 3865 } else { 3866 assert (Universe::narrow_oop_base() == NULL, "sanity"); 3867 } 3868 } 3869 3870 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) { 3871 assert (UseCompressedOops, "should only be used for compressed headers"); 3872 assert (Universe::heap() != NULL, "java heap should be initialized"); 3873 // Cannot assert, unverified entry point counts instructions (see .ad file) 3874 // vtableStubs also counts instructions in pd_code_size_limit. 3875 // Also do not verify_oop as this is called by verify_oop. 
3876 if (Universe::narrow_oop_shift() != 0) { 3877 assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3878 if (Universe::narrow_oop_base() != NULL) { 3879 add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes); 3880 } else { 3881 add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes); 3882 } 3883 } else { 3884 assert (Universe::narrow_oop_base() == NULL, "sanity"); 3885 if (dst != src) { 3886 mov(dst, src); 3887 } 3888 } 3889 } 3890 3891 void MacroAssembler::encode_klass_not_null(Register dst, Register src) { 3892 if (Universe::narrow_klass_base() == NULL) { 3893 if (Universe::narrow_klass_shift() != 0) { 3894 assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); 3895 lsr(dst, src, LogKlassAlignmentInBytes); 3896 } else { 3897 if (dst != src) mov(dst, src); 3898 } 3899 return; 3900 } 3901 3902 if (use_XOR_for_compressed_class_base) { 3903 if (Universe::narrow_klass_shift() != 0) { 3904 eor(dst, src, (uint64_t)Universe::narrow_klass_base()); 3905 lsr(dst, dst, LogKlassAlignmentInBytes); 3906 } else { 3907 eor(dst, src, (uint64_t)Universe::narrow_klass_base()); 3908 } 3909 return; 3910 } 3911 3912 if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0 3913 && Universe::narrow_klass_shift() == 0) { 3914 movw(dst, src); 3915 return; 3916 } 3917 3918 #ifdef ASSERT 3919 verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?"); 3920 #endif 3921 3922 Register rbase = dst; 3923 if (dst == src) rbase = rheapbase; 3924 mov(rbase, (uint64_t)Universe::narrow_klass_base()); 3925 sub(dst, src, rbase); 3926 if (Universe::narrow_klass_shift() != 0) { 3927 assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); 3928 lsr(dst, dst, LogKlassAlignmentInBytes); 3929 } 3930 if (dst == src) reinit_heapbase(); 3931 } 3932 3933 void MacroAssembler::encode_klass_not_null(Register r) { 3934 encode_klass_not_null(r, r); 3935 } 3936 3937 void 
MacroAssembler::decode_klass_not_null(Register dst, Register src) { 3938 Register rbase = dst; 3939 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 3940 3941 if (Universe::narrow_klass_base() == NULL) { 3942 if (Universe::narrow_klass_shift() != 0) { 3943 assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); 3944 lsl(dst, src, LogKlassAlignmentInBytes); 3945 } else { 3946 if (dst != src) mov(dst, src); 3947 } 3948 return; 3949 } 3950 3951 if (use_XOR_for_compressed_class_base) { 3952 if (Universe::narrow_klass_shift() != 0) { 3953 lsl(dst, src, LogKlassAlignmentInBytes); 3954 eor(dst, dst, (uint64_t)Universe::narrow_klass_base()); 3955 } else { 3956 eor(dst, src, (uint64_t)Universe::narrow_klass_base()); 3957 } 3958 return; 3959 } 3960 3961 if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0 3962 && Universe::narrow_klass_shift() == 0) { 3963 if (dst != src) 3964 movw(dst, src); 3965 movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32); 3966 return; 3967 } 3968 3969 // Cannot assert, unverified entry point counts instructions (see .ad file) 3970 // vtableStubs also counts instructions in pd_code_size_limit. 3971 // Also do not verify_oop as this is called by verify_oop. 
3972 if (dst == src) rbase = rheapbase; 3973 mov(rbase, (uint64_t)Universe::narrow_klass_base()); 3974 if (Universe::narrow_klass_shift() != 0) { 3975 assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); 3976 add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes); 3977 } else { 3978 add(dst, rbase, src); 3979 } 3980 if (dst == src) reinit_heapbase(); 3981 } 3982 3983 void MacroAssembler::decode_klass_not_null(Register r) { 3984 decode_klass_not_null(r, r); 3985 } 3986 3987 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) { 3988 #ifdef ASSERT 3989 { 3990 ThreadInVMfromUnknown tiv; 3991 assert (UseCompressedOops, "should only be used for compressed oops"); 3992 assert (Universe::heap() != NULL, "java heap should be initialized"); 3993 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 3994 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop"); 3995 } 3996 #endif 3997 int oop_index = oop_recorder()->find_index(obj); 3998 InstructionMark im(this); 3999 RelocationHolder rspec = oop_Relocation::spec(oop_index); 4000 code_section()->relocate(inst_mark(), rspec); 4001 movz(dst, 0xDEAD, 16); 4002 movk(dst, 0xBEEF); 4003 } 4004 4005 void MacroAssembler::set_narrow_klass(Register dst, Klass* k) { 4006 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 4007 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 4008 int index = oop_recorder()->find_index(k); 4009 assert(! 
Universe::heap()->is_in_reserved(k), "should not be an oop"); 4010 4011 InstructionMark im(this); 4012 RelocationHolder rspec = metadata_Relocation::spec(index); 4013 code_section()->relocate(inst_mark(), rspec); 4014 narrowKlass nk = Klass::encode_klass(k); 4015 movz(dst, (nk >> 16), 16); 4016 movk(dst, nk & 0xffff); 4017 } 4018 4019 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, 4020 Register dst, Address src, 4021 Register tmp1, Register thread_tmp) { 4022 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 4023 decorators = AccessInternal::decorator_fixup(decorators); 4024 bool as_raw = (decorators & AS_RAW) != 0; 4025 if (as_raw) { 4026 bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp); 4027 } else { 4028 bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp); 4029 } 4030 } 4031 4032 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, 4033 Address dst, Register src, 4034 Register tmp1, Register thread_tmp) { 4035 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 4036 decorators = AccessInternal::decorator_fixup(decorators); 4037 bool as_raw = (decorators & AS_RAW) != 0; 4038 if (as_raw) { 4039 bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp); 4040 } else { 4041 bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp); 4042 } 4043 } 4044 4045 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) { 4046 // Use stronger ACCESS_WRITE|ACCESS_READ by default. 
4047 if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) { 4048 decorators |= ACCESS_READ | ACCESS_WRITE; 4049 } 4050 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 4051 return bs->resolve(this, decorators, obj); 4052 } 4053 4054 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1, 4055 Register thread_tmp, DecoratorSet decorators) { 4056 access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp); 4057 } 4058 4059 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1, 4060 Register thread_tmp, DecoratorSet decorators) { 4061 access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp); 4062 } 4063 4064 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1, 4065 Register thread_tmp, DecoratorSet decorators) { 4066 access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp); 4067 } 4068 4069 // Used for storing NULLs. 4070 void MacroAssembler::store_heap_oop_null(Address dst) { 4071 access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg); 4072 } 4073 4074 Address MacroAssembler::allocate_metadata_address(Metadata* obj) { 4075 assert(oop_recorder() != NULL, "this assembler needs a Recorder"); 4076 int index = oop_recorder()->allocate_metadata_index(obj); 4077 RelocationHolder rspec = metadata_Relocation::spec(index); 4078 return Address((address)obj, rspec); 4079 } 4080 4081 // Move an oop into a register. immediate is true if we want 4082 // immediate instrcutions, i.e. we are not going to patch this 4083 // instruction while the code is being executed by another thread. In 4084 // that case we can use move immediates rather than the constant pool. 
4085 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) { 4086 int oop_index; 4087 if (obj == NULL) { 4088 oop_index = oop_recorder()->allocate_oop_index(obj); 4089 } else { 4090 #ifdef ASSERT 4091 { 4092 ThreadInVMfromUnknown tiv; 4093 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop"); 4094 } 4095 #endif 4096 oop_index = oop_recorder()->find_index(obj); 4097 } 4098 RelocationHolder rspec = oop_Relocation::spec(oop_index); 4099 if (! immediate) { 4100 address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address 4101 ldr_constant(dst, Address(dummy, rspec)); 4102 } else 4103 mov(dst, Address((address)obj, rspec)); 4104 } 4105 4106 // Move a metadata address into a register. 4107 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) { 4108 int oop_index; 4109 if (obj == NULL) { 4110 oop_index = oop_recorder()->allocate_metadata_index(obj); 4111 } else { 4112 oop_index = oop_recorder()->find_index(obj); 4113 } 4114 RelocationHolder rspec = metadata_Relocation::spec(oop_index); 4115 mov(dst, Address((address)obj, rspec)); 4116 } 4117 4118 Address MacroAssembler::constant_oop_address(jobject obj) { 4119 #ifdef ASSERT 4120 { 4121 ThreadInVMfromUnknown tiv; 4122 assert(oop_recorder() != NULL, "this assembler needs an OopRecorder"); 4123 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop"); 4124 } 4125 #endif 4126 int oop_index = oop_recorder()->find_index(obj); 4127 return Address((address)obj, oop_Relocation::spec(oop_index)); 4128 } 4129 4130 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes. 
// TLAB allocation; delegated to the active GC's barrier-set assembler.
void MacroAssembler::tlab_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Register t2,
                                   Label& slow_case) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
}

// Defines obj, preserves var_size_in_bytes
// Eden allocation; delegated to the active GC's barrier-set assembler.
void MacroAssembler::eden_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Label& slow_case) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
}

// Zero words; len is in bytes
// Destroys all registers except addr
// len must be a nonzero multiple of wordSize
void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
  assert_different_registers(addr, len, t1, rscratch1, rscratch2);

#ifdef ASSERT
  { Label L;
    tst(len, BytesPerWord - 1);
    br(Assembler::EQ, L);
    stop("len is not a multiple of BytesPerWord");
    bind(L);
  }
#endif

#ifndef PRODUCT
  block_comment("zero memory");
#endif

  Label loop;
  Label entry;

  // Algorithm (Duff's-device-style unrolled zeroing loop):
  //
  //  scratch1 = cnt & 7;
  //  cnt -= scratch1;
  //  p += scratch1;
  //  switch (scratch1) {
  //    do {
  //      cnt -= 8;
  //        p[-8] = 0;
  //      case 7:
  //        p[-7] = 0;
  //      case 6:
  //        p[-6] = 0;
  //        // ...
  //      case 1:
  //        p[-1] = 0;
  //      case 0:
  //        p += 8;
  //    } while (cnt);
  //  }

  const int unroll = 8; // Number of str(zr) instructions we'll unroll

  lsr(len, len, LogBytesPerWord);        // convert byte count to word count
  andr(rscratch1, len, unroll - 1);      // tmp1 = cnt % unroll
  sub(len, len, rscratch1);              // cnt -= unroll
  // t1 always points to the end of the region we're about to zero
  add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
  // Each str below is one 4-byte instruction: branch (cnt % unroll)
  // instructions back into the unrolled block so the first pass zeroes
  // exactly the leading remainder words.
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
  br(rscratch2);
  bind(loop);
  sub(len, len, unroll);
  for (int i = -unroll; i < 0; i++)
    Assembler::str(zr, Address(t1, i * wordSize));
  bind(entry);
  add(t1, t1, unroll * wordSize);
  cbnz(len, loop);
}

// Debug-only sanity check of the current thread's TLAB invariants
// (start <= top <= end).  Emits nothing in product builds or when
// UseTLAB/VerifyOops are off.  Preserves rscratch1/2 across the check.
void MacroAssembler::verify_tlab() {
#ifdef ASSERT
  if (UseTLAB && VerifyOops) {
    Label next, ok;

    stp(rscratch2, rscratch1, Address(pre(sp, -16)));

    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
    cmp(rscratch2, rscratch1);
    br(Assembler::HS, next);
    STOP("assert(top >= start)");
    should_not_reach_here();

    bind(next);
    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
    cmp(rscratch2, rscratch1);
    br(Assembler::HS, ok);
    STOP("assert(top <= end)");
    should_not_reach_here();

    bind(ok);
    ldp(rscratch2, rscratch1, Address(post(sp, 16)));
  }
#endif
}

// Writes to stack successive pages until offset reached to check for
// stack overflow + shadow pages.  This clobbers tmp.
// Touch ('bang') each stack page from sp down through size-plus-shadow-zone
// bytes so a stack overflow is detected eagerly.  Clobbers tmp and size.
void MacroAssembler::bang_stack_size(Register size, Register tmp) {
  assert_different_registers(tmp, size, rscratch1);
  mov(tmp, sp);
  // Bang stack for total size given plus shadow page size.
  // Bang one page at a time because large size can bang beyond yellow and
  // red zones.
  Label loop;
  mov(rscratch1, os::vm_page_size());
  bind(loop);
  lea(tmp, Address(tmp, -os::vm_page_size()));
  subsw(size, size, rscratch1);
  str(size, Address(tmp));
  br(Assembler::GT, loop);

  // Bang down shadow pages too.
  // At this point, (tmp-0) is the last address touched, so don't
  // touch it again.  (It was touched as (tmp-pagesize) but then tmp
  // was post-decremented.)  Skip this address by starting at i=1, and
  // touch a few more pages below.  N.B.  It is important to touch all
  // the way down to and including i=StackShadowPages.
  for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
    // this could be any sized move but it can serve as a debugging crumb,
    // so the bigger the better.
    lea(tmp, Address(tmp, -os::vm_page_size()));
    str(size, Address(tmp));
  }
}

// Move the address of the polling page into dest.
void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    // Per-thread polling: the page address lives in the thread structure.
    ldr(dest, Address(rthread, Thread::polling_page_offset()));
  } else {
    unsigned long off;
    adrp(dest, Address(page, rtype), off);
    assert(off == 0, "polling page must be page aligned");
  }
}

// Move the address of the polling page into r, then read the polling
// page.
address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
  get_polling_page(r, page, rtype);
  return read_polling_page(r, rtype);
}

// Read the polling page.  The address of the polling page must
// already be in r.  Returns the address of the emitted load so callers
// can associate it with a safepoint.
address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
  InstructionMark im(this);
  code_section()->relocate(inst_mark(), rtype);
  // Load to zr: the value is discarded; only the access (and a possible
  // fault on a protected page) matters.
  ldrw(zr, Address(r, 0));
  return inst_mark();
}

// Emit an ADRP to the literal address in dest and return the residual
// low 12 bits in byte_offset (to be added by the following instruction).
void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
  // NOTE(review): rtype is computed but not read in this body — confirm
  // whether it is intentionally unused.
  relocInfo::relocType rtype = dest.rspec().reloc()->type();
  unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
  unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
  unsigned long dest_page = (unsigned long)dest.target() >> 12;
  long offset_low = dest_page - low_page;
  long offset_high = dest_page - high_page;

  assert(is_valid_AArch64_address(dest.target()), "bad address");
  assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");

  InstructionMark im(this);
  code_section()->relocate(inst_mark(), dest.rspec());
  // 8143067: Ensure that the adrp can reach the dest from anywhere within
  // the code cache so that if it is relocated we know it will still reach
  if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
    // Target is within ADRP's +/-4GB page range from the whole code cache.
    _adrp(reg1, dest.target());
  } else {
    // Out of range: synthesize the low 32 bits with ADRP (keeping the
    // current pc's bits 32-47 so the instruction encodes), then patch the
    // high 32 bits with MOVK.
    unsigned long target = (unsigned long)dest.target();
    unsigned long adrp_target
      = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);

    _adrp(reg1, (address)adrp_target);
    movk(reg1, target >> 32, 32);
  }
  byte_offset = (unsigned long)dest.target() & 0xfff;
}

// Load the card table's byte_map_base into reg (used by card-marking
// barriers).
void MacroAssembler::load_byte_map_base(Register reg) {
  CardTable::CardValue* byte_map_base =
    ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();

  if (is_valid_AArch64_address((address)byte_map_base)) {
    // Strictly speaking the byte_map_base isn't an address at all,
    // and it might even be negative.
    unsigned long offset;
    adrp(reg, ExternalAddress((address)byte_map_base), offset);
    // We expect offset to be zero with most collectors.
    if (offset != 0) {
      add(reg, reg, offset);
    }
  } else {
    mov(reg, (uint64_t)byte_map_base);
  }
}

// Emit a stack frame prologue of 'framesize' bytes, saving rfp/lr.
// Three encodings depending on whether framesize fits the signed-9-bit
// and 12-bit immediate forms of STP/SUB.
void MacroAssembler::build_frame(int framesize) {
  assert(framesize > 0, "framesize must be > 0");
  if (framesize < ((1 << 9) + 2 * wordSize)) {
    sub(sp, sp, framesize);
    stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
    if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
  } else {
    stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
    if (PreserveFramePointer) mov(rfp, sp);
    if (framesize < ((1 << 12) + 2 * wordSize))
      sub(sp, sp, framesize - 2 * wordSize);
    else {
      mov(rscratch1, framesize - 2 * wordSize);
      sub(sp, sp, rscratch1);
    }
  }
}

// Epilogue matching build_frame: restore rfp/lr and pop the frame.
void MacroAssembler::remove_frame(int framesize) {
  assert(framesize > 0, "framesize must be > 0");
  if (framesize < ((1 << 9) + 2 * wordSize)) {
    ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
    add(sp, sp, framesize);
  } else {
    if (framesize < ((1 << 12) + 2 * wordSize))
      add(sp, sp, framesize - 2 * wordSize);
    else {
      mov(rscratch1, framesize - 2 * wordSize);
      add(sp, sp, rscratch1);
    }
    ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
  }
}

#ifdef COMPILER2
// Pointer-to-member type for the single-character load instructions
// (ldrb/ldrh/ldrw/ldr) selected by string encoding below.
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

// Search for str1 in str2 and return index or -1
void MacroAssembler::string_indexof(Register str2, Register str1,
                                    Register cnt2, Register cnt1,
                                    Register tmp1, Register tmp2,
                                    Register tmp3, Register tmp4,
                                    Register tmp5, Register tmp6,
                                    int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
4391 4392 Register ch1 = rscratch1; 4393 Register ch2 = rscratch2; 4394 Register cnt1tmp = tmp1; 4395 Register cnt2tmp = tmp2; 4396 Register cnt1_neg = cnt1; 4397 Register cnt2_neg = cnt2; 4398 Register result_tmp = tmp4; 4399 4400 bool isL = ae == StrIntrinsicNode::LL; 4401 4402 bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL; 4403 bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU; 4404 int str1_chr_shift = str1_isL ? 0:1; 4405 int str2_chr_shift = str2_isL ? 0:1; 4406 int str1_chr_size = str1_isL ? 1:2; 4407 int str2_chr_size = str2_isL ? 1:2; 4408 chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb : 4409 (chr_insn)&MacroAssembler::ldrh; 4410 chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb : 4411 (chr_insn)&MacroAssembler::ldrh; 4412 chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw; 4413 chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr; 4414 4415 // Note, inline_string_indexOf() generates checks: 4416 // if (substr.count > string.count) return -1; 4417 // if (substr.count == 0) return 0; 4418 4419 // We have two strings, a source string in str2, cnt2 and a pattern string 4420 // in str1, cnt1. Find the 1st occurence of pattern in source or return -1. 4421 4422 // For larger pattern and source we use a simplified Boyer Moore algorithm. 4423 // With a small pattern and source we use linear scan. 4424 4425 if (icnt1 == -1) { 4426 sub(result_tmp, cnt2, cnt1); 4427 cmp(cnt1, (u1)8); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256 4428 br(LT, LINEARSEARCH); 4429 dup(v0, T16B, cnt1); // done in separate FPU pipeline. 
Almost no penalty 4430 subs(zr, cnt1, 256); 4431 lsr(tmp1, cnt2, 2); 4432 ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM 4433 br(GE, LINEARSTUB); 4434 } 4435 4436 // The Boyer Moore alogorithm is based on the description here:- 4437 // 4438 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm 4439 // 4440 // This describes and algorithm with 2 shift rules. The 'Bad Character' rule 4441 // and the 'Good Suffix' rule. 4442 // 4443 // These rules are essentially heuristics for how far we can shift the 4444 // pattern along the search string. 4445 // 4446 // The implementation here uses the 'Bad Character' rule only because of the 4447 // complexity of initialisation for the 'Good Suffix' rule. 4448 // 4449 // This is also known as the Boyer-Moore-Horspool algorithm:- 4450 // 4451 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm 4452 // 4453 // This particular implementation has few java-specific optimizations. 4454 // 4455 // #define ASIZE 256 4456 // 4457 // int bm(unsigned char *x, int m, unsigned char *y, int n) { 4458 // int i, j; 4459 // unsigned c; 4460 // unsigned char bc[ASIZE]; 4461 // 4462 // /* Preprocessing */ 4463 // for (i = 0; i < ASIZE; ++i) 4464 // bc[i] = m; 4465 // for (i = 0; i < m - 1; ) { 4466 // c = x[i]; 4467 // ++i; 4468 // // c < 256 for Latin1 string, so, no need for branch 4469 // #ifdef PATTERN_STRING_IS_LATIN1 4470 // bc[c] = m - i; 4471 // #else 4472 // if (c < ASIZE) bc[c] = m - i; 4473 // #endif 4474 // } 4475 // 4476 // /* Searching */ 4477 // j = 0; 4478 // while (j <= n - m) { 4479 // c = y[i+j]; 4480 // if (x[m-1] == c) 4481 // for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i); 4482 // if (i < 0) return j; 4483 // // c < 256 for Latin1 string, so, no need for branch 4484 // #ifdef SOURCE_STRING_IS_LATIN1 4485 // // LL case: (c< 256) always true. 
Remove branch 4486 // j += bc[y[j+m-1]]; 4487 // #endif 4488 // #ifndef PATTERN_STRING_IS_UTF 4489 // // UU case: need if (c<ASIZE) check. Skip 1 character if not. 4490 // if (c < ASIZE) 4491 // j += bc[y[j+m-1]]; 4492 // else 4493 // j += 1 4494 // #endif 4495 // #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF 4496 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not. 4497 // if (c < ASIZE) 4498 // j += bc[y[j+m-1]]; 4499 // else 4500 // j += m 4501 // #endif 4502 // } 4503 // } 4504 4505 if (icnt1 == -1) { 4506 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH, 4507 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP; 4508 Register cnt1end = tmp2; 4509 Register str2end = cnt2; 4510 Register skipch = tmp2; 4511 4512 // str1 length is >=8, so, we can read at least 1 register for cases when 4513 // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for 4514 // UL case. We'll re-read last character in inner pre-loop code to have 4515 // single outer pre-loop load 4516 const int firstStep = isL ? 
7 : 3; 4517 4518 const int ASIZE = 256; 4519 const int STORED_BYTES = 32; // amount of bytes stored per instruction 4520 sub(sp, sp, ASIZE); 4521 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations 4522 mov(ch1, sp); 4523 BIND(BM_INIT_LOOP); 4524 stpq(v0, v0, Address(post(ch1, STORED_BYTES))); 4525 subs(tmp5, tmp5, 1); 4526 br(GT, BM_INIT_LOOP); 4527 4528 sub(cnt1tmp, cnt1, 1); 4529 mov(tmp5, str2); 4530 add(str2end, str2, result_tmp, LSL, str2_chr_shift); 4531 sub(ch2, cnt1, 1); 4532 mov(tmp3, str1); 4533 BIND(BCLOOP); 4534 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size))); 4535 if (!str1_isL) { 4536 subs(zr, ch1, ASIZE); 4537 br(HS, BCSKIP); 4538 } 4539 strb(ch2, Address(sp, ch1)); 4540 BIND(BCSKIP); 4541 subs(ch2, ch2, 1); 4542 br(GT, BCLOOP); 4543 4544 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1 4545 if (str1_isL == str2_isL) { 4546 // load last 8 bytes (8LL/4UU symbols) 4547 ldr(tmp6, Address(tmp6, -wordSize)); 4548 } else { 4549 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols) 4550 // convert Latin1 to UTF. We'll have to wait until load completed, but 4551 // it's still faster than per-character loads+checks 4552 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1] 4553 ubfx(ch1, tmp6, 8, 8); // str1[N-2] 4554 ubfx(ch2, tmp6, 16, 8); // str1[N-3] 4555 andr(tmp6, tmp6, 0xFF); // str1[N-4] 4556 orr(ch2, ch1, ch2, LSL, 16); 4557 orr(tmp6, tmp6, tmp3, LSL, 48); 4558 orr(tmp6, tmp6, ch2, LSL, 16); 4559 } 4560 BIND(BMLOOPSTR2); 4561 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 4562 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8 4563 if (str1_isL == str2_isL) { 4564 // re-init tmp3. It's for free because it's executed in parallel with 4565 // load above. 
Alternative is to initialize it before loop, but it'll 4566 // affect performance on in-order systems with 2 or more ld/st pipelines 4567 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size)); 4568 } 4569 if (!isL) { // UU/UL case 4570 lsl(ch2, cnt1tmp, 1); // offset in bytes 4571 } 4572 cmp(tmp3, skipch); 4573 br(NE, BMSKIP); 4574 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2)); 4575 mov(ch1, tmp6); 4576 if (isL) { 4577 b(BMLOOPSTR1_AFTER_LOAD); 4578 } else { 4579 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8 4580 b(BMLOOPSTR1_CMP); 4581 } 4582 BIND(BMLOOPSTR1); 4583 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 4584 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 4585 BIND(BMLOOPSTR1_AFTER_LOAD); 4586 subs(cnt1tmp, cnt1tmp, 1); 4587 br(LT, BMLOOPSTR1_LASTCMP); 4588 BIND(BMLOOPSTR1_CMP); 4589 cmp(ch1, ch2); 4590 br(EQ, BMLOOPSTR1); 4591 BIND(BMSKIP); 4592 if (!isL) { 4593 // if we've met UTF symbol while searching Latin1 pattern, then we can 4594 // skip cnt1 symbols 4595 if (str1_isL != str2_isL) { 4596 mov(result_tmp, cnt1); 4597 } else { 4598 mov(result_tmp, 1); 4599 } 4600 subs(zr, skipch, ASIZE); 4601 br(HS, BMADV); 4602 } 4603 ldrb(result_tmp, Address(sp, skipch)); // load skip distance 4604 BIND(BMADV); 4605 sub(cnt1tmp, cnt1, 1); 4606 add(str2, str2, result_tmp, LSL, str2_chr_shift); 4607 cmp(str2, str2end); 4608 br(LE, BMLOOPSTR2); 4609 add(sp, sp, ASIZE); 4610 b(NOMATCH); 4611 BIND(BMLOOPSTR1_LASTCMP); 4612 cmp(ch1, ch2); 4613 br(NE, BMSKIP); 4614 BIND(BMMATCH); 4615 sub(result, str2, tmp5); 4616 if (!str2_isL) lsr(result, result, 1); 4617 add(sp, sp, ASIZE); 4618 b(DONE); 4619 4620 BIND(LINEARSTUB); 4621 cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm 4622 br(LT, LINEAR_MEDIUM); 4623 mov(result, zr); 4624 RuntimeAddress stub = NULL; 4625 if (isL) { 4626 stub = 
RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll()); 4627 assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated"); 4628 } else if (str1_isL) { 4629 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul()); 4630 assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated"); 4631 } else { 4632 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu()); 4633 assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated"); 4634 } 4635 trampoline_call(stub); 4636 b(DONE); 4637 } 4638 4639 BIND(LINEARSEARCH); 4640 { 4641 Label DO1, DO2, DO3; 4642 4643 Register str2tmp = tmp2; 4644 Register first = tmp3; 4645 4646 if (icnt1 == -1) 4647 { 4648 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT; 4649 4650 cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2)); 4651 br(LT, DOSHORT); 4652 BIND(LINEAR_MEDIUM); 4653 (this->*str1_load_1chr)(first, Address(str1)); 4654 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift))); 4655 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift); 4656 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4657 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4658 4659 BIND(FIRST_LOOP); 4660 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 4661 cmp(first, ch2); 4662 br(EQ, STR1_LOOP); 4663 BIND(STR2_NEXT); 4664 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4665 br(LE, FIRST_LOOP); 4666 b(NOMATCH); 4667 4668 BIND(STR1_LOOP); 4669 adds(cnt1tmp, cnt1_neg, str1_chr_size); 4670 add(cnt2tmp, cnt2_neg, str2_chr_size); 4671 br(GE, MATCH); 4672 4673 BIND(STR1_NEXT); 4674 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp)); 4675 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 4676 cmp(ch1, ch2); 4677 br(NE, STR2_NEXT); 4678 adds(cnt1tmp, cnt1tmp, str1_chr_size); 4679 add(cnt2tmp, cnt2tmp, str2_chr_size); 4680 br(LT, STR1_NEXT); 4681 b(MATCH); 4682 4683 BIND(DOSHORT); 4684 if (str1_isL == str2_isL) { 4685 cmp(cnt1, 
(u1)2); 4686 br(LT, DO1); 4687 br(GT, DO3); 4688 } 4689 } 4690 4691 if (icnt1 == 4) { 4692 Label CH1_LOOP; 4693 4694 (this->*load_4chr)(ch1, str1); 4695 sub(result_tmp, cnt2, 4); 4696 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4697 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4698 4699 BIND(CH1_LOOP); 4700 (this->*load_4chr)(ch2, Address(str2, cnt2_neg)); 4701 cmp(ch1, ch2); 4702 br(EQ, MATCH); 4703 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4704 br(LE, CH1_LOOP); 4705 b(NOMATCH); 4706 } 4707 4708 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) { 4709 Label CH1_LOOP; 4710 4711 BIND(DO2); 4712 (this->*load_2chr)(ch1, str1); 4713 if (icnt1 == 2) { 4714 sub(result_tmp, cnt2, 2); 4715 } 4716 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4717 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4718 BIND(CH1_LOOP); 4719 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 4720 cmp(ch1, ch2); 4721 br(EQ, MATCH); 4722 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4723 br(LE, CH1_LOOP); 4724 b(NOMATCH); 4725 } 4726 4727 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) { 4728 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP; 4729 4730 BIND(DO3); 4731 (this->*load_2chr)(first, str1); 4732 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size)); 4733 if (icnt1 == 3) { 4734 sub(result_tmp, cnt2, 3); 4735 } 4736 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4737 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4738 BIND(FIRST_LOOP); 4739 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 4740 cmpw(first, ch2); 4741 br(EQ, STR1_LOOP); 4742 BIND(STR2_NEXT); 4743 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4744 br(LE, FIRST_LOOP); 4745 b(NOMATCH); 4746 4747 BIND(STR1_LOOP); 4748 add(cnt2tmp, cnt2_neg, 2*str2_chr_size); 4749 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 4750 cmp(ch1, ch2); 4751 br(NE, STR2_NEXT); 4752 b(MATCH); 4753 } 4754 4755 if (icnt1 == -1 || icnt1 == 1) { 4756 Label CH1_LOOP, 
HAS_ZERO, DO1_SHORT, DO1_LOOP; 4757 4758 BIND(DO1); 4759 (this->*str1_load_1chr)(ch1, str1); 4760 cmp(cnt2, (u1)8); 4761 br(LT, DO1_SHORT); 4762 4763 sub(result_tmp, cnt2, 8/str2_chr_size); 4764 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4765 mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 4766 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4767 4768 if (str2_isL) { 4769 orr(ch1, ch1, ch1, LSL, 8); 4770 } 4771 orr(ch1, ch1, ch1, LSL, 16); 4772 orr(ch1, ch1, ch1, LSL, 32); 4773 BIND(CH1_LOOP); 4774 ldr(ch2, Address(str2, cnt2_neg)); 4775 eor(ch2, ch1, ch2); 4776 sub(tmp1, ch2, tmp3); 4777 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4778 bics(tmp1, tmp1, tmp2); 4779 br(NE, HAS_ZERO); 4780 adds(cnt2_neg, cnt2_neg, 8); 4781 br(LT, CH1_LOOP); 4782 4783 cmp(cnt2_neg, (u1)8); 4784 mov(cnt2_neg, 0); 4785 br(LT, CH1_LOOP); 4786 b(NOMATCH); 4787 4788 BIND(HAS_ZERO); 4789 rev(tmp1, tmp1); 4790 clz(tmp1, tmp1); 4791 add(cnt2_neg, cnt2_neg, tmp1, LSR, 3); 4792 b(MATCH); 4793 4794 BIND(DO1_SHORT); 4795 mov(result_tmp, cnt2); 4796 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 4797 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 4798 BIND(DO1_LOOP); 4799 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 4800 cmpw(ch1, ch2); 4801 br(EQ, MATCH); 4802 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4803 br(LT, DO1_LOOP); 4804 } 4805 } 4806 BIND(NOMATCH); 4807 mov(result, -1); 4808 b(DONE); 4809 BIND(MATCH); 4810 add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift); 4811 BIND(DONE); 4812 } 4813 4814 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 4815 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn); 4816 4817 void MacroAssembler::string_indexof_char(Register str1, Register cnt1, 4818 Register ch, Register result, 4819 Register tmp1, Register tmp2, Register tmp3) 4820 { 4821 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 4822 Register 
cnt1_neg = cnt1; 4823 Register ch1 = rscratch1; 4824 Register result_tmp = rscratch2; 4825 4826 cmp(cnt1, (u1)4); 4827 br(LT, DO1_SHORT); 4828 4829 orr(ch, ch, ch, LSL, 16); 4830 orr(ch, ch, ch, LSL, 32); 4831 4832 sub(cnt1, cnt1, 4); 4833 mov(result_tmp, cnt1); 4834 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 4835 sub(cnt1_neg, zr, cnt1, LSL, 1); 4836 4837 mov(tmp3, 0x0001000100010001); 4838 4839 BIND(CH1_LOOP); 4840 ldr(ch1, Address(str1, cnt1_neg)); 4841 eor(ch1, ch, ch1); 4842 sub(tmp1, ch1, tmp3); 4843 orr(tmp2, ch1, 0x7fff7fff7fff7fff); 4844 bics(tmp1, tmp1, tmp2); 4845 br(NE, HAS_ZERO); 4846 adds(cnt1_neg, cnt1_neg, 8); 4847 br(LT, CH1_LOOP); 4848 4849 cmp(cnt1_neg, (u1)8); 4850 mov(cnt1_neg, 0); 4851 br(LT, CH1_LOOP); 4852 b(NOMATCH); 4853 4854 BIND(HAS_ZERO); 4855 rev(tmp1, tmp1); 4856 clz(tmp1, tmp1); 4857 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 4858 b(MATCH); 4859 4860 BIND(DO1_SHORT); 4861 mov(result_tmp, cnt1); 4862 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 4863 sub(cnt1_neg, zr, cnt1, LSL, 1); 4864 BIND(DO1_LOOP); 4865 ldrh(ch1, Address(str1, cnt1_neg)); 4866 cmpw(ch, ch1); 4867 br(EQ, MATCH); 4868 adds(cnt1_neg, cnt1_neg, 2); 4869 br(LT, DO1_LOOP); 4870 BIND(NOMATCH); 4871 mov(result, -1); 4872 b(DONE); 4873 BIND(MATCH); 4874 add(result, result_tmp, cnt1_neg, ASR, 1); 4875 BIND(DONE); 4876 } 4877 4878 // Compare strings. 
4879 void MacroAssembler::string_compare(Register str1, Register str2, 4880 Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2, 4881 FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) { 4882 Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB, 4883 DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, 4884 SHORT_LOOP_START, TAIL_CHECK; 4885 4886 const u1 STUB_THRESHOLD = 64 + 8; 4887 bool isLL = ae == StrIntrinsicNode::LL; 4888 bool isLU = ae == StrIntrinsicNode::LU; 4889 bool isUL = ae == StrIntrinsicNode::UL; 4890 4891 bool str1_isL = isLL || isLU; 4892 bool str2_isL = isLL || isUL; 4893 4894 int str1_chr_shift = str1_isL ? 0 : 1; 4895 int str2_chr_shift = str2_isL ? 0 : 1; 4896 int str1_chr_size = str1_isL ? 1 : 2; 4897 int str2_chr_size = str2_isL ? 1 : 2; 4898 int minCharsInWord = isLL ? wordSize : wordSize/2; 4899 4900 FloatRegister vtmpZ = vtmp1, vtmp = vtmp2; 4901 chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb : 4902 (chr_insn)&MacroAssembler::ldrh; 4903 chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb : 4904 (chr_insn)&MacroAssembler::ldrh; 4905 uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw : 4906 (uxt_insn)&MacroAssembler::uxthw; 4907 4908 BLOCK_COMMENT("string_compare {"); 4909 4910 // Bizzarely, the counts are passed in bytes, regardless of whether they 4911 // are L or U strings, however the result is always in characters. 4912 if (!str1_isL) asrw(cnt1, cnt1, 1); 4913 if (!str2_isL) asrw(cnt2, cnt2, 1); 4914 4915 // Compute the minimum of the string lengths and save the difference. 
4916 subsw(result, cnt1, cnt2); 4917 cselw(cnt2, cnt1, cnt2, Assembler::LE); // min 4918 4919 // A very short string 4920 cmpw(cnt2, minCharsInWord); 4921 br(Assembler::LE, SHORT_STRING); 4922 4923 // Compare longwords 4924 // load first parts of strings and finish initialization while loading 4925 { 4926 if (str1_isL == str2_isL) { // LL or UU 4927 ldr(tmp1, Address(str1)); 4928 cmp(str1, str2); 4929 br(Assembler::EQ, DONE); 4930 ldr(tmp2, Address(str2)); 4931 cmp(cnt2, STUB_THRESHOLD); 4932 br(GE, STUB); 4933 subsw(cnt2, cnt2, minCharsInWord); 4934 br(EQ, TAIL_CHECK); 4935 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 4936 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 4937 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 4938 } else if (isLU) { 4939 ldrs(vtmp, Address(str1)); 4940 cmp(str1, str2); 4941 br(Assembler::EQ, DONE); 4942 ldr(tmp2, Address(str2)); 4943 cmp(cnt2, STUB_THRESHOLD); 4944 br(GE, STUB); 4945 subw(cnt2, cnt2, 4); 4946 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 4947 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 4948 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 4949 zip1(vtmp, T8B, vtmp, vtmpZ); 4950 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 4951 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 4952 add(cnt1, cnt1, 4); 4953 fmovd(tmp1, vtmp); 4954 } else { // UL case 4955 ldr(tmp1, Address(str1)); 4956 cmp(str1, str2); 4957 br(Assembler::EQ, DONE); 4958 ldrs(vtmp, Address(str2)); 4959 cmp(cnt2, STUB_THRESHOLD); 4960 br(GE, STUB); 4961 subw(cnt2, cnt2, 4); 4962 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 4963 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 4964 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 4965 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 4966 zip1(vtmp, T8B, vtmp, vtmpZ); 4967 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 4968 add(cnt1, cnt1, 8); 4969 fmovd(tmp2, vtmp); 4970 } 4971 adds(cnt2, cnt2, isUL ? 
4 : 8); 4972 br(GE, TAIL); 4973 eor(rscratch2, tmp1, tmp2); 4974 cbnz(rscratch2, DIFFERENCE); 4975 // main loop 4976 bind(NEXT_WORD); 4977 if (str1_isL == str2_isL) { 4978 ldr(tmp1, Address(str1, cnt2)); 4979 ldr(tmp2, Address(str2, cnt2)); 4980 adds(cnt2, cnt2, 8); 4981 } else if (isLU) { 4982 ldrs(vtmp, Address(str1, cnt1)); 4983 ldr(tmp2, Address(str2, cnt2)); 4984 add(cnt1, cnt1, 4); 4985 zip1(vtmp, T8B, vtmp, vtmpZ); 4986 fmovd(tmp1, vtmp); 4987 adds(cnt2, cnt2, 8); 4988 } else { // UL 4989 ldrs(vtmp, Address(str2, cnt2)); 4990 ldr(tmp1, Address(str1, cnt1)); 4991 zip1(vtmp, T8B, vtmp, vtmpZ); 4992 add(cnt1, cnt1, 8); 4993 fmovd(tmp2, vtmp); 4994 adds(cnt2, cnt2, 4); 4995 } 4996 br(GE, TAIL); 4997 4998 eor(rscratch2, tmp1, tmp2); 4999 cbz(rscratch2, NEXT_WORD); 5000 b(DIFFERENCE); 5001 bind(TAIL); 5002 eor(rscratch2, tmp1, tmp2); 5003 cbnz(rscratch2, DIFFERENCE); 5004 // Last longword. In the case where length == 4 we compare the 5005 // same longword twice, but that's still faster than another 5006 // conditional branch. 5007 if (str1_isL == str2_isL) { 5008 ldr(tmp1, Address(str1)); 5009 ldr(tmp2, Address(str2)); 5010 } else if (isLU) { 5011 ldrs(vtmp, Address(str1)); 5012 ldr(tmp2, Address(str2)); 5013 zip1(vtmp, T8B, vtmp, vtmpZ); 5014 fmovd(tmp1, vtmp); 5015 } else { // UL 5016 ldrs(vtmp, Address(str2)); 5017 ldr(tmp1, Address(str1)); 5018 zip1(vtmp, T8B, vtmp, vtmpZ); 5019 fmovd(tmp2, vtmp); 5020 } 5021 bind(TAIL_CHECK); 5022 eor(rscratch2, tmp1, tmp2); 5023 cbz(rscratch2, DONE); 5024 5025 // Find the first different characters in the longwords and 5026 // compute their difference. 5027 bind(DIFFERENCE); 5028 rev(rscratch2, rscratch2); 5029 clz(rscratch2, rscratch2); 5030 andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 5031 lsrv(tmp1, tmp1, rscratch2); 5032 (this->*ext_chr)(tmp1, tmp1); 5033 lsrv(tmp2, tmp2, rscratch2); 5034 (this->*ext_chr)(tmp2, tmp2); 5035 subw(result, tmp1, tmp2); 5036 b(DONE); 5037 } 5038 5039 bind(STUB); 5040 RuntimeAddress stub = NULL; 5041 switch(ae) { 5042 case StrIntrinsicNode::LL: 5043 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL()); 5044 break; 5045 case StrIntrinsicNode::UU: 5046 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU()); 5047 break; 5048 case StrIntrinsicNode::LU: 5049 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU()); 5050 break; 5051 case StrIntrinsicNode::UL: 5052 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL()); 5053 break; 5054 default: 5055 ShouldNotReachHere(); 5056 } 5057 assert(stub.target() != NULL, "compare_long_string stub has not been generated"); 5058 trampoline_call(stub); 5059 b(DONE); 5060 5061 bind(SHORT_STRING); 5062 // Is the minimum length zero? 5063 cbz(cnt2, DONE); 5064 // arrange code to do most branches while loading and loading next characters 5065 // while comparing previous 5066 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 5067 subs(cnt2, cnt2, 1); 5068 br(EQ, SHORT_LAST_INIT); 5069 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 5070 b(SHORT_LOOP_START); 5071 bind(SHORT_LOOP); 5072 subs(cnt2, cnt2, 1); 5073 br(EQ, SHORT_LAST); 5074 bind(SHORT_LOOP_START); 5075 (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size))); 5076 (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size))); 5077 cmp(tmp1, cnt1); 5078 br(NE, SHORT_LOOP_TAIL); 5079 subs(cnt2, cnt2, 1); 5080 br(EQ, SHORT_LAST2); 5081 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 5082 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 5083 cmp(tmp2, rscratch1); 5084 br(EQ, SHORT_LOOP); 5085 sub(result, tmp2, rscratch1); 5086 b(DONE); 5087 bind(SHORT_LOOP_TAIL); 5088 
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}
#endif // COMPILER2

// This method checks if provided byte array contains byte with highest bit set.
//
// Sets result to 1 (true) if any byte in ary1[0, len) has bit 7 set,
// otherwise to 0 (false).  len is a signed byte count; len <= 0 yields false.
// Clobbers: ary1 (advanced by the loop), len, rscratch1, rscratch2.
void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
  // Simple and most common case of aligned small array which is not at the
  // end of memory page is placed here. All other cases are in stub.
  Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
  const uint64_t UPPER_BIT_MASK=0x8080808080808080;
  assert_different_registers(ary1, len, result);

  cmpw(len, 0);
  br(LE, SET_RESULT);   // len <= 0: flags are not-NE, so result becomes 0
  cmpw(len, 4 * wordSize);
  br(GE, STUB_LONG);    // size >= 32 then go to stub

  // The word loads below may touch up to 32 bytes starting at ary1, so
  // take the stub path if that range could cross a page boundary.
  // Shifting the address left leaves only the in-page offset bits; the
  // flag-setting add then carries out (CS) iff offset + 32 wraps the page.
  int shift = 64 - exact_log2(os::vm_page_size());
  lsl(rscratch1, ary1, shift);
  mov(rscratch2, (size_t)(4 * wordSize) << shift);
  adds(rscratch2, rscratch1, rscratch2);  // At end of page?
  br(CS, STUB);         // at the end of page then go to stub
  subs(len, len, wordSize);
  br(LT, END);

  BIND(LOOP);
    ldr(rscratch1, Address(post(ary1, wordSize)));
    tst(rscratch1, UPPER_BIT_MASK);
    br(NE, SET_RESULT);   // found a byte with the top bit set
    subs(len, len, wordSize);
    br(GE, LOOP);
    cmpw(len, -wordSize);
    br(EQ, SET_RESULT);   // len was a multiple of wordSize: done (flags not-NE)

  BIND(END);
    // Partial tail word: load a full 8 bytes (page crossing was excluded
    // above) and shift the out-of-range high bytes out before the test.
    ldr(result, Address(ary1));
    sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
    lslv(result, result, len);
    tst(result, UPPER_BIT_MASK);
    b(SET_RESULT);

  BIND(STUB);
    RuntimeAddress has_neg = RuntimeAddress(StubRoutines::aarch64::has_negatives());
    assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
    trampoline_call(has_neg);
    b(DONE);

  BIND(STUB_LONG);
    RuntimeAddress has_neg_long = RuntimeAddress(
      StubRoutines::aarch64::has_negatives_long());
    assert(has_neg_long.target() != NULL, "has_negatives stub has not been generated");
    trampoline_call(has_neg_long);
    b(DONE);

  BIND(SET_RESULT);
    cset(result, NE); // set true or false

  BIND(DONE);
}

// Compares two array objects a1 and a2 (oops, possibly null) for
// equality of length and contents; result is set to 1 (true) when
// equal, 0 (false) otherwise.  elem_size is 1 (byte) or 2 (char) and
// selects the payload layout via base_offset.
// Clobbers: a1, a2, cnt1, tmp3, tmp4, tmp5, rscratch1, rscratch2.
void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
                                   Register tmp4, Register tmp5, Register result,
                                   Register cnt1, int elem_size) {
  Label DONE, SAME;
  Register tmp1 = rscratch1;
  Register tmp2 = rscratch2;
  Register cnt2 = tmp2;  // cnt2 only used in array length compare
  int elem_per_word = wordSize/elem_size;
  int log_elem_size = exact_log2(elem_size);
  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset
    = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
  int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);

  assert(elem_size == 1 || elem_size == 2, "must be char or byte");
  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);

#ifndef PRODUCT
  {
    const char kind = (elem_size == 2) ? 'U' : 'L';
    char comment[64];
    snprintf(comment, sizeof comment, "array_equals%c{", kind);
    BLOCK_COMMENT(comment);
  }
#endif

  // if (a1 == a2)
  //     return true;
  cmpoop(a1, a2); // May have read barriers for a1 and a2.
  br(EQ, SAME);

  if (UseSimpleArrayEquals) {
    Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
    // if (a1 == null || a2 == null)
    //     return false;
    // a1 & a2 == 0 means (some-pointer is null) or
    // (very-rare-or-even-probably-impossible-pointer-values)
    // so, we can save one branch in most cases
    tst(a1, a2);
    mov(result, false);
    br(EQ, A_MIGHT_BE_NULL);
    // if (a1.length != a2.length)
    //      return false;
    bind(A_IS_NOT_NULL);
    ldrw(cnt1, Address(a1, length_offset));
    ldrw(cnt2, Address(a2, length_offset));
    eorw(tmp5, cnt1, cnt2);
    cbnzw(tmp5, DONE);
    lea(a1, Address(a1, base_offset));
    lea(a2, Address(a2, base_offset));
    // Check for short strings, i.e. smaller than wordSize.
    subs(cnt1, cnt1, elem_per_word);
    br(Assembler::LT, SHORT);
    // Main 8 byte comparison loop.
    bind(NEXT_WORD); {
      ldr(tmp1, Address(post(a1, wordSize)));
      ldr(tmp2, Address(post(a2, wordSize)));
      subs(cnt1, cnt1, elem_per_word);
      eor(tmp5, tmp1, tmp2);
      cbnz(tmp5, DONE);
    } br(GT, NEXT_WORD);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
    // length == 4.
    // (cnt1 <= 0 here, so a1/a2 + scaled cnt1 addresses the overlapping tail.)
    if (log_elem_size > 0)
      lsl(cnt1, cnt1, log_elem_size);
    ldr(tmp3, Address(a1, cnt1));
    ldr(tmp4, Address(a2, cnt1));
    eor(tmp5, tmp3, tmp4);
    cbnz(tmp5, DONE);
    b(SAME);
    bind(A_MIGHT_BE_NULL);
    // in case both a1 and a2 are not-null, proceed with loads
    cbz(a1, DONE);
    cbz(a2, DONE);
    b(A_IS_NOT_NULL);
    bind(SHORT);

    // Tail of 0..elem_per_word-1 elements: test the individual bits of
    // the remaining element count.
    tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
    {
      ldrw(tmp1, Address(post(a1, 4)));
      ldrw(tmp2, Address(post(a2, 4)));
      eorw(tmp5, tmp1, tmp2);
      cbnzw(tmp5, DONE);
    }
    bind(TAIL03);
    tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
    {
      ldrh(tmp3, Address(post(a1, 2)));
      ldrh(tmp4, Address(post(a2, 2)));
      eorw(tmp5, tmp3, tmp4);
      cbnzw(tmp5, DONE);
    }
    bind(TAIL01);
    if (elem_size == 1) { // Only needed when comparing byte arrays.
      tbz(cnt1, 0, SAME); // 0-1 bytes left.
      {
        ldrb(tmp1, a1);
        ldrb(tmp2, a2);
        eorw(tmp5, tmp1, tmp2);
        cbnzw(tmp5, DONE);
      }
    }
  } else {
    Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
        CSET_EQ, LAST_CHECK;
    // NOTE(review): EARLY_OUT is declared and bound below but no branch
    // to it is visible in this function — appears dead here; verify.
    mov(result, false);
    cbz(a1, DONE);
    ldrw(cnt1, Address(a1, length_offset));
    cbz(a2, DONE);
    ldrw(cnt2, Address(a2, length_offset));
    // on most CPUs a2 is still "locked"(surprisingly) in ldrw and it's
    // faster to perform another branch before comparing a1 and a2
    cmp(cnt1, (u1)elem_per_word);
    br(LE, SHORT); // short or same
    ldr(tmp3, Address(pre(a1, base_offset)));
    subs(zr, cnt1, stubBytesThreshold);
    br(GE, STUB);
    ldr(tmp4, Address(pre(a2, base_offset)));
    // tmp5 = -(payload bit count); lslv below only uses the low 6 bits,
    // so this shifts out the bytes beyond the compared tail.
    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
    cmp(cnt2, cnt1);
    br(NE, DONE);

    // Main 16 byte comparison loop with 2 exits
    bind(NEXT_DWORD); {
      ldr(tmp1, Address(pre(a1, wordSize)));
      ldr(tmp2, Address(pre(a2, wordSize)));
      subs(cnt1, cnt1, 2 * elem_per_word);
      br(LE, TAIL);
      eor(tmp4, tmp3, tmp4);
      cbnz(tmp4, DONE);
      ldr(tmp3, Address(pre(a1, wordSize)));
      ldr(tmp4, Address(pre(a2, wordSize)));
      cmp(cnt1, (u1)elem_per_word);
      br(LE, TAIL2);
      cmp(tmp1, tmp2);
    } br(EQ, NEXT_DWORD);
    b(DONE);

    bind(TAIL);
    // Two pending words: mask the stale high bytes of the last (possibly
    // overlapping) word via the negative shift in tmp5, then merge.
    eor(tmp4, tmp3, tmp4);
    eor(tmp2, tmp1, tmp2);
    lslv(tmp2, tmp2, tmp5);
    orr(tmp5, tmp4, tmp2);
    cmp(tmp5, zr);
    b(CSET_EQ);

    bind(TAIL2);
    eor(tmp2, tmp1, tmp2);
    cbnz(tmp2, DONE);
    b(LAST_CHECK);

    bind(STUB);
    // Long arrays: compare the first word inline, then delegate.
    ldr(tmp4, Address(pre(a2, base_offset)));
    cmp(cnt2, cnt1);
    br(NE, DONE);
    if (elem_size == 2) { // convert to byte counter
      lsl(cnt1, cnt1, 1);
    }
    eor(tmp5, tmp3, tmp4);
    cbnz(tmp5, DONE);
    RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
    assert(stub.target() != NULL, "array_equals_long stub has not been generated");
    trampoline_call(stub);
    b(DONE);

    bind(EARLY_OUT);
    // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
    // so, if a2 == null => return false(0), else return true, so we can return a2
    mov(result, a2);
    b(DONE);
    bind(SHORT);
    cmp(cnt2, cnt1);
    br(NE, DONE);
    cbz(cnt1, SAME);
    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
    ldr(tmp3, Address(a1, base_offset));
    ldr(tmp4, Address(a2, base_offset));
    bind(LAST_CHECK);
    eor(tmp4, tmp3, tmp4);
    lslv(tmp5, tmp4, tmp5);
    cmp(tmp5, zr);
    bind(CSET_EQ);
    cset(result, EQ);
    b(DONE);
  }

  bind(SAME);
  mov(result, true);
  // That's it.
  bind(DONE);

  BLOCK_COMMENT("} array_equals");
}

// Compare Strings

// For Strings we're passed the address of the first characters in a1
// and a2 and the length in cnt1.
// elem_size is the element size in bytes: either 1 or 2.
// There are two implementations.
// For arrays >= 8 bytes, all
// comparisons (including the final one, which may overlap) are
// performed 8 bytes at a time. For strings < 8 bytes, we compare a
// halfword, then a short, and then a byte.

// a1, a2 : addresses of the first characters; cnt1 : length in bytes
// (its low bits are tested directly for the short tail).  result is
// set to 1 (true) if the ranges are byte-identical, 0 (false) otherwise.
// elem_size (1 or 2) only decides whether the final single-byte check
// can occur.  Clobbers: a1, a2, cnt1, rscratch1, rscratch2.
void MacroAssembler::string_equals(Register a1, Register a2,
                                   Register result, Register cnt1, int elem_size)
{
  Label SAME, DONE, SHORT, NEXT_WORD;
  Register tmp1 = rscratch1;
  Register tmp2 = rscratch2;
  Register cnt2 = tmp2;  // cnt2 only used in array length compare

  assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);

#ifndef PRODUCT
  {
    const char kind = (elem_size == 2) ? 'U' : 'L';
    char comment[64];
    snprintf(comment, sizeof comment, "{string_equals%c", kind);
    BLOCK_COMMENT(comment);
  }
#endif

  mov(result, false);

  // Check for short strings, i.e. smaller than wordSize.
  subs(cnt1, cnt1, wordSize);
  br(Assembler::LT, SHORT);
  // Main 8 byte comparison loop.
  bind(NEXT_WORD); {
    ldr(tmp1, Address(post(a1, wordSize)));
    ldr(tmp2, Address(post(a2, wordSize)));
    subs(cnt1, cnt1, wordSize);
    eor(tmp1, tmp1, tmp2);
    cbnz(tmp1, DONE);
  } br(GT, NEXT_WORD);
  // Last longword.  In the case where length == 4 we compare the
  // same longword twice, but that's still faster than another
  // conditional branch.
  // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
  // length == 4.
  // (cnt1 <= 0 here, so a1/a2 + cnt1 addresses the overlapping tail.)
  ldr(tmp1, Address(a1, cnt1));
  ldr(tmp2, Address(a2, cnt1));
  eor(tmp2, tmp1, tmp2);
  cbnz(tmp2, DONE);
  b(SAME);

  bind(SHORT);
  Label TAIL03, TAIL01;

  // Short path: test the individual bits of the remaining byte count.
  tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
  {
    ldrw(tmp1, Address(post(a1, 4)));
    ldrw(tmp2, Address(post(a2, 4)));
    eorw(tmp1, tmp1, tmp2);
    cbnzw(tmp1, DONE);
  }
  bind(TAIL03);
  tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
  {
    ldrh(tmp1, Address(post(a1, 2)));
    ldrh(tmp2, Address(post(a2, 2)));
    eorw(tmp1, tmp1, tmp2);
    cbnzw(tmp1, DONE);
  }
  bind(TAIL01);
  if (elem_size == 1) { // Only needed when comparing 1-byte elements
    tbz(cnt1, 0, SAME); // 0-1 bytes left.
    {
      ldrb(tmp1, a1);
      ldrb(tmp2, a2);
      eorw(tmp1, tmp1, tmp2);
      cbnzw(tmp1, DONE);
    }
  }
  // Arrays are equal.
  bind(SAME);
  mov(result, true);

  // That's it.
  bind(DONE);
  BLOCK_COMMENT("} string_equals");
}


// The size of the blocks erased by the zero_blocks stub.  We must
// handle anything smaller than this ourselves in zero_words().
const int MacroAssembler::zero_words_block_size = 8;

// zero_words() is used by C2 ClearArray patterns.  It is as small as
// possible, handling small word counts locally and delegating
// anything larger to the zero_blocks stub.  It is expanded many times
// in compiled code, so it is important to keep it short.

// ptr:   Address of a buffer to be zeroed.
// cnt:   Count in HeapWords.
//
// ptr, cnt, rscratch1, and rscratch2 are clobbered.
void MacroAssembler::zero_words(Register ptr, Register cnt)
{
  assert(is_power_of_2(zero_words_block_size), "adjust this");
  assert(ptr == r10 && cnt == r11, "mismatch in register usage");

  BLOCK_COMMENT("zero_words {");
  cmp(cnt, (u1)zero_words_block_size);
  Label around;
  br(LO, around);
  {
    // cnt >= zero_words_block_size: delegate the bulk to the zero_blocks stub.
    RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks());
    assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
    if (StubRoutines::aarch64::complete()) {
      trampoline_call(zero_blocks);
    } else {
      // NOTE(review): when the stub set is not yet complete (presumably
      // while stubs are still being generated), a direct bl is emitted
      // instead of a trampoline call — confirm the target is in bl range
      // in that phase.
      bl(zero_blocks);
    }
  }
  bind(around);
  // Zero the remaining 0 .. zero_words_block_size-1 words by binary
  // decomposition of cnt: for each set bit, emit the matching number of
  // stp-pair stores, largest group first.
  for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
    Label l;
    tbz(cnt, exact_log2(i), l);
    for (int j = 0; j < i; j += 2) {
      stp(zr, zr, post(ptr, 16));
    }
    bind(l);
  }
  {
    // Odd trailing word, if any.
    Label l;
    tbz(cnt, 0, l);
    str(zr, Address(ptr));
    bind(l);
  }
  BLOCK_COMMENT("} zero_words");
}

// base:   Address of a buffer to be zeroed, 8 bytes aligned.
// cnt:    Immediate count in HeapWords.
#define SmallArraySize (18 * BytesPerLong)

// Zeroes a buffer of cnt HeapWords, cnt being a compile-time constant.
// Small buffers get straight-line stp pairs; larger ones a 4x-unrolled
// stp loop (clobbers rscratch1, rscratch2).
void MacroAssembler::zero_words(Register base, u_int64_t cnt)
{
  BLOCK_COMMENT("zero_words {");
  int i = cnt & 1;  // store any odd word to start
  if (i) str(zr, Address(base));

  if (cnt <= SmallArraySize / BytesPerLong) {
    // Small array: fully unrolled, no loop.
    for (; i < (int)cnt; i += 2)
      stp(zr, zr, Address(base, i * wordSize));
  } else {
    const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
    int remainder = cnt % (2 * unroll);
    // Peel the remainder so the loop handles an exact multiple of
    // 2 * unroll words.
    for (; i < remainder; i += 2)
      stp(zr, zr, Address(base, i * wordSize));

    Label loop;
    Register cnt_reg = rscratch1;
    Register loop_base = rscratch2;
    cnt = cnt - remainder;
    mov(cnt_reg, cnt);
    // adjust base and prebias by -2 * wordSize so we can pre-increment
    add(loop_base, base, (remainder - 2) * wordSize);
    bind(loop);
    sub(cnt_reg, cnt_reg, 2 * unroll);
    for (i = 1; i < unroll; i++)
      stp(zr, zr, Address(loop_base, 2 * i * wordSize));
    stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
    cbnz(cnt_reg, loop);
  }
  BLOCK_COMMENT("} zero_words");
}

// Zero blocks of memory by using DC ZVA.
//
// Aligns the base address first sufficiently for DC ZVA, then uses
// DC ZVA repeatedly for every full block.  cnt is the size to be
// zeroed in HeapWords.  Returns the count of words left to be zeroed
// in cnt.
//
// NOTE: This is intended to be used in the zero_blocks() stub.  If
// you want to use it elsewhere, note that cnt must be >= 2*zva_length.
void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
  Register tmp = rscratch1;
  Register tmp2 = rscratch2;
  int zva_length = VM_Version::zva_length();
  Label initial_table_end, loop_zva;
  Label fini;

  // Base must be 16 byte aligned. If not just return and let caller handle it
  tst(base, 0x0f);
  br(Assembler::NE, fini);
  // Align base with ZVA length.
  neg(tmp, base);
  andr(tmp, tmp, zva_length - 1);

  // tmp: the number of bytes to be filled to align the base with ZVA length.
  add(base, base, tmp);
  sub(cnt, cnt, tmp, Assembler::ASR, 3);
  // Computed branch into the stp table below: each stp zeroes 16 bytes
  // and occupies 4 bytes of code, so back up from initial_table_end by
  // (tmp / 16) * 4 == tmp >> 2 bytes.
  adr(tmp2, initial_table_end);
  sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
  br(tmp2);

  for (int i = -zva_length + 16; i < 0; i += 16)
    stp(zr, zr, Address(base, i));
  bind(initial_table_end);

  sub(cnt, cnt, zva_length >> 3);
  bind(loop_zva);
  dc(Assembler::ZVA, base);
  subs(cnt, cnt, zva_length >> 3);
  add(base, base, zva_length);
  br(Assembler::GE, loop_zva);
  add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
  bind(fini);
}

// base:   Address of a buffer to be filled, 8 bytes aligned.
// cnt:    Count in 8-byte unit.
// value:  Value to be filled with.
// base will point to the end of the buffer after filling.
void MacroAssembler::fill_words(Register base, Register cnt, Register value)
{
//  Algorithm:
//
//    scratch1 = cnt & 7;
//    cnt -= scratch1;
//    p += scratch1;
//    switch (scratch1) {
//      do {
//        cnt -= 8;
//          p[-8] = v;
//        case 7:
//          p[-7] = v;
//        case 6:
//          p[-6] = v;
//          // ...
//        case 1:
//          p[-1] = v;
//        case 0:
//          p += 8;
//      } while (cnt);
//    }

  assert_different_registers(base, cnt, value, rscratch1, rscratch2);

  Label fini, skip, entry, loop;
  const int unroll = 8; // Number of stp instructions we'll unroll

  cbz(cnt, fini);
  // If base is 8- but not 16-byte aligned, store one word first so the
  // stp pairs below are 16-byte aligned.
  tbz(base, 3, skip);
  str(value, Address(post(base, 8)));
  sub(cnt, cnt, 1);
  bind(skip);

  // Duff's-device entry (per the algorithm sketch above): rscratch1 is
  // the even word count of the partial first pass; each stp stores two
  // words in one 4-byte instruction, so jump back rscratch1 * 2 bytes
  // from 'entry'.
  andr(rscratch1, cnt, (unroll-1) * 2);
  sub(cnt, cnt, rscratch1);
  add(base, base, rscratch1, Assembler::LSL, 3);
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
  br(rscratch2);

  bind(loop);
  add(base, base, unroll * 16);
  for (int i = -unroll; i < 0; i++)
    stp(value, value, Address(base, i * 16));
  bind(entry);
  subs(cnt, cnt, unroll * 2);
  br(Assembler::GE, loop);

  // Odd trailing word, if any.
  tbz(cnt, 0, fini);
  str(value, Address(post(base, 8)));
  bind(fini);
}

// Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
// java/lang/StringUTF16.compress.
// Encode the 16-bit characters at src as single bytes at dst, stopping at
// the first character whose high byte is non-zero (i.e. does not fit in
// latin-1).  On return, result = the number of characters successfully
// encoded (== the initial len when all characters fit).  len is left
// holding the number of characters NOT processed.
// Clobbers rscratch1, rscratch2, and (on real hardware) v4/v5.
void MacroAssembler::encode_iso_array(Register src, Register dst,
                      Register len, Register result,
                      FloatRegister Vtmp1, FloatRegister Vtmp2,
                      FloatRegister Vtmp3, FloatRegister Vtmp4)
{
    Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
        NEXT_32_START, NEXT_32_PRFM_START;
    Register tmp1 = rscratch1, tmp2 = rscratch2;

      mov(result, len); // Save initial len

#ifndef BUILTIN_SIM
      cmp(len, (u1)8); // handle shortest strings first
      br(LT, LOOP_1);
      cmp(len, (u1)32);
      br(LT, NEXT_8);
      // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
      // to convert chars to bytes
      if (SoftwarePrefetchHintDistance >= 0) {
        // Prefetching variant of the 32-char loop.
        ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
        br(LE, NEXT_32_START);
        b(NEXT_32_PRFM_START);
        BIND(NEXT_32_PRFM);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_PRFM_START);
          prfm(Address(src, SoftwarePrefetchHintDistance));
          // Gather the high bytes of all 32 chars (uzp2 of the pairwise
          // ORs); if any is non-zero, fall back to the 8-char loop which
          // will locate the offending character.
          orr(v4, T16B, Vtmp1, Vtmp2);
          orr(v5, T16B, Vtmp3, Vtmp4);
          uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
          uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
          uzp2(v5, T16B, v4, v5); // high bytes
          umov(tmp2, v5, D, 1);
          fmovd(tmp1, v5);
          orr(tmp1, tmp1, tmp2);
          cbnz(tmp1, LOOP_8);
          stpq(Vtmp1, Vtmp3, dst); // store the 32 low bytes
          sub(len, len, 32);
          add(dst, dst, 32);
          add(src, src, 64);
          subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
          br(GE, NEXT_32_PRFM);
          cmp(len, (u1)32);
          br(LT, LOOP_8);
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_START);
      } else {
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
      }
      // Non-prefetch 32-char iteration (also the tail of the prefetching
      // variant): same high-byte check as above, different register roles.
      prfm(Address(src, SoftwarePrefetchHintDistance));
      uzp1(v4, T16B, Vtmp1, Vtmp2);
      uzp1(v5, T16B, Vtmp3, Vtmp4);
      orr(Vtmp1, T16B, Vtmp1, Vtmp2);
      orr(Vtmp3, T16B, Vtmp3, Vtmp4);
      uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
      umov(tmp2, Vtmp1, D, 1);
      fmovd(tmp1, Vtmp1);
      orr(tmp1, tmp1, tmp2);
      cbnz(tmp1, LOOP_8);
      stpq(v4, v5, dst);
      sub(len, len, 32);
      add(dst, dst, 32);
      add(src, src, 64);
      cmp(len, (u1)32);
      br(GE, NEXT_32);
      cbz(len, DONE);

    // Process 8 chars at a time with a single SIMD load.
    BIND(LOOP_8);
      cmp(len, (u1)8);
      br(LT, LOOP_1);
    BIND(NEXT_8);
      ld1(Vtmp1, T8H, src);
      uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
      uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
      fmovd(tmp1, Vtmp3);
      cbnz(tmp1, NEXT_1); // a high byte is set: find it one char at a time
      strd(Vtmp2, dst);

      sub(len, len, 8);
      add(dst, dst, 8);
      add(src, src, 16);
      cmp(len, (u1)8);
      br(GE, NEXT_8);

    BIND(LOOP_1);
#endif
    // Scalar loop: one char at a time; stop at the first char > 0xff.
    cbz(len, DONE);
    BIND(NEXT_1);
      ldrh(tmp1, Address(post(src, 2)));
      tst(tmp1, 0xff00);
      br(NE, SET_RESULT);
      strb(tmp1, Address(post(dst, 1)));
      subs(len, len, 1);
      br(GT, NEXT_1);

    BIND(SET_RESULT);
      sub(result, result, len); // Return index where we stopped
                                // Return len == 0 if we processed all
                                // characters
    BIND(DONE);
}


// Inflate byte[] array to char[]: expand the len bytes at src into len
// 16-bit chars at dst by zero-extension (zip1 against vtmp1, which is
// zeroed below).  tmp4 caches len >> 3, the number of 8-byte blocks.
// For very large arrays (with software prefetch enabled) the work is
// delegated to the large_byte_array_inflate stub.
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
                                        FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                        Register tmp4) {
  Label big, done, after_init, to_stub;

  assert_different_registers(src, dst, len, tmp4, rscratch1);

  fmovd(vtmp1, zr);       // vtmp1 = all-zero byte vector used by every zip1
  lsrw(tmp4, len, 3);     // tmp4 = number of whole 8-byte blocks
  bind(after_init);
  cbnzw(tmp4, big);
  // Short string: less than 8 bytes.
  {
    Label loop, tiny;

    cmpw(len, 4);
    br(LT, tiny);
    // Use SIMD to do 4 bytes.
    ldrs(vtmp2, post(src, 4));
    zip1(vtmp3, T8B, vtmp2, vtmp1);
    subw(len, len, 4);
    strd(vtmp3, post(dst, 8));

    cbzw(len, done);

    // Do the remaining bytes by steam.
    bind(loop);
    ldrb(tmp4, post(src, 1));
    strh(tmp4, post(dst, 2)); // zero-extend one byte into one char
    subw(len, len, 1);

    bind(tiny);
    cbnz(len, loop);

    b(done);
  }

  if (SoftwarePrefetchHintDistance >= 0) {
    bind(to_stub);
    RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
    assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
    trampoline_call(stub);
    // The stub leaves a sub-8-byte remainder; re-dispatch on it.
    b(after_init);
  }

  // Unpack the bytes 8 at a time.
  bind(big);
  {
    Label loop, around, loop_last, loop_start;

    if (SoftwarePrefetchHintDistance >= 0) {
      const int large_loop_threshold = (64 + 16)/8;
      ldrd(vtmp2, post(src, 8));
      andw(len, len, 7);  // len now holds only the tail byte count
      cmp(tmp4, (u1)large_loop_threshold);
      br(GE, to_stub);
      b(loop_start);

      // Software-pipelined loop: two 8-byte blocks in flight per iteration.
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      bind(loop_start);
      subs(tmp4, tmp4, 1);
      br(EQ, loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      ldrd(vtmp3, post(src, 8));
      st1(vtmp2, T8H, post(dst, 16));
      subs(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp3, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      br(NE, loop);
      b(around);
      bind(loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      st1(vtmp2, T8H, post(dst, 16));
      bind(around);
      cbz(len, done);
    } else {
      andw(len, len, 7);
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      sub(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp2, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      cbnz(tmp4, loop);
    }
  }

  // Do the tail of up to 8 bytes: re-read the LAST 8 source bytes and
  // inflate them in one go.  The 16-byte store may overlap chars already
  // written by the loop above, but it rewrites them with the same values.
  add(src, src, len);
  ldrd(vtmp3, Address(src, -8));
  add(dst, dst, len, ext::uxtw, 1);
  zip1(vtmp3, T16B, vtmp3, vtmp1);
  strq(vtmp3, Address(dst, -16));

  bind(done);
}

// Compress char[] array to byte[].
// Compress the len 16-bit chars at src into single bytes at dst.
// result = the initial len if every char fit in one byte (success),
// or 0 if some char was >= 0x100 (failure; the caller falls back).
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
                                         FloatRegister tmp1Reg, FloatRegister tmp2Reg,
                                         FloatRegister tmp3Reg, FloatRegister tmp4Reg,
                                         Register result) {
  encode_iso_array(src, dst, len, result,
                   tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
  // encode_iso_array leaves len == the number of chars NOT processed;
  // non-zero means a char didn't fit in a byte, so report failure (0).
  cmp(len, zr);
  csel(result, result, zr, EQ);
}

// get_thread() can be called anywhere inside generated code so we
// need to save whatever non-callee save context might get clobbered
// by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
// the call setup code.
//
// aarch64_get_thread_helper() clobbers only r0, r1, and flags.
//
void MacroAssembler::get_thread(Register dst) {
  // Save r0, r1 and lr — but not dst itself, which is about to be
  // overwritten with the result anyway.
  RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
  push(saved_regs, sp);

  mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
  blrt(lr, 1, 0, 1);
  // The helper returns the current JavaThread* in r0 (c_rarg0).
  if (dst != c_rarg0) {
    mov(dst, c_rarg0);
  }

  pop(saved_regs, sp);
}