1 /* 2 * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 
23 * 24 */ 25 26 #include <sys/types.h> 27 28 #include "precompiled.hpp" 29 #include "jvm.h" 30 #include "asm/assembler.hpp" 31 #include "asm/assembler.inline.hpp" 32 #include "gc/shared/barrierSet.hpp" 33 #include "gc/shared/cardTable.hpp" 34 #include "gc/shared/barrierSetAssembler.hpp" 35 #include "gc/shared/cardTableBarrierSet.hpp" 36 #include "interpreter/interpreter.hpp" 37 #include "compiler/disassembler.hpp" 38 #include "memory/resourceArea.hpp" 39 #include "nativeInst_aarch64.hpp" 40 #include "oops/accessDecorators.hpp" 41 #include "oops/compressedOops.inline.hpp" 42 #include "oops/klass.inline.hpp" 43 #include "runtime/biasedLocking.hpp" 44 #include "runtime/icache.hpp" 45 #include "runtime/interfaceSupport.inline.hpp" 46 #include "runtime/jniHandles.inline.hpp" 47 #include "runtime/sharedRuntime.hpp" 48 #include "runtime/thread.hpp" 49 #ifdef COMPILER1 50 #include "c1/c1_LIRAssembler.hpp" 51 #endif 52 #ifdef COMPILER2 53 #include "oops/oop.hpp" 54 #include "opto/compile.hpp" 55 #include "opto/intrinsicnode.hpp" 56 #include "opto/node.hpp" 57 #endif 58 59 #ifdef PRODUCT 60 #define BLOCK_COMMENT(str) /* nothing */ 61 #define STOP(error) stop(error) 62 #else 63 #define BLOCK_COMMENT(str) block_comment(str) 64 #define STOP(error) block_comment(error); stop(error) 65 #endif 66 67 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 68 69 // Patch any kind of instruction; there may be several instructions. 70 // Return the total length (in bytes) of the instructions. 
// Rewrite the instruction(s) at 'branch' so that they refer to 'target'.
// Handles literal loads, all branch forms, adrp-based PC-relative
// sequences and movz/movk/movk wide constants.  Returns the number of
// bytes patched (one, two or three instructions).
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  long offset = (target - branch) >> 2;  // word offset for branch/literal forms
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target-branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);  // 1 => adrp, 0 => adr
    if (shift) {
      u_int64_t dest = (u_int64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;  // offset of target within its page
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
      //
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
          Instruction_aarch64::extract(insn, 4, 0) ==
          Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32 — bits 32..47 of the target go into the movk;
        // recompute the page offset from the low 32 bits of the target
        // combined with the high bits of the branch address.
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    // Split the (page) offset into the immlo (bits 30:29) and immhi
    // (bits 23:5) fields of the adr/adrp instruction.
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant: movz + movk + movk, 16 bits per instruction.
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}

// Patch an oop immediate at insn_addr to refer to o.  Narrow oops are
// encoded in two instructions (movz/movk), wide oops in three.
// Returns the number of bytes patched.
int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = CompressedOops::encode((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}

// Patch a narrow klass immediate (movz/movk pair) at insn_addr.
// Returns the number of bytes patched.
int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

// Decode the instruction(s) at insn_addr (first word already fetched
// into 'insn') and return the address they refer to.  Inverse of
// pd_patch_instruction_size for the same instruction forms.
address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing: reassemble immlo (30:29) and immhi (23:5)
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;  // adrp vs adr
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm12<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
          Instruction_aarch64::extract(insn, 4, 0) ==
          Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
            Instruction_aarch64::extract(insn, 4, 0) ==
            Instruction_aarch64::extract(insn2, 4, 0)) {
          // movk #imm16<<32: the high 16 bits of the target live in the movk
          target_page = (target_page & 0xffffffff) |
                        ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    u_int32_t *insns = (u_int32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // ldrw to zr: polling page load, carries no target address
    return 0;
  } else {
    ShouldNotReachHere();
  }
  return address(((uint64_t)insn_addr + (offset << 2)));
}

// Poll for a pending safepoint; branches to slow_path if one is pending.
void MacroAssembler::safepoint_poll(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    unsigned long offset;
    adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
    ldrw(rscratch1, Address(rscratch1, offset));
    assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
    cbnz(rscratch1, slow_path);
  }
}

// Just like safepoint_poll, but use an acquiring load for thread-
// local polling.
//
// We need an acquire here to ensure that any subsequent load of the
// global SafepointSynchronize::_state flag is ordered after this load
// of the local Thread::_polling page.  We don't want this poll to
// return false (i.e. not safepointing) and a later poll of the global
// SafepointSynchronize::_state spuriously to return true.
//
// This is to avoid a race when we're in a native->Java transition
// racing the code which wakes up from a safepoint.
//
void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    // ldar gives the acquire semantics described above; tbnz tests the
    // poll bit just as in safepoint_poll.
    lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
    ldar(rscratch1, rscratch1);
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    // Global polling needs no acquire; fall back to the plain poll.
    safepoint_poll(slow_path);
  }
}

// Clear the frame anchor in the current JavaThread: always sp and pc,
// and optionally fp.
void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp, & resp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
345 void MacroAssembler::set_last_Java_frame(Register last_java_sp, 346 Register last_java_fp, 347 Register last_java_pc, 348 Register scratch) { 349 350 if (last_java_pc->is_valid()) { 351 str(last_java_pc, Address(rthread, 352 JavaThread::frame_anchor_offset() 353 + JavaFrameAnchor::last_Java_pc_offset())); 354 } 355 356 // determine last_java_sp register 357 if (last_java_sp == sp) { 358 mov(scratch, sp); 359 last_java_sp = scratch; 360 } else if (!last_java_sp->is_valid()) { 361 last_java_sp = esp; 362 } 363 364 str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset())); 365 366 // last_java_fp is optional 367 if (last_java_fp->is_valid()) { 368 str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset())); 369 } 370 } 371 372 void MacroAssembler::set_last_Java_frame(Register last_java_sp, 373 Register last_java_fp, 374 address last_java_pc, 375 Register scratch) { 376 if (last_java_pc != NULL) { 377 adr(scratch, last_java_pc); 378 } else { 379 // FIXME: This is almost never correct. We should delete all 380 // cases of set_last_Java_frame with last_java_pc=NULL and use the 381 // correct return address instead. 
382 adr(scratch, pc()); 383 } 384 385 str(scratch, Address(rthread, 386 JavaThread::frame_anchor_offset() 387 + JavaFrameAnchor::last_Java_pc_offset())); 388 389 set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch); 390 } 391 392 void MacroAssembler::set_last_Java_frame(Register last_java_sp, 393 Register last_java_fp, 394 Label &L, 395 Register scratch) { 396 if (L.is_bound()) { 397 set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch); 398 } else { 399 InstructionMark im(this); 400 L.add_patch_at(code(), locator()); 401 set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch); 402 } 403 } 404 405 void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) { 406 assert(ReservedCodeCacheSize < 4*G, "branch out of range"); 407 assert(CodeCache::find_blob(entry.target()) != NULL, 408 "destination of far call not found in code cache"); 409 if (far_branches()) { 410 unsigned long offset; 411 // We can use ADRP here because we know that the total size of 412 // the code cache cannot exceed 2Gb. 413 adrp(tmp, entry, offset); 414 add(tmp, tmp, offset); 415 if (cbuf) cbuf->set_insts_mark(); 416 blr(tmp); 417 } else { 418 if (cbuf) cbuf->set_insts_mark(); 419 bl(entry); 420 } 421 } 422 423 void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) { 424 assert(ReservedCodeCacheSize < 4*G, "branch out of range"); 425 assert(CodeCache::find_blob(entry.target()) != NULL, 426 "destination of far call not found in code cache"); 427 if (far_branches()) { 428 unsigned long offset; 429 // We can use ADRP here because we know that the total size of 430 // the code cache cannot exceed 2Gb. 
431 adrp(tmp, entry, offset); 432 add(tmp, tmp, offset); 433 if (cbuf) cbuf->set_insts_mark(); 434 br(tmp); 435 } else { 436 if (cbuf) cbuf->set_insts_mark(); 437 b(entry); 438 } 439 } 440 441 void MacroAssembler::reserved_stack_check() { 442 // testing if reserved zone needs to be enabled 443 Label no_reserved_zone_enabling; 444 445 ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset())); 446 cmp(sp, rscratch1); 447 br(Assembler::LO, no_reserved_zone_enabling); 448 449 enter(); // LR and FP are live. 450 lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone)); 451 mov(c_rarg0, rthread); 452 blr(rscratch1); 453 leave(); 454 455 // We have already removed our own frame. 456 // throw_delayed_StackOverflowError will think that it's been 457 // called by our caller. 458 lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry())); 459 br(rscratch1); 460 should_not_reach_here(); 461 462 bind(no_reserved_zone_enabling); 463 } 464 465 int MacroAssembler::biased_locking_enter(Register lock_reg, 466 Register obj_reg, 467 Register swap_reg, 468 Register tmp_reg, 469 bool swap_reg_contains_mark, 470 Label& done, 471 Label* slow_case, 472 BiasedLockingCounters* counters) { 473 assert(UseBiasedLocking, "why call this otherwise?"); 474 assert_different_registers(lock_reg, obj_reg, swap_reg); 475 476 if (PrintBiasedLockingStatistics && counters == NULL) 477 counters = BiasedLocking::counters(); 478 479 assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg); 480 assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout"); 481 Address mark_addr (obj_reg, oopDesc::mark_offset_in_bytes()); 482 Address klass_addr (obj_reg, oopDesc::klass_offset_in_bytes()); 483 Address saved_mark_addr(lock_reg, 0); 484 485 // Biased locking 486 // See whether the lock is currently biased toward our 
thread and 487 // whether the epoch is still valid 488 // Note that the runtime guarantees sufficient alignment of JavaThread 489 // pointers to allow age to be placed into low bits 490 // First check to see whether biasing is even enabled for this object 491 Label cas_label; 492 int null_check_offset = -1; 493 if (!swap_reg_contains_mark) { 494 null_check_offset = offset(); 495 ldr(swap_reg, mark_addr); 496 } 497 andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place); 498 cmp(tmp_reg, (u1)markOopDesc::biased_lock_pattern); 499 br(Assembler::NE, cas_label); 500 // The bias pattern is present in the object's header. Need to check 501 // whether the bias owner and the epoch are both still current. 502 load_prototype_header(tmp_reg, obj_reg); 503 orr(tmp_reg, tmp_reg, rthread); 504 eor(tmp_reg, swap_reg, tmp_reg); 505 andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place)); 506 if (counters != NULL) { 507 Label around; 508 cbnz(tmp_reg, around); 509 atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2); 510 b(done); 511 bind(around); 512 } else { 513 cbz(tmp_reg, done); 514 } 515 516 Label try_revoke_bias; 517 Label try_rebias; 518 519 // At this point we know that the header has the bias pattern and 520 // that we are not the bias owner in the current epoch. We need to 521 // figure out more details about the state of the header in order to 522 // know what operations can be legally performed on the object's 523 // header. 524 525 // If the low three bits in the xor result aren't clear, that means 526 // the prototype header is no longer biased and we have to revoke 527 // the bias on this object. 528 andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place); 529 cbnz(rscratch1, try_revoke_bias); 530 531 // Biasing is still enabled for this data type. 
See whether the 532 // epoch of the current bias is still valid, meaning that the epoch 533 // bits of the mark word are equal to the epoch bits of the 534 // prototype header. (Note that the prototype header's epoch bits 535 // only change at a safepoint.) If not, attempt to rebias the object 536 // toward the current thread. Note that we must be absolutely sure 537 // that the current epoch is invalid in order to do this because 538 // otherwise the manipulations it performs on the mark word are 539 // illegal. 540 andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place); 541 cbnz(rscratch1, try_rebias); 542 543 // The epoch of the current bias is still valid but we know nothing 544 // about the owner; it might be set or it might be clear. Try to 545 // acquire the bias of the object using an atomic operation. If this 546 // fails we will go in to the runtime to revoke the object's bias. 547 // Note that we first construct the presumed unbiased header so we 548 // don't accidentally blow away another thread's valid bias. 549 { 550 Label here; 551 mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place); 552 andr(swap_reg, swap_reg, rscratch1); 553 orr(tmp_reg, swap_reg, rthread); 554 cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case); 555 // If the biasing toward our thread failed, this means that 556 // another thread succeeded in biasing it toward itself and we 557 // need to revoke that bias. The revocation will occur in the 558 // interpreter runtime in the slow case. 559 bind(here); 560 if (counters != NULL) { 561 atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()), 562 tmp_reg, rscratch1, rscratch2); 563 } 564 } 565 b(done); 566 567 bind(try_rebias); 568 // At this point we know the epoch has expired, meaning that the 569 // current "bias owner", if any, is actually invalid. 
Under these 570 // circumstances _only_, we are allowed to use the current header's 571 // value as the comparison value when doing the cas to acquire the 572 // bias in the current epoch. In other words, we allow transfer of 573 // the bias from one thread to another directly in this situation. 574 // 575 // FIXME: due to a lack of registers we currently blow away the age 576 // bits in this situation. Should attempt to preserve them. 577 { 578 Label here; 579 load_prototype_header(tmp_reg, obj_reg); 580 orr(tmp_reg, rthread, tmp_reg); 581 cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case); 582 // If the biasing toward our thread failed, then another thread 583 // succeeded in biasing it toward itself and we need to revoke that 584 // bias. The revocation will occur in the runtime in the slow case. 585 bind(here); 586 if (counters != NULL) { 587 atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()), 588 tmp_reg, rscratch1, rscratch2); 589 } 590 } 591 b(done); 592 593 bind(try_revoke_bias); 594 // The prototype mark in the klass doesn't have the bias bit set any 595 // more, indicating that objects of this data type are not supposed 596 // to be biased any more. We are going to try to reset the mark of 597 // this object to the prototype value and fall through to the 598 // CAS-based locking scheme. Note that if our CAS fails, it means 599 // that another thread raced us for the privilege of revoking the 600 // bias of this particular object, so it's okay to continue in the 601 // normal locking code. 602 // 603 // FIXME: due to a lack of registers we currently blow away the age 604 // bits in this situation. Should attempt to preserve them. 
605 { 606 Label here, nope; 607 load_prototype_header(tmp_reg, obj_reg); 608 cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope); 609 bind(here); 610 611 // Fall through to the normal CAS-based lock, because no matter what 612 // the result of the above CAS, some thread must have succeeded in 613 // removing the bias bit from the object's header. 614 if (counters != NULL) { 615 atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg, 616 rscratch1, rscratch2); 617 } 618 bind(nope); 619 } 620 621 bind(cas_label); 622 623 return null_check_offset; 624 } 625 626 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) { 627 assert(UseBiasedLocking, "why call this otherwise?"); 628 629 // Check for biased locking unlock case, which is a no-op 630 // Note: we do not have to check the thread ID for two reasons. 631 // First, the interpreter checks for IllegalMonitorStateException at 632 // a higher level. Second, if the bias was revoked while we held the 633 // lock, the object could not be rebiased toward another thread, so 634 // the bias bit would be clear. 
635 ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 636 andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 637 cmp(temp_reg, (u1)markOopDesc::biased_lock_pattern); 638 br(Assembler::EQ, done); 639 } 640 641 static void pass_arg0(MacroAssembler* masm, Register arg) { 642 if (c_rarg0 != arg ) { 643 masm->mov(c_rarg0, arg); 644 } 645 } 646 647 static void pass_arg1(MacroAssembler* masm, Register arg) { 648 if (c_rarg1 != arg ) { 649 masm->mov(c_rarg1, arg); 650 } 651 } 652 653 static void pass_arg2(MacroAssembler* masm, Register arg) { 654 if (c_rarg2 != arg ) { 655 masm->mov(c_rarg2, arg); 656 } 657 } 658 659 static void pass_arg3(MacroAssembler* masm, Register arg) { 660 if (c_rarg3 != arg ) { 661 masm->mov(c_rarg3, arg); 662 } 663 } 664 665 void MacroAssembler::call_VM_base(Register oop_result, 666 Register java_thread, 667 Register last_java_sp, 668 address entry_point, 669 int number_of_arguments, 670 bool check_exceptions) { 671 // determine java_thread register 672 if (!java_thread->is_valid()) { 673 java_thread = rthread; 674 } 675 676 // determine last_java_sp register 677 if (!last_java_sp->is_valid()) { 678 last_java_sp = esp; 679 } 680 681 // debugging support 682 assert(number_of_arguments >= 0 , "cannot have negative number of arguments"); 683 assert(java_thread == rthread, "unexpected register"); 684 #ifdef ASSERT 685 // TraceBytecodes does not use r12 but saves it over the call, so don't verify 686 // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?"); 687 #endif // ASSERT 688 689 assert(java_thread != oop_result , "cannot use the same register for java_thread & oop_result"); 690 assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp"); 691 692 // push java thread (becomes first argument of C function) 693 694 mov(c_rarg0, java_thread); 695 696 // set last Java frame before call 697 assert(last_java_sp 
!= rfp, "can't use rfp"); 698 699 Label l; 700 set_last_Java_frame(last_java_sp, rfp, l, rscratch1); 701 702 // do the call, remove parameters 703 MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l); 704 705 // reset last Java frame 706 // Only interpreter should have to clear fp 707 reset_last_Java_frame(true); 708 709 // C++ interp handles this in the interpreter 710 check_and_handle_popframe(java_thread); 711 check_and_handle_earlyret(java_thread); 712 713 if (check_exceptions) { 714 // check for pending exceptions (java_thread is set upon return) 715 ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset()))); 716 Label ok; 717 cbz(rscratch1, ok); 718 lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry())); 719 br(rscratch1); 720 bind(ok); 721 } 722 723 // get oop result if there is one and reset the value in the thread 724 if (oop_result->is_valid()) { 725 get_vm_result(oop_result, java_thread); 726 } 727 } 728 729 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) { 730 call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions); 731 } 732 733 // Maybe emit a call via a trampoline. If the code cache is small 734 // trampolines won't be emitted. 735 736 address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) { 737 assert(JavaThread::current()->is_Compiler_thread(), "just checking"); 738 assert(entry.rspec().type() == relocInfo::runtime_call_type 739 || entry.rspec().type() == relocInfo::opt_virtual_call_type 740 || entry.rspec().type() == relocInfo::static_call_type 741 || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type"); 742 743 // We need a trampoline if branches are far. 
744 if (far_branches()) { 745 bool in_scratch_emit_size = false; 746 #ifdef COMPILER2 747 // We don't want to emit a trampoline if C2 is generating dummy 748 // code during its branch shortening phase. 749 CompileTask* task = ciEnv::current()->task(); 750 in_scratch_emit_size = 751 (task != NULL && is_c2_compile(task->comp_level()) && 752 Compile::current()->in_scratch_emit_size()); 753 #endif 754 if (!in_scratch_emit_size) { 755 address stub = emit_trampoline_stub(offset(), entry.target()); 756 if (stub == NULL) { 757 return NULL; // CodeCache is full 758 } 759 } 760 } 761 762 if (cbuf) cbuf->set_insts_mark(); 763 relocate(entry.rspec()); 764 if (!far_branches()) { 765 bl(entry.target()); 766 } else { 767 bl(pc()); 768 } 769 // just need to return a non-null address 770 return pc(); 771 } 772 773 774 // Emit a trampoline stub for a call to a target which is too far away. 775 // 776 // code sequences: 777 // 778 // call-site: 779 // branch-and-link to <destination> or <trampoline stub> 780 // 781 // Related trampoline stub for this call site in the stub section: 782 // load the call target from the constant pool 783 // branch (LR still points to the call site above) 784 785 address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset, 786 address dest) { 787 // Max stub size: alignment nop, TrampolineStub. 788 address stub = start_a_stub(NativeInstruction::instruction_size 789 + NativeCallTrampolineStub::instruction_size); 790 if (stub == NULL) { 791 return NULL; // CodeBuffer::expand failed 792 } 793 794 // Create a trampoline stub relocation which relates this trampoline stub 795 // with the call instruction at insts_call_instruction_offset in the 796 // instructions code-section. 
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call
  // - call
  Label target;
  ldr(rscratch1, target);   // load the 64-bit destination stored at 'target'
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}

void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
  tst(x, 0xff);
  cset(x, Assembler::NE);
}

// Emit an inline-cache call: load the "non-oop word" cached-klass sentinel
// into rscratch2 (the IC holder register) and call through a trampoline.
address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  // Args are moved into c_rarg registers in reverse order so an earlier
  // move cannot clobber a later source register.
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


// Fetch the oop result of a VM call from the thread and clear the slot
// so it is not treated as a live oop by a later GC.
void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

// Fetch the metadata result of a VM call from the thread and clear the slot.
void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

// Pad the instruction stream with nops until the current offset is a
// multiple of modulus.
void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  // If the delayed value has already been computed, fold it in as a constant.
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}


void MacroAssembler:: notify(int type) {
  if (type == bytecode_start) {
    // set_last_Java_frame(esp, rfp, (address)NULL);
    Assembler:: notify(type);
    // reset_last_Java_frame(true);
  }
  else
    Assembler:: notify(type);
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step = itableOffsetEntry::size() * wordSize;
  int vte_size = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  // The loop is peeled once (peel == 1) so the common hit-on-first-entry
  // case falls straight through to found_method.
  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel) break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  if (return_method) {
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    // Constant index: fold the whole displacement into one address.
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result,
        form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
  }
}

// Full subtype check: fast path first, then the slow (linear-scan) path;
// falls through to L_failure's bind point when sub_klass is not a subtype.
void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  // super_check_offset defaults to -1, meaning it must be loaded from the
  // super klass at runtime.
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  // Any NULL label defaults to falling through; at most one may be NULL.
  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);
  cmp(super_klass, rscratch1); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    // Compare the runtime offset against the secondary-super-cache offset
    // (flags only; zr discards the result).
    subs(zr, super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// scans count pointer sized words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));   // load and post-increment addr
  cmp(value, scratch);
  br(EQ, Lexit);                        // exit with Z set on a match
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);                          // Z clear if count was exhausted
}

// scans count 4 byte words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
                                 Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldrw(scratch, post(addr, wordSize));
  cmpw(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  // Either label may default to fall-through; at most one may be NULL.
  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  BLOCK_COMMENT("check_klass_subtype_slow_path");

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
  assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)

  RegSet pushed_registers;
  if (!IS_A_TEMP(r2))    pushed_registers += r2;
  if (!IS_A_TEMP(r5))    pushed_registers += r5;

  if (super_klass != r0 || UseCompressedOops) {
    if (!IS_A_TEMP(r0))   pushed_registers += r0;
  }

  push(pushed_registers, sp);

  // Get super_klass value into r0 (even if it was in r5 or r2).
  if (super_klass != r0) {
    mov(r0, super_klass);
  }

#ifndef PRODUCT
  // Count slow-path subtype checks for diagnostics.
  mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
  Address pst_counter_addr(rscratch2);
  ldr(rscratch1, pst_counter_addr);
  add(rscratch1, rscratch1, 1);
  str(rscratch1, pst_counter_addr);
#endif //PRODUCT

  // We will consult the secondary-super array.
  ldr(r5, secondary_supers_addr);
  // Load the array length.
  ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  add(r5, r5, Array<Klass*>::base_offset_in_bytes());

  cmp(sp, zr); // Clear Z flag; SP is never zero
  // Scan R2 words at [R5] for an occurrence of R0.
  // Set NZ/Z based on last compare.
  repne_scan(r5, r0, r2, rscratch1);

  // Unspill the temp. registers:
  pop(pushed_registers, sp);

  br(Assembler::NE, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  str(super_klass, super_cache_addr);

  if (L_success != &L_fallthrough) {
    b(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}


// Emit a call to the verify-oop stub for the oop held in reg.
// The string s is included in any failure message.  No-op unless VerifyOops.
void MacroAssembler::verify_oop(Register reg, const char* s) {
  if (!VerifyOops || VerifyAdapterSharing) {
    // Below address of the code string confuses VerifyAdapterSharing
    // because it may differ between otherwise equivalent adapters.
    return;
  }

  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s", reg->name(), s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");

  // Save the registers the stub call clobbers.
  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  mov(r0, reg);
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop");
}

// Like verify_oop, but for an oop in memory at addr.
void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops || VerifyAdapterSharing) {
    // Below address of the code string confuses VerifyAdapterSharing
    // because it may differ between otherwise equivalent adapters.
    return;
  }

  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s", s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop_addr {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  // addr may contain sp so we will have to adjust it based on the
  // pushes that we just did.
  if (addr.uses(sp)) {
    lea(r0, addr);
    ldr(r0, Address(r0, 4 * wordSize));
  } else {
    ldr(r0, addr);
  }
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop_addr");
}

// Compute the address of an interpreter expression-stack argument slot.
Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
                                         int extra_slot_offset) {
  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
  int stackElementSize = Interpreter::stackElementSize;
  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
#ifdef ASSERT
  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
  assert(offset1 - offset == stackElementSize, "correct arithmetic");
#endif
  if (arg_slot.is_constant()) {
    return Address(esp, arg_slot.as_constant() * stackElementSize
                   + offset);
  } else {
    // Variable slot: scale the slot register into rscratch1 first.
    add(rscratch1, esp, arg_slot.as_register(),
        ext::uxtx, exact_log2(stackElementSize));
    return Address(rscratch1, offset);
  }
}

void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments,
                                       Label *retaddr) {
  call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr);
}

void MacroAssembler::call_VM_leaf_base1(address entry_point,
                                        int number_of_gp_arguments,
                                        int number_of_fp_arguments,
                                        ret_type type,
                                        Label *retaddr) {
  Label E, L;

  // Preserve rscratch1 and rmethod across the leaf call.
  stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));

  // We add 1 to number_of_arguments because the thread in arg0 is
  // not counted
  mov(rscratch1, entry_point);
  blrt(rscratch1, number_of_gp_arguments + 1, number_of_fp_arguments, type);
  if (retaddr)
    bind(*retaddr);

  ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
  maybe_isb();
}

void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  call_VM_leaf_base(entry_point, number_of_arguments);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
                                  Register arg_1, Register arg_2) {
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  pass_arg2(this, arg_2);
  call_VM_leaf_base(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point) {
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  // Args are moved in reverse order so an earlier move cannot clobber a
  // later source register.
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
  assert(arg_0 != c_rarg3, "smashed arg");
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 4);
}

// Emit a null check on reg; large offsets need an explicit load since the
// implicit-null-check page may not cover them.
void MacroAssembler::null_check(Register reg, int offset) {
  if (needs_explicit_null_check(offset)) {
    // provoke OS NULL exception if reg = NULL by
    // accessing M[reg] w/o changing any registers
    // NOTE: this is plenty to provoke a segv
    ldr(zr, Address(reg));
  } else {
    // nothing to do, (later) access of M[reg + offset]
    // will provoke OS NULL exception if reg = NULL
  }
}

// Branch to is_value if klass has the JVM_ACC_VALUE access flag set.
void MacroAssembler::test_klass_is_value(Register klass, Register temp_reg, Label& is_value) {
  ldrw(temp_reg, Address(klass, Klass::access_flags_offset()));
  andr(temp_reg, temp_reg, JVM_ACC_VALUE);
  cbnz(temp_reg, is_value);
}

// Branch to is_flattenable if the flattenable-field bit is set in flags.
void MacroAssembler::test_field_is_flattenable(Register flags, Register temp_reg, Label& is_flattenable) {
  (void) temp_reg; // keep signature uniform with x86
  tbnz(flags, ConstantPoolCacheEntry::is_flattenable_field_shift, is_flattenable);
}

// Branch to not_flattenable if the flattenable-field bit is clear in flags.
void MacroAssembler::test_field_is_not_flattenable(Register flags, Register temp_reg, Label& not_flattenable) {
  (void) temp_reg; // keep signature uniform with x86
  tbz(flags, ConstantPoolCacheEntry::is_flattenable_field_shift, not_flattenable);
}

// Branch to is_flattened if the flattened-field bit is set in flags.
void MacroAssembler::test_field_is_flattened(Register flags, Register temp_reg, Label& is_flattened) {
  (void) temp_reg; // keep signature uniform with x86
  tbnz(flags, ConstantPoolCacheEntry::is_flattened_field_shift, is_flattened);
}

// Branch to is_flattened if klass's layout helper marks it a flat value array.
void MacroAssembler::test_flat_array_klass(Register klass, Register temp_reg, Label& is_flattened) {
  ldrw(temp_reg, Address(klass, Klass::layout_helper_offset()));
  asrw(temp_reg, temp_reg, Klass::_lh_array_tag_shift);
  cmpw(temp_reg, Klass::_lh_array_tag_vt_value);
  br(Assembler::EQ, is_flattened);
}

// As test_flat_array_klass, but starting from an oop (loads its klass first).
void MacroAssembler::test_flat_array_oop(Register oop, Register temp_reg, Label& is_flattened) {
  load_klass(temp_reg, oop);
  test_flat_array_klass(temp_reg, temp_reg, is_flattened);
}

// MacroAssembler protected routines needed to implement
// public methods

// Load a relocatable address (with its relocation spec) into r.
void MacroAssembler::mov(Register r, Address dest) {
  code_section()->relocate(pc(), dest.rspec());
  u_int64_t imm64 = (u_int64_t)dest.target();
  movptr(r, imm64);
}

// Move a constant pointer into r.  In AArch64 mode the virtual
// address space is 48 bits in size, so we only need three
// instructions to create a patchable instruction sequence that can
// reach anywhere.
void MacroAssembler::movptr(Register r, uintptr_t imm64) {
#ifndef PRODUCT
  {
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
    block_comment(buffer);
  }
#endif
  assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
  // Always emit exactly MOVZ + two MOVKs so the sequence is patchable.
  movz(r, imm64 & 0xffff);
  imm64 >>= 16;
  movk(r, imm64 & 0xffff, 16);
  imm64 >>= 16;
  movk(r, imm64 & 0xffff, 32);
}

// Macro to mov replicated immediate to vector register.
//  Vd will get the following values for different arrangements in T
//   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
//   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
//   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
//   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
//   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
//   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
//   T1D/T2D: invalid
void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
  assert(T != T1D && T != T2D, "invalid arrangement");
  if (T == T8B || T == T16B) {
    assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
    movi(Vd, T, imm32 & 0xff, 0);
    return;
  }
  u_int32_t nimm32 = ~imm32;
  if (T == T4H || T == T8H) {
    assert((imm32 & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
    imm32 &= 0xffff;
    nimm32 &= 0xffff;
  }
  // Count the non-zero bytes of imm32 and of its complement to decide
  // whether the MOVI/ORR or MVNI/BIC sequence is shorter.
  u_int32_t x = imm32;
  int movi_cnt = 0;
  int movn_cnt = 0;
  while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
  x = nimm32;
  while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
  // If the complement needs fewer instructions, build it instead.
  if (movn_cnt < movi_cnt) imm32 = nimm32;
  unsigned lsl = 0;
  while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
  if (movn_cnt < movi_cnt)
    mvni(Vd, T, imm32 & 0xff, lsl);
  else
    movi(Vd, T, imm32 & 0xff, lsl);
  imm32 >>= 8; lsl += 8;
  // Merge the remaining non-zero bytes with BIC (complement form) or ORR.
  while (imm32) {
    while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
    if (movn_cnt < movi_cnt)
      bici(Vd, T, imm32 & 0xff, lsl);
    else
      orri(Vd, T, imm32 & 0xff, lsl);
    lsl += 8; imm32 >>= 8;
  }
}

// Materialize an arbitrary 64-bit immediate in dst using the shortest
// sequence of ORR-immediate, MOVZ/MOVN and MOVK instructions.
void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
{
#ifndef PRODUCT
  {
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
    block_comment(buffer);
  }
#endif
  if (operand_valid_for_logical_immediate(false, imm64)) {
    orr(dst, zr, imm64);
  } else {
    // we can use a combination of MOVZ or MOVN with
    // MOVK to build up the constant
    u_int64_t imm_h[4];
    int zero_count = 0;
    int neg_count = 0;
    int i;
    // Split the immediate into four 16-bit halfwords and count how many
    // are all-zero and how many are all-one.
    for (i = 0; i < 4; i++) {
      imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
      if (imm_h[i] == 0) {
        zero_count++;
      } else if (imm_h[i] == 0xffffL) {
        neg_count++;
      }
    }
    if (zero_count == 4) {
      // one MOVZ will do
      movz(dst, 0);
    } else if (neg_count == 4) {
      // one MOVN will do
      movn(dst, 0);
    } else if (zero_count == 3) {
      // one MOVZ will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0L) {
          movz(dst, (u_int32_t)imm_h[i], (i << 4));
          break;
        }
      }
    } else if (neg_count == 3) {
      // one MOVN will do
      for (int i = 0; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
          break;
        }
      }
    } else if (zero_count == 2) {
      // one MOVZ and one MOVK will do
      for (i = 0; i < 3; i++) {
        if (imm_h[i] != 0L) {
          movz(dst, (u_int32_t)imm_h[i], (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0L) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else if (neg_count == 2) {
      // one MOVN and one MOVK will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else if (zero_count == 1) {
      // one MOVZ and two MOVKs will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0L) {
          movz(dst, (u_int32_t)imm_h[i], (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0x0L) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else if (neg_count == 1) {
      // one MOVN and two MOVKs will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else {
      // use a MOVZ and 3 MOVKs (makes it easier to debug)
      movz(dst, (u_int32_t)imm_h[0], 0);
      for (i = 1; i < 4; i++) {
        movk(dst, (u_int32_t)imm_h[i], (i << 4));
      }
    }
  }
}

// Materialize an arbitrary 32-bit immediate in dst using at most two
// instructions (ORR-immediate, or MOVZ/MOVN optionally followed by MOVK).
void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
{
#ifndef PRODUCT
  {
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
    block_comment(buffer);
  }
#endif
  if (operand_valid_for_logical_immediate(true, imm32)) {
    orrw(dst, zr, imm32);
  } else {
    // we can use MOVZ, MOVN or two calls to MOVK to build up the
    // constant
    u_int32_t imm_h[2];
    imm_h[0] = imm32 & 0xffff;
    imm_h[1] = ((imm32 >> 16) & 0xffff);
    if (imm_h[0] == 0) {
      movzw(dst, imm_h[1], 16);
    } else if
(imm_h[0] == 0xffff) { 1740 movnw(dst, imm_h[1] ^ 0xffff, 16); 1741 } else if (imm_h[1] == 0) { 1742 movzw(dst, imm_h[0], 0); 1743 } else if (imm_h[1] == 0xffff) { 1744 movnw(dst, imm_h[0] ^ 0xffff, 0); 1745 } else { 1746 // use a MOVZ and MOVK (makes it easier to debug) 1747 movzw(dst, imm_h[0], 0); 1748 movkw(dst, imm_h[1], 16); 1749 } 1750 } 1751 } 1752 1753 // Form an address from base + offset in Rd. Rd may or may 1754 // not actually be used: you must use the Address that is returned. 1755 // It is up to you to ensure that the shift provided matches the size 1756 // of your data. 1757 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) { 1758 if (Address::offset_ok_for_immed(byte_offset, shift)) 1759 // It fits; no need for any heroics 1760 return Address(base, byte_offset); 1761 1762 // Don't do anything clever with negative or misaligned offsets 1763 unsigned mask = (1 << shift) - 1; 1764 if (byte_offset < 0 || byte_offset & mask) { 1765 mov(Rd, byte_offset); 1766 add(Rd, base, Rd); 1767 return Address(Rd); 1768 } 1769 1770 // See if we can do this with two 12-bit offsets 1771 { 1772 unsigned long word_offset = byte_offset >> shift; 1773 unsigned long masked_offset = word_offset & 0xfff000; 1774 if (Address::offset_ok_for_immed(word_offset - masked_offset) 1775 && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) { 1776 add(Rd, base, masked_offset << shift); 1777 word_offset -= masked_offset; 1778 return Address(Rd, word_offset << shift); 1779 } 1780 } 1781 1782 // Do it the hard way 1783 mov(Rd, byte_offset); 1784 add(Rd, base, Rd); 1785 return Address(Rd); 1786 } 1787 1788 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) { 1789 if (UseLSE) { 1790 mov(tmp, 1); 1791 ldadd(Assembler::word, tmp, zr, counter_addr); 1792 return; 1793 } 1794 Label retry_load; 1795 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) 1796 prfm(Address(counter_addr), 
PSTL1STRM); 1797 bind(retry_load); 1798 // flush and load exclusive from the memory location 1799 ldxrw(tmp, counter_addr); 1800 addw(tmp, tmp, 1); 1801 // if we store+flush with no intervening write tmp wil be zero 1802 stxrw(tmp2, tmp, counter_addr); 1803 cbnzw(tmp2, retry_load); 1804 } 1805 1806 1807 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb, 1808 bool want_remainder, Register scratch) 1809 { 1810 // Full implementation of Java idiv and irem. The function 1811 // returns the (pc) offset of the div instruction - may be needed 1812 // for implicit exceptions. 1813 // 1814 // constraint : ra/rb =/= scratch 1815 // normal case 1816 // 1817 // input : ra: dividend 1818 // rb: divisor 1819 // 1820 // result: either 1821 // quotient (= ra idiv rb) 1822 // remainder (= ra irem rb) 1823 1824 assert(ra != scratch && rb != scratch, "reg cannot be scratch"); 1825 1826 int idivl_offset = offset(); 1827 if (! want_remainder) { 1828 sdivw(result, ra, rb); 1829 } else { 1830 sdivw(scratch, ra, rb); 1831 Assembler::msubw(result, scratch, rb, ra); 1832 } 1833 1834 return idivl_offset; 1835 } 1836 1837 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb, 1838 bool want_remainder, Register scratch) 1839 { 1840 // Full implementation of Java ldiv and lrem. The function 1841 // returns the (pc) offset of the div instruction - may be needed 1842 // for implicit exceptions. 1843 // 1844 // constraint : ra/rb =/= scratch 1845 // normal case 1846 // 1847 // input : ra: dividend 1848 // rb: divisor 1849 // 1850 // result: either 1851 // quotient (= ra idiv rb) 1852 // remainder (= ra irem rb) 1853 1854 assert(ra != scratch && rb != scratch, "reg cannot be scratch"); 1855 1856 int idivq_offset = offset(); 1857 if (! 
want_remainder) { 1858 sdiv(result, ra, rb); 1859 } else { 1860 sdiv(scratch, ra, rb); 1861 Assembler::msub(result, scratch, rb, ra); 1862 } 1863 1864 return idivq_offset; 1865 } 1866 1867 void MacroAssembler::membar(Membar_mask_bits order_constraint) { 1868 address prev = pc() - NativeMembar::instruction_size; 1869 address last = code()->last_insn(); 1870 if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) { 1871 NativeMembar *bar = NativeMembar_at(prev); 1872 // We are merging two memory barrier instructions. On AArch64 we 1873 // can do this simply by ORing them together. 1874 bar->set_kind(bar->get_kind() | order_constraint); 1875 BLOCK_COMMENT("merged membar"); 1876 } else { 1877 code()->set_last_insn(pc()); 1878 dmb(Assembler::barrier(order_constraint)); 1879 } 1880 } 1881 1882 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) { 1883 if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) { 1884 merge_ldst(rt, adr, size_in_bytes, is_store); 1885 code()->clear_last_insn(); 1886 return true; 1887 } else { 1888 assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported."); 1889 const unsigned mask = size_in_bytes - 1; 1890 if (adr.getMode() == Address::base_plus_offset && 1891 (adr.offset() & mask) == 0) { // only supports base_plus_offset. 1892 code()->set_last_insn(pc()); 1893 } 1894 return false; 1895 } 1896 } 1897 1898 void MacroAssembler::ldr(Register Rx, const Address &adr) { 1899 // We always try to merge two adjacent loads into one ldp. 1900 if (!try_merge_ldst(Rx, adr, 8, false)) { 1901 Assembler::ldr(Rx, adr); 1902 } 1903 } 1904 1905 void MacroAssembler::ldrw(Register Rw, const Address &adr) { 1906 // We always try to merge two adjacent loads into one ldp. 
1907 if (!try_merge_ldst(Rw, adr, 4, false)) { 1908 Assembler::ldrw(Rw, adr); 1909 } 1910 } 1911 1912 void MacroAssembler::str(Register Rx, const Address &adr) { 1913 // We always try to merge two adjacent stores into one stp. 1914 if (!try_merge_ldst(Rx, adr, 8, true)) { 1915 Assembler::str(Rx, adr); 1916 } 1917 } 1918 1919 void MacroAssembler::strw(Register Rw, const Address &adr) { 1920 // We always try to merge two adjacent stores into one stp. 1921 if (!try_merge_ldst(Rw, adr, 4, true)) { 1922 Assembler::strw(Rw, adr); 1923 } 1924 } 1925 1926 // MacroAssembler routines found actually to be needed 1927 1928 void MacroAssembler::push(Register src) 1929 { 1930 str(src, Address(pre(esp, -1 * wordSize))); 1931 } 1932 1933 void MacroAssembler::pop(Register dst) 1934 { 1935 ldr(dst, Address(post(esp, 1 * wordSize))); 1936 } 1937 1938 // Note: load_unsigned_short used to be called load_unsigned_word. 1939 int MacroAssembler::load_unsigned_short(Register dst, Address src) { 1940 int off = offset(); 1941 ldrh(dst, src); 1942 return off; 1943 } 1944 1945 int MacroAssembler::load_unsigned_byte(Register dst, Address src) { 1946 int off = offset(); 1947 ldrb(dst, src); 1948 return off; 1949 } 1950 1951 int MacroAssembler::load_signed_short(Register dst, Address src) { 1952 int off = offset(); 1953 ldrsh(dst, src); 1954 return off; 1955 } 1956 1957 int MacroAssembler::load_signed_byte(Register dst, Address src) { 1958 int off = offset(); 1959 ldrsb(dst, src); 1960 return off; 1961 } 1962 1963 int MacroAssembler::load_signed_short32(Register dst, Address src) { 1964 int off = offset(); 1965 ldrshw(dst, src); 1966 return off; 1967 } 1968 1969 int MacroAssembler::load_signed_byte32(Register dst, Address src) { 1970 int off = offset(); 1971 ldrsbw(dst, src); 1972 return off; 1973 } 1974 1975 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) { 1976 switch (size_in_bytes) { 1977 case 8: ldr(dst, src); break; 1978 
case 4: ldrw(dst, src); break; 1979 case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break; 1980 case 1: is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break; 1981 default: ShouldNotReachHere(); 1982 } 1983 } 1984 1985 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) { 1986 switch (size_in_bytes) { 1987 case 8: str(src, dst); break; 1988 case 4: strw(src, dst); break; 1989 case 2: strh(src, dst); break; 1990 case 1: strb(src, dst); break; 1991 default: ShouldNotReachHere(); 1992 } 1993 } 1994 1995 void MacroAssembler::decrementw(Register reg, int value) 1996 { 1997 if (value < 0) { incrementw(reg, -value); return; } 1998 if (value == 0) { return; } 1999 if (value < (1 << 12)) { subw(reg, reg, value); return; } 2000 /* else */ { 2001 guarantee(reg != rscratch2, "invalid dst for register decrement"); 2002 movw(rscratch2, (unsigned)value); 2003 subw(reg, reg, rscratch2); 2004 } 2005 } 2006 2007 void MacroAssembler::decrement(Register reg, int value) 2008 { 2009 if (value < 0) { increment(reg, -value); return; } 2010 if (value == 0) { return; } 2011 if (value < (1 << 12)) { sub(reg, reg, value); return; } 2012 /* else */ { 2013 assert(reg != rscratch2, "invalid dst for register decrement"); 2014 mov(rscratch2, (unsigned long)value); 2015 sub(reg, reg, rscratch2); 2016 } 2017 } 2018 2019 void MacroAssembler::decrementw(Address dst, int value) 2020 { 2021 assert(!dst.uses(rscratch1), "invalid dst for address decrement"); 2022 if (dst.getMode() == Address::literal) { 2023 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 2024 lea(rscratch2, dst); 2025 dst = Address(rscratch2); 2026 } 2027 ldrw(rscratch1, dst); 2028 decrementw(rscratch1, value); 2029 strw(rscratch1, dst); 2030 } 2031 2032 void MacroAssembler::decrement(Address dst, int value) 2033 { 2034 assert(!dst.uses(rscratch1), "invalid address for decrement"); 2035 if 
(dst.getMode() == Address::literal) { 2036 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 2037 lea(rscratch2, dst); 2038 dst = Address(rscratch2); 2039 } 2040 ldr(rscratch1, dst); 2041 decrement(rscratch1, value); 2042 str(rscratch1, dst); 2043 } 2044 2045 void MacroAssembler::incrementw(Register reg, int value) 2046 { 2047 if (value < 0) { decrementw(reg, -value); return; } 2048 if (value == 0) { return; } 2049 if (value < (1 << 12)) { addw(reg, reg, value); return; } 2050 /* else */ { 2051 assert(reg != rscratch2, "invalid dst for register increment"); 2052 movw(rscratch2, (unsigned)value); 2053 addw(reg, reg, rscratch2); 2054 } 2055 } 2056 2057 void MacroAssembler::increment(Register reg, int value) 2058 { 2059 if (value < 0) { decrement(reg, -value); return; } 2060 if (value == 0) { return; } 2061 if (value < (1 << 12)) { add(reg, reg, value); return; } 2062 /* else */ { 2063 assert(reg != rscratch2, "invalid dst for register increment"); 2064 movw(rscratch2, (unsigned)value); 2065 add(reg, reg, rscratch2); 2066 } 2067 } 2068 2069 void MacroAssembler::incrementw(Address dst, int value) 2070 { 2071 assert(!dst.uses(rscratch1), "invalid dst for address increment"); 2072 if (dst.getMode() == Address::literal) { 2073 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 2074 lea(rscratch2, dst); 2075 dst = Address(rscratch2); 2076 } 2077 ldrw(rscratch1, dst); 2078 incrementw(rscratch1, value); 2079 strw(rscratch1, dst); 2080 } 2081 2082 void MacroAssembler::increment(Address dst, int value) 2083 { 2084 assert(!dst.uses(rscratch1), "invalid dst for address increment"); 2085 if (dst.getMode() == Address::literal) { 2086 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 2087 lea(rscratch2, dst); 2088 dst = Address(rscratch2); 2089 } 2090 ldr(rscratch1, dst); 2091 increment(rscratch1, value); 2092 str(rscratch1, dst); 2093 } 2094 2095 2096 void MacroAssembler::pusha() { 2097 
push(0x7fffffff, sp); 2098 } 2099 2100 void MacroAssembler::popa() { 2101 pop(0x7fffffff, sp); 2102 } 2103 2104 // Push lots of registers in the bit set supplied. Don't push sp. 2105 // Return the number of words pushed 2106 int MacroAssembler::push(unsigned int bitset, Register stack) { 2107 int words_pushed = 0; 2108 2109 // Scan bitset to accumulate register pairs 2110 unsigned char regs[32]; 2111 int count = 0; 2112 for (int reg = 0; reg <= 30; reg++) { 2113 if (1 & bitset) 2114 regs[count++] = reg; 2115 bitset >>= 1; 2116 } 2117 regs[count++] = zr->encoding_nocheck(); 2118 count &= ~1; // Only push an even nuber of regs 2119 2120 if (count) { 2121 stp(as_Register(regs[0]), as_Register(regs[1]), 2122 Address(pre(stack, -count * wordSize))); 2123 words_pushed += 2; 2124 } 2125 for (int i = 2; i < count; i += 2) { 2126 stp(as_Register(regs[i]), as_Register(regs[i+1]), 2127 Address(stack, i * wordSize)); 2128 words_pushed += 2; 2129 } 2130 2131 assert(words_pushed == count, "oops, pushed != count"); 2132 2133 return count; 2134 } 2135 2136 int MacroAssembler::pop(unsigned int bitset, Register stack) { 2137 int words_pushed = 0; 2138 2139 // Scan bitset to accumulate register pairs 2140 unsigned char regs[32]; 2141 int count = 0; 2142 for (int reg = 0; reg <= 30; reg++) { 2143 if (1 & bitset) 2144 regs[count++] = reg; 2145 bitset >>= 1; 2146 } 2147 regs[count++] = zr->encoding_nocheck(); 2148 count &= ~1; 2149 2150 for (int i = 2; i < count; i += 2) { 2151 ldp(as_Register(regs[i]), as_Register(regs[i+1]), 2152 Address(stack, i * wordSize)); 2153 words_pushed += 2; 2154 } 2155 if (count) { 2156 ldp(as_Register(regs[0]), as_Register(regs[1]), 2157 Address(post(stack, count * wordSize))); 2158 words_pushed += 2; 2159 } 2160 2161 assert(words_pushed == count, "oops, pushed != count"); 2162 2163 return count; 2164 } 2165 #ifdef ASSERT 2166 void MacroAssembler::verify_heapbase(const char* msg) { 2167 #if 0 2168 assert (UseCompressedOops || UseCompressedClassPointers, 
"should be compressed"); 2169 assert (Universe::heap() != NULL, "java heap should be initialized"); 2170 if (CheckCompressedOops) { 2171 Label ok; 2172 push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1 2173 cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr())); 2174 br(Assembler::EQ, ok); 2175 stop(msg); 2176 bind(ok); 2177 pop(1 << rscratch1->encoding(), sp); 2178 } 2179 #endif 2180 } 2181 #endif 2182 2183 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) { 2184 Label done, not_weak; 2185 cbz(value, done); // Use NULL as-is. 2186 2187 STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u); 2188 tbz(r0, 0, not_weak); // Test for jweak tag. 2189 2190 // Resolve jweak. 2191 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value, 2192 Address(value, -JNIHandles::weak_tag_value), tmp, thread); 2193 verify_oop(value); 2194 b(done); 2195 2196 bind(not_weak); 2197 // Resolve (untagged) jobject. 2198 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread); 2199 verify_oop(value); 2200 bind(done); 2201 } 2202 2203 void MacroAssembler::stop(const char* msg) { 2204 address ip = pc(); 2205 pusha(); 2206 mov(c_rarg0, (address)msg); 2207 mov(c_rarg1, (address)ip); 2208 mov(c_rarg2, sp); 2209 mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 2210 // call(c_rarg3); 2211 blrt(c_rarg3, 3, 0, 1); 2212 hlt(0); 2213 } 2214 2215 void MacroAssembler::unimplemented(const char* what) { 2216 const char* buf = NULL; 2217 { 2218 ResourceMark rm; 2219 stringStream ss; 2220 ss.print("unimplemented: %s", what); 2221 buf = code_string(ss.as_string()); 2222 } 2223 stop(buf); 2224 } 2225 2226 // If a constant does not fit in an immediate field, generate some 2227 // number of MOV instructions and then perform the operation. 
// Apply insn1 (an immediate-form add/sub) when the immediate is
// encodable; otherwise either split a <2^24 immediate into two 12-bit
// chunks, or materialize it in Rd and use the register form (insn2).
void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
                                           add_sub_imm_insn insn1,
                                           add_sub_reg_insn insn2) {
  assert(Rd != zr, "Rd = zr and not setting flags?");
  if (operand_valid_for_add_sub_immediate((int)imm)) {
    (this->*insn1)(Rd, Rn, imm);
  } else {
    if (uabs(imm) < (1 << 24)) {
       // Two 12-bit immediates: high chunk first, then low chunk.
       (this->*insn1)(Rd, Rn, imm & -(1 << 12));
       (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
    } else {
       assert_different_registers(Rd, Rn);
       mov(Rd, (uint64_t)imm);
       (this->*insn2)(Rd, Rn, Rd, LSL, 0);
    }
  }
}

// Separate version which sets the flags.  Optimisations are more restricted
// because we must set the flags correctly (splitting into two adds
// would set flags from the partial result).
void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
                                             add_sub_imm_insn insn1,
                                             add_sub_reg_insn insn2) {
  if (operand_valid_for_add_sub_immediate((int)imm)) {
    (this->*insn1)(Rd, Rn, imm);
  } else {
    assert_different_registers(Rd, Rn);
    assert(Rd != zr, "overflow in immediate operand");
    mov(Rd, (uint64_t)imm);
    (this->*insn2)(Rd, Rn, Rd, LSL, 0);
  }
}


// RegisterOrConstant convenience overloads: dispatch to the register
// or immediate form of add/sub.
void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
  if (increment.is_register()) {
    add(Rd, Rn, increment.as_register());
  } else {
    add(Rd, Rn, increment.as_constant());
  }
}

void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
  if (increment.is_register()) {
    addw(Rd, Rn, increment.as_register());
  } else {
    addw(Rd, Rn, increment.as_constant());
  }
}

void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
  if (decrement.is_register()) {
    sub(Rd, Rn, decrement.as_register());
  } else {
    sub(Rd, Rn, decrement.as_constant());
  }
}

void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
  if (decrement.is_register()) {
    subw(Rd, Rn, decrement.as_register());
  } else {
    subw(Rd, Rn, decrement.as_constant());
  }
}

// Reload rheapbase with the compressed-oops base.  Before the heap is
// fully initialized the base must be loaded indirectly from its cell.
void MacroAssembler::reinit_heapbase()
{
  if (UseCompressedOops) {
    if (Universe::is_fully_initialized()) {
      mov(rheapbase, Universe::narrow_ptrs_base());
    } else {
      lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
      ldr(rheapbase, Address(rheapbase));
    }
  }
}

// this simulates the behaviour of the x86 cmpxchg instruction using a
// load linked/store conditional pair. we use the acquire/release
// versions of these instructions so that we flush pending writes as
// per Java semantics.

// n.b the x86 version assumes the old value to be compared against is
// in rax and updates rax with the value located in memory if the
// cmpxchg fails. we supply a register for the old value explicitly

// the aarch64 load linked/store conditional instructions do not
// accept an offset. so, unlike x86, we must provide a plain register
// to identify the memory word to be compared/exchanged rather than a
// register+offset Address.
// 64-bit compare-and-exchange of the word at [addr]: branches to
// `succeed` when the swap happens, otherwise returns the observed
// value in oldv and falls through (or branches to *fail).  Uses CASAL
// under LSE, else an LDAXR/STLXR retry loop.  Clobbers tmp.
void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
                                Label &succeed, Label *fail) {
  // oldv holds comparison value
  // newv holds value to write in exchange
  // addr identifies memory word to compare against/update
  if (UseLSE) {
    mov(tmp, oldv);
    casal(Assembler::xword, oldv, newv, addr);
    cmp(tmp, oldv);
    br(Assembler::EQ, succeed);
    // Failure: full barrier to preserve the old LL/SC ordering.
    membar(AnyAny);
  } else {
    Label retry_load, nope;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    // flush and load exclusive from the memory location
    // and fail if it is not what we expect
    ldaxr(tmp, addr);
    cmp(tmp, oldv);
    br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
    stlxr(tmp, newv, addr);
    cbzw(tmp, succeed);
    // retry so we only ever return after a load fails to compare
    // ensures we don't return a stale value after a failed write.
    b(retry_load);
    // if the memory word differs we return it in oldv and signal a fail
    bind(nope);
    membar(AnyAny);
    mov(oldv, tmp);
  }
  if (fail)
    b(*fail);
}

// CAS on an object's mark word (header at offset 0).
void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
                                        Label &succeed, Label *fail) {
  assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
  cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
}

// 32-bit variant of cmpxchgptr.
void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
                              Label &succeed, Label *fail) {
  // oldv holds comparison value
  // newv holds value to write in exchange
  // addr identifies memory word to compare against/update
  // tmp returns 0/1 for success/failure
  if (UseLSE) {
    mov(tmp, oldv);
    casal(Assembler::word, oldv, newv, addr);
    cmp(tmp, oldv);
    br(Assembler::EQ, succeed);
    membar(AnyAny);
  } else {
    Label retry_load, nope;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    // flush and load exclusive from the memory location
    // and fail if it is not what we expect
    ldaxrw(tmp, addr);
    cmp(tmp, oldv);
    br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
    stlxrw(tmp, newv, addr);
    cbzw(tmp, succeed);
    // retry so we only ever return after a load fails to compare
    // ensures we don't return a stale value after a failed write.
    b(retry_load);
    // if the memory word differs we return it in oldv and signal a fail
    bind(nope);
    membar(AnyAny);
    mov(oldv, tmp);
  }
  if (fail)
    b(*fail);
}

// A generic CAS; success or failure is in the EQ flag.  A weak CAS
// doesn't retry and may fail spuriously.  If the oldval is wanted,
// Pass a register for the result, otherwise pass noreg.
// Clobbers rscratch1
void MacroAssembler::cmpxchg(Register addr, Register expected,
                             Register new_val,
                             enum operand_size size,
                             bool acquire, bool release,
                             bool weak,
                             Register result) {
  if (result == noreg)  result = rscratch1;
  BLOCK_COMMENT("cmpxchg {");
  if (UseLSE) {
    mov(result, expected);
    lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
    compare_eq(result, expected, size);
  } else {
    Label retry_load, done;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    load_exclusive(result, addr, size, acquire);
    compare_eq(result, expected, size);
    br(Assembler::NE, done);
    store_exclusive(rscratch1, new_val, addr, size, release);
    if (weak) {
      // Weak CAS: no retry loop; report the store-exclusive status.
      cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
    } else {
      cbnzw(rscratch1, retry_load);
    }
    bind(done);
  }
  BLOCK_COMMENT("} cmpxchg");
}

// A generic comparison. Only compares for equality, clobbers rscratch1.
// Sub-word sizes XOR the operands and test only the low bits so that
// garbage in the upper bits does not affect the result.
void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) {
  if (size == xword) {
    cmp(rm, rn);
  } else if (size == word) {
    cmpw(rm, rn);
  } else if (size == halfword) {
    eorw(rscratch1, rm, rn);
    ands(zr, rscratch1, 0xffff);
  } else if (size == byte) {
    eorw(rscratch1, rm, rn);
    ands(zr, rscratch1, 0xff);
  } else {
    ShouldNotReachHere();
  }
}


// True iff `a` can safely serve as the LL/SC result register, i.e. it
// aliases neither the increment operand nor the address register.
static bool different(Register a, RegisterOrConstant b, Register c) {
  if (b.is_constant())
    return a != c;
  else
    return a != b.as_register() && a != c && b.as_register() != c;
}

// Expands to atomic fetch-and-OP routines.  prev (may be noreg)
// receives the old value; LSE uses AOP directly, otherwise an
// LDXR/STXR loop computes OP and, if prev aliases the loop's result
// register choice, recovers the old value with the inverse op IOP.
#define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
  if (UseLSE) {                                                         \
    prev = prev->is_valid() ? prev : zr;                                \
    if (incr.is_register()) {                                           \
      AOP(sz, incr.as_register(), prev, addr);                          \
    } else {                                                            \
      mov(rscratch2, incr.as_constant());                               \
      AOP(sz, rscratch2, prev, addr);                                   \
    }                                                                   \
    return;                                                             \
  }                                                                     \
  Register result = rscratch2;                                          \
  if (prev->is_valid())                                                 \
    result = different(prev, incr, addr) ? prev : rscratch2;            \
                                                                        \
  Label retry_load;                                                     \
  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
    prfm(Address(addr), PSTL1STRM);                                     \
  bind(retry_load);                                                     \
  LDXR(result, addr);                                                   \
  OP(rscratch1, result, incr);                                          \
  STXR(rscratch2, rscratch1, addr);                                     \
  cbnzw(rscratch2, retry_load);                                         \
  if (prev->is_valid() && prev != result) {                             \
    IOP(prev, rscratch1, incr);                                         \
  }                                                                     \
}

ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)

#undef ATOMIC_OP

// Expands to atomic exchange routines; same structure as ATOMIC_OP but
// stores newv unconditionally and returns the old value in prev.
#define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
  if (UseLSE) {                                                         \
    prev = prev->is_valid() ? prev : zr;                                \
    AOP(sz, newv, prev, addr);                                          \
    return;                                                             \
  }                                                                     \
  Register result = rscratch2;                                          \
  if (prev->is_valid())                                                 \
    result = different(prev, newv, addr) ? prev : rscratch2;            \
                                                                        \
  Label retry_load;                                                     \
  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
    prfm(Address(addr), PSTL1STRM);                                     \
  bind(retry_load);                                                     \
  LDXR(result, addr);                                                   \
  STXR(rscratch1, newv, addr);                                          \
  cbnzw(rscratch1, retry_load);                                         \
  if (prev->is_valid() && prev != result)                               \
    mov(prev, result);                                                  \
}

ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)

#undef ATOMIC_XCHG

#ifndef PRODUCT
extern "C" void findpc(intptr_t x);
#endif

// Runtime target of MacroAssembler::stop(): optionally shows a message
// box and dumps pc plus the register-save area built by pusha().
void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
{
  // In order to get locks to work, we need to fake an in_VM state
  if (ShowMessageBoxOnError ) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
#ifndef PRODUCT
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
#endif
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      ttyLocker ttyl;
      tty->print_cr(" pc = 0x%016lx", pc);
#ifndef PRODUCT
      tty->cr();
      findpc(pc);
      tty->cr();
#endif
      // NOTE(review): regs[29] (fp) is never printed below — confirm
      // whether that omission is intentional.
      tty->print_cr(" r0 = 0x%016lx", regs[0]);
      tty->print_cr(" r1 = 0x%016lx", regs[1]);
      tty->print_cr(" r2 = 0x%016lx", regs[2]);
      tty->print_cr(" r3 = 0x%016lx", regs[3]);
      tty->print_cr(" r4 = 0x%016lx", regs[4]);
      tty->print_cr(" r5 = 0x%016lx", regs[5]);
      tty->print_cr(" r6 = 0x%016lx", regs[6]);
      tty->print_cr(" r7 = 0x%016lx", regs[7]);
      tty->print_cr(" r8 = 0x%016lx", regs[8]);
      tty->print_cr(" r9 = 0x%016lx", regs[9]);
      tty->print_cr("r10 = 0x%016lx", regs[10]);
      tty->print_cr("r11 = 0x%016lx", regs[11]);
      tty->print_cr("r12 = 0x%016lx", regs[12]);
      tty->print_cr("r13 = 0x%016lx", regs[13]);
      tty->print_cr("r14 = 0x%016lx", regs[14]);
      tty->print_cr("r15 = 0x%016lx", regs[15]);
      tty->print_cr("r16 = 0x%016lx", regs[16]);
      tty->print_cr("r17 = 0x%016lx", regs[17]);
      tty->print_cr("r18 = 0x%016lx", regs[18]);
      tty->print_cr("r19 = 0x%016lx", regs[19]);
      tty->print_cr("r20 = 0x%016lx", regs[20]);
      tty->print_cr("r21 = 0x%016lx", regs[21]);
      tty->print_cr("r22 = 0x%016lx", regs[22]);
      tty->print_cr("r23 = 0x%016lx", regs[23]);
      tty->print_cr("r24 = 0x%016lx", regs[24]);
      tty->print_cr("r25 = 0x%016lx", regs[25]);
      tty->print_cr("r26 = 0x%016lx", regs[26]);
      tty->print_cr("r27 = 0x%016lx", regs[27]);
      tty->print_cr("r28 = 0x%016lx", regs[28]);
      tty->print_cr("r30 = 0x%016lx", regs[30]);
      tty->print_cr("r31 = 0x%016lx", regs[31]);
      BREAKPOINT;
    }
    ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
  } else {
    ttyLocker ttyl;
    ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
                    msg);
    assert(false, "DEBUG MESSAGE: %s", msg);
  }
}

#ifdef BUILTIN_SIM
// routine to generate an x86 prolog for a stub function which
// bootstraps into the generated ARM code which directly follows the
// stub
//
// the argument encodes the number of general and fp registers
// passed by the caller and the calling convention (currently just
// the number of general registers and assumes C argument passing)

extern "C" {
int aarch64_stub_prolog_size();
void aarch64_stub_prolog();
void aarch64_prolog();
}

void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type,
                                   address *prolog_ptr)
{
  int calltype = (((ret_type & 0x3) << 8) |
                  ((fp_arg_count & 0xf) << 4) |
                  (gp_arg_count & 0xf));

  // the addresses for the x86 to ARM entry code we need to use
  address start = pc();
  // printf("start = %lx\n", start);
  int byteCount =  aarch64_stub_prolog_size();
  // printf("byteCount = %x\n", byteCount);
  int instructionCount = (byteCount + 3)/ 4;
  // printf("instructionCount = %x\n", instructionCount);
  // Reserve space with nops, then overwrite it with the prolog code.
  for (int i = 0; i < instructionCount; i++) {
    nop();
  }

  memcpy(start, (void*)aarch64_stub_prolog, byteCount);

  // write the address of the setup routine and the call format at the
  // end of into the copied code
  u_int64_t *patch_end = (u_int64_t *)(start + byteCount);
  if (prolog_ptr)
    patch_end[-2] = (u_int64_t)prolog_ptr;
  patch_end[-1] = calltype;
}
#endif

// Save all call-clobbered registers (r0-r18 minus the scratches, plus
// v0-v7 and v16-v31) on the real stack.
void MacroAssembler::push_call_clobbered_registers() {
  int step = 4 * wordSize;
  push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
  sub(sp, sp, step);
  mov(rscratch1, -step);
  // Push v0-v7, v16-v31.
  for (int i = 31; i>= 4; i -= 4) {
    if (i <= v7->encoding() || i >= v16->encoding())
      st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
          as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
  }
  st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
      as_FloatRegister(3), T1D, Address(sp));
}

// Inverse of push_call_clobbered_registers().
void MacroAssembler::pop_call_clobbered_registers() {
  for (int i = 0; i < 32; i += 4) {
    if (i <= v7->encoding() || i >= v16->encoding())
      ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
          as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
  }

  pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
}

// Save all integer registers (except lr/sp) and v0-v31; full 128-bit
// vector lanes when save_vectors, else the low 64 bits only.
void MacroAssembler::push_CPU_state(bool save_vectors) {
  int step = (save_vectors ? 8 : 4) * wordSize;
  push(0x3fffffff, sp);         // integer registers except lr & sp
  mov(rscratch1, -step);
  sub(sp, sp, step);
  for (int i = 28; i >= 4; i -= 4) {
    st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
        as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
  }
  st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
}

// Inverse of push_CPU_state(); restore_vectors must match the save.
void MacroAssembler::pop_CPU_state(bool restore_vectors) {
  int step = (restore_vectors ? 8 : 4) * wordSize;
  for (int i = 0; i <= 28; i += 4)
    ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
        as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
  pop(0x3fffffff, sp);          // integer registers except lr & sp
}

/**
 * Helpers for multiply_to_len().
 */
// (dest_hi:dest_lo) += src1 + src2, with the final high word written
// to final_dest_hi (carry propagated via adc against zr).
void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
                                     Register src1, Register src2) {
  adds(dest_lo, dest_lo, src1);
  adc(dest_hi, dest_hi, zr);
  adds(dest_lo, dest_lo, src2);
  adc(final_dest_hi, dest_hi, zr);
}

// Generate an address from (r + r1 extend offset).  "size" is the
// size of the operand.  The result may be in rscratch2.
Address MacroAssembler::offsetted_address(Register r, Register r1,
                                          Address::extend ext, int offset, int size) {
  if (offset || (ext.shift() % size != 0)) {
    // Can't fold both extend and offset into one Address: compute the
    // extended base in rscratch2 first.
    lea(rscratch2, Address(r, r1, ext));
    return Address(rscratch2, offset);
  } else {
    return Address(r, r1, ext);
  }
}

// Build an Address for a spill slot at sp + offset, using tmp as a
// base register when the offset is out of immediate range.
Address MacroAssembler::spill_address(int size, int offset, Register tmp)
{
  assert(offset >= 0, "spill to negative address?");
  // Offset reachable ?
  //   Not aligned - 9 bits signed offset
  //   Aligned - 12 bits unsigned offset shifted
  Register base = sp;
  if ((offset & (size-1)) && offset >= (1<<8)) {
    // Misaligned and beyond the 9-bit signed range: fold the low 12
    // bits into the base register.
    add(tmp, base, offset & ((1<<12)-1));
    base = tmp;
    offset &= -1<<12;
  }

  if (offset >= (1<<12) * size) {
    // Beyond the scaled 12-bit unsigned range: fold the next 12 bits in.
    add(tmp, base, offset & (((1<<12)-1)<<12));
    base = tmp;
    offset &= ~(((1<<12)-1)<<12);
  }

  return Address(base, offset);
}

// Checks whether offset is aligned.
// Returns true if it is, else false.
bool MacroAssembler::merge_alignment_check(Register base,
                                           size_t size,
                                           long cur_offset,
                                           long prev_offset) const {
  if (AvoidUnalignedAccesses) {
    if (base == sp) {
      // Checks whether low offset if aligned to pair of registers.
      long pair_mask = size * 2 - 1;
      long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
      return (offset & pair_mask) == 0;
    } else { // If base is not sp, we can't guarantee the access is aligned.
      return false;
    }
  } else {
    long mask = size - 1;
    // Load/store pair instruction only supports element size aligned offset.
    return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
  }
}

// Checks whether current and previous loads/stores can be merged.
// Returns true if it can be merged, else false.
2750 bool MacroAssembler::ldst_can_merge(Register rt, 2751 const Address &adr, 2752 size_t cur_size_in_bytes, 2753 bool is_store) const { 2754 address prev = pc() - NativeInstruction::instruction_size; 2755 address last = code()->last_insn(); 2756 2757 if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) { 2758 return false; 2759 } 2760 2761 if (adr.getMode() != Address::base_plus_offset || prev != last) { 2762 return false; 2763 } 2764 2765 NativeLdSt* prev_ldst = NativeLdSt_at(prev); 2766 size_t prev_size_in_bytes = prev_ldst->size_in_bytes(); 2767 2768 assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging."); 2769 assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging."); 2770 2771 if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) { 2772 return false; 2773 } 2774 2775 long max_offset = 63 * prev_size_in_bytes; 2776 long min_offset = -64 * prev_size_in_bytes; 2777 2778 assert(prev_ldst->is_not_pre_post_index(), "pre-index or post-index is not supported to be merged."); 2779 2780 // Only same base can be merged. 2781 if (adr.base() != prev_ldst->base()) { 2782 return false; 2783 } 2784 2785 long cur_offset = adr.offset(); 2786 long prev_offset = prev_ldst->offset(); 2787 size_t diff = abs(cur_offset - prev_offset); 2788 if (diff != prev_size_in_bytes) { 2789 return false; 2790 } 2791 2792 // Following cases can not be merged: 2793 // ldr x2, [x2, #8] 2794 // ldr x3, [x2, #16] 2795 // or: 2796 // ldr x2, [x3, #8] 2797 // ldr x2, [x3, #16] 2798 // If t1 and t2 is the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL. 2799 if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) { 2800 return false; 2801 } 2802 2803 long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset; 2804 // Offset range must be in ldp/stp instruction's range. 
2805 if (low_offset > max_offset || low_offset < min_offset) { 2806 return false; 2807 } 2808 2809 if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) { 2810 return true; 2811 } 2812 2813 return false; 2814 } 2815 2816 // Merge current load/store with previous load/store into ldp/stp. 2817 void MacroAssembler::merge_ldst(Register rt, 2818 const Address &adr, 2819 size_t cur_size_in_bytes, 2820 bool is_store) { 2821 2822 assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store) == true, "cur and prev must be able to be merged."); 2823 2824 Register rt_low, rt_high; 2825 address prev = pc() - NativeInstruction::instruction_size; 2826 NativeLdSt* prev_ldst = NativeLdSt_at(prev); 2827 2828 long offset; 2829 2830 if (adr.offset() < prev_ldst->offset()) { 2831 offset = adr.offset(); 2832 rt_low = rt; 2833 rt_high = prev_ldst->target(); 2834 } else { 2835 offset = prev_ldst->offset(); 2836 rt_low = prev_ldst->target(); 2837 rt_high = rt; 2838 } 2839 2840 Address adr_p = Address(prev_ldst->base(), offset); 2841 // Overwrite previous generated binary. 2842 code_section()->set_end(prev); 2843 2844 const int sz = prev_ldst->size_in_bytes(); 2845 assert(sz == 8 || sz == 4, "only supports 64/32bit merging."); 2846 if (!is_store) { 2847 BLOCK_COMMENT("merged ldr pair"); 2848 if (sz == 8) { 2849 ldp(rt_low, rt_high, adr_p); 2850 } else { 2851 ldpw(rt_low, rt_high, adr_p); 2852 } 2853 } else { 2854 BLOCK_COMMENT("merged str pair"); 2855 if (sz == 8) { 2856 stp(rt_low, rt_high, adr_p); 2857 } else { 2858 stpw(rt_low, rt_high, adr_p); 2859 } 2860 } 2861 } 2862 2863 /** 2864 * Multiply 64 bit by 64 bit first loop. 
2865 */ 2866 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart, 2867 Register y, Register y_idx, Register z, 2868 Register carry, Register product, 2869 Register idx, Register kdx) { 2870 // 2871 // jlong carry, x[], y[], z[]; 2872 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) { 2873 // huge_128 product = y[idx] * x[xstart] + carry; 2874 // z[kdx] = (jlong)product; 2875 // carry = (jlong)(product >>> 64); 2876 // } 2877 // z[xstart] = carry; 2878 // 2879 2880 Label L_first_loop, L_first_loop_exit; 2881 Label L_one_x, L_one_y, L_multiply; 2882 2883 subsw(xstart, xstart, 1); 2884 br(Assembler::MI, L_one_x); 2885 2886 lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt))); 2887 ldr(x_xstart, Address(rscratch1)); 2888 ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian 2889 2890 bind(L_first_loop); 2891 subsw(idx, idx, 1); 2892 br(Assembler::MI, L_first_loop_exit); 2893 subsw(idx, idx, 1); 2894 br(Assembler::MI, L_one_y); 2895 lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt))); 2896 ldr(y_idx, Address(rscratch1)); 2897 ror(y_idx, y_idx, 32); // convert big-endian to little-endian 2898 bind(L_multiply); 2899 2900 // AArch64 has a multiply-accumulate instruction that we can't use 2901 // here because it has no way to process carries, so we have to use 2902 // separate add and adc instructions. Bah. 
2903 umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product 2904 mul(product, x_xstart, y_idx); 2905 adds(product, product, carry); 2906 adc(carry, rscratch1, zr); // x_xstart * y_idx + carry -> carry:product 2907 2908 subw(kdx, kdx, 2); 2909 ror(product, product, 32); // back to big-endian 2910 str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong)); 2911 2912 b(L_first_loop); 2913 2914 bind(L_one_y); 2915 ldrw(y_idx, Address(y, 0)); 2916 b(L_multiply); 2917 2918 bind(L_one_x); 2919 ldrw(x_xstart, Address(x, 0)); 2920 b(L_first_loop); 2921 2922 bind(L_first_loop_exit); 2923 } 2924 2925 /** 2926 * Multiply 128 bit by 128. Unrolled inner loop. 2927 * 2928 */ 2929 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z, 2930 Register carry, Register carry2, 2931 Register idx, Register jdx, 2932 Register yz_idx1, Register yz_idx2, 2933 Register tmp, Register tmp3, Register tmp4, 2934 Register tmp6, Register product_hi) { 2935 2936 // jlong carry, x[], y[], z[]; 2937 // int kdx = ystart+1; 2938 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 2939 // huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry; 2940 // jlong carry2 = (jlong)(tmp3 >>> 64); 2941 // huge_128 tmp4 = (y[idx] * product_hi) + z[kdx+idx] + carry2; 2942 // carry = (jlong)(tmp4 >>> 64); 2943 // z[kdx+idx+1] = (jlong)tmp3; 2944 // z[kdx+idx] = (jlong)tmp4; 2945 // } 2946 // idx += 2; 2947 // if (idx > 0) { 2948 // yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry; 2949 // z[kdx+idx] = (jlong)yz_idx1; 2950 // carry = (jlong)(yz_idx1 >>> 64); 2951 // } 2952 // 2953 2954 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 2955 2956 lsrw(jdx, idx, 2); 2957 2958 bind(L_third_loop); 2959 2960 subsw(jdx, jdx, 1); 2961 br(Assembler::MI, L_third_loop_exit); 2962 subw(idx, idx, 4); 2963 2964 lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt))); 2965 2966 ldp(yz_idx2, yz_idx1, Address(rscratch1, 0)); 2967 
2968 lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt))); 2969 2970 ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian 2971 ror(yz_idx2, yz_idx2, 32); 2972 2973 ldp(rscratch2, rscratch1, Address(tmp6, 0)); 2974 2975 mul(tmp3, product_hi, yz_idx1); // yz_idx1 * product_hi -> tmp4:tmp3 2976 umulh(tmp4, product_hi, yz_idx1); 2977 2978 ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian 2979 ror(rscratch2, rscratch2, 32); 2980 2981 mul(tmp, product_hi, yz_idx2); // yz_idx2 * product_hi -> carry2:tmp 2982 umulh(carry2, product_hi, yz_idx2); 2983 2984 // propagate sum of both multiplications into carry:tmp4:tmp3 2985 adds(tmp3, tmp3, carry); 2986 adc(tmp4, tmp4, zr); 2987 adds(tmp3, tmp3, rscratch1); 2988 adcs(tmp4, tmp4, tmp); 2989 adc(carry, carry2, zr); 2990 adds(tmp4, tmp4, rscratch2); 2991 adc(carry, carry, zr); 2992 2993 ror(tmp3, tmp3, 32); // convert little-endian to big-endian 2994 ror(tmp4, tmp4, 32); 2995 stp(tmp4, tmp3, Address(tmp6, 0)); 2996 2997 b(L_third_loop); 2998 bind (L_third_loop_exit); 2999 3000 andw (idx, idx, 0x3); 3001 cbz(idx, L_post_third_loop_done); 3002 3003 Label L_check_1; 3004 subsw(idx, idx, 2); 3005 br(Assembler::MI, L_check_1); 3006 3007 lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt))); 3008 ldr(yz_idx1, Address(rscratch1, 0)); 3009 ror(yz_idx1, yz_idx1, 32); 3010 mul(tmp3, product_hi, yz_idx1); // yz_idx1 * product_hi -> tmp4:tmp3 3011 umulh(tmp4, product_hi, yz_idx1); 3012 lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt))); 3013 ldr(yz_idx2, Address(rscratch1, 0)); 3014 ror(yz_idx2, yz_idx2, 32); 3015 3016 add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2); 3017 3018 ror(tmp3, tmp3, 32); 3019 str(tmp3, Address(rscratch1, 0)); 3020 3021 bind (L_check_1); 3022 3023 andw (idx, idx, 0x1); 3024 subsw(idx, idx, 1); 3025 br(Assembler::MI, L_post_third_loop_done); 3026 ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt))); 3027 mul(tmp3, tmp4, product_hi); // tmp4 * 
product_hi -> carry2:tmp3 3028 umulh(carry2, tmp4, product_hi); 3029 ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt))); 3030 3031 add2_with_carry(carry2, tmp3, tmp4, carry); 3032 3033 strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt))); 3034 extr(carry, carry2, tmp3, 32); 3035 3036 bind(L_post_third_loop_done); 3037 } 3038 3039 /** 3040 * Code for BigInteger::multiplyToLen() instrinsic. 3041 * 3042 * r0: x 3043 * r1: xlen 3044 * r2: y 3045 * r3: ylen 3046 * r4: z 3047 * r5: zlen 3048 * r10: tmp1 3049 * r11: tmp2 3050 * r12: tmp3 3051 * r13: tmp4 3052 * r14: tmp5 3053 * r15: tmp6 3054 * r16: tmp7 3055 * 3056 */ 3057 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, 3058 Register z, Register zlen, 3059 Register tmp1, Register tmp2, Register tmp3, Register tmp4, 3060 Register tmp5, Register tmp6, Register product_hi) { 3061 3062 assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6); 3063 3064 const Register idx = tmp1; 3065 const Register kdx = tmp2; 3066 const Register xstart = tmp3; 3067 3068 const Register y_idx = tmp4; 3069 const Register carry = tmp5; 3070 const Register product = xlen; 3071 const Register x_xstart = zlen; // reuse register 3072 3073 // First Loop. 
3074 // 3075 // final static long LONG_MASK = 0xffffffffL; 3076 // int xstart = xlen - 1; 3077 // int ystart = ylen - 1; 3078 // long carry = 0; 3079 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) { 3080 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry; 3081 // z[kdx] = (int)product; 3082 // carry = product >>> 32; 3083 // } 3084 // z[xstart] = (int)carry; 3085 // 3086 3087 movw(idx, ylen); // idx = ylen; 3088 movw(kdx, zlen); // kdx = xlen+ylen; 3089 mov(carry, zr); // carry = 0; 3090 3091 Label L_done; 3092 3093 movw(xstart, xlen); 3094 subsw(xstart, xstart, 1); 3095 br(Assembler::MI, L_done); 3096 3097 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx); 3098 3099 Label L_second_loop; 3100 cbzw(kdx, L_second_loop); 3101 3102 Label L_carry; 3103 subw(kdx, kdx, 1); 3104 cbzw(kdx, L_carry); 3105 3106 strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt))); 3107 lsr(carry, carry, 32); 3108 subw(kdx, kdx, 1); 3109 3110 bind(L_carry); 3111 strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt))); 3112 3113 // Second and third (nested) loops. 
3114 // 3115 // for (int i = xstart-1; i >= 0; i--) { // Second loop 3116 // carry = 0; 3117 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop 3118 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) + 3119 // (z[k] & LONG_MASK) + carry; 3120 // z[k] = (int)product; 3121 // carry = product >>> 32; 3122 // } 3123 // z[i] = (int)carry; 3124 // } 3125 // 3126 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi 3127 3128 const Register jdx = tmp1; 3129 3130 bind(L_second_loop); 3131 mov(carry, zr); // carry = 0; 3132 movw(jdx, ylen); // j = ystart+1 3133 3134 subsw(xstart, xstart, 1); // i = xstart-1; 3135 br(Assembler::MI, L_done); 3136 3137 str(z, Address(pre(sp, -4 * wordSize))); 3138 3139 Label L_last_x; 3140 lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j 3141 subsw(xstart, xstart, 1); // i = xstart-1; 3142 br(Assembler::MI, L_last_x); 3143 3144 lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt))); 3145 ldr(product_hi, Address(rscratch1)); 3146 ror(product_hi, product_hi, 32); // convert big-endian to little-endian 3147 3148 Label L_third_loop_prologue; 3149 bind(L_third_loop_prologue); 3150 3151 str(ylen, Address(sp, wordSize)); 3152 stp(x, xstart, Address(sp, 2 * wordSize)); 3153 multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product, 3154 tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi); 3155 ldp(z, ylen, Address(post(sp, 2 * wordSize))); 3156 ldp(x, xlen, Address(post(sp, 2 * wordSize))); // copy old xstart -> xlen 3157 3158 addw(tmp3, xlen, 1); 3159 strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt))); 3160 subsw(tmp3, tmp3, 1); 3161 br(Assembler::MI, L_done); 3162 3163 lsr(carry, carry, 32); 3164 strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt))); 3165 b(L_second_loop); 3166 3167 // Next infrequent code is moved outside loops. 
3168 bind(L_last_x); 3169 ldrw(product_hi, Address(x, 0)); 3170 b(L_third_loop_prologue); 3171 3172 bind(L_done); 3173 } 3174 3175 // Code for BigInteger::mulAdd instrinsic 3176 // out = r0 3177 // in = r1 3178 // offset = r2 (already out.length-offset) 3179 // len = r3 3180 // k = r4 3181 // 3182 // pseudo code from java implementation: 3183 // carry = 0; 3184 // offset = out.length-offset - 1; 3185 // for (int j=len-1; j >= 0; j--) { 3186 // product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry; 3187 // out[offset--] = (int)product; 3188 // carry = product >>> 32; 3189 // } 3190 // return (int)carry; 3191 void MacroAssembler::mul_add(Register out, Register in, Register offset, 3192 Register len, Register k) { 3193 Label LOOP, END; 3194 // pre-loop 3195 cmp(len, zr); // cmp, not cbz/cbnz: to use condition twice => less branches 3196 csel(out, zr, out, Assembler::EQ); 3197 br(Assembler::EQ, END); 3198 add(in, in, len, LSL, 2); // in[j+1] address 3199 add(offset, out, offset, LSL, 2); // out[offset + 1] address 3200 mov(out, zr); // used to keep carry now 3201 BIND(LOOP); 3202 ldrw(rscratch1, Address(pre(in, -4))); 3203 madd(rscratch1, rscratch1, k, out); 3204 ldrw(rscratch2, Address(pre(offset, -4))); 3205 add(rscratch1, rscratch1, rscratch2); 3206 strw(rscratch1, Address(offset)); 3207 lsr(out, rscratch1, 32); 3208 subs(len, len, 1); 3209 br(Assembler::NE, LOOP); 3210 BIND(END); 3211 } 3212 3213 /** 3214 * Emits code to update CRC-32 with a byte value according to constants in table 3215 * 3216 * @param [in,out]crc Register containing the crc. 3217 * @param [in]val Register containing the byte to fold into the CRC. 3218 * @param [in]table Register containing the table of crc constants. 
3219 * 3220 * uint32_t crc; 3221 * val = crc_table[(val ^ crc) & 0xFF]; 3222 * crc = val ^ (crc >> 8); 3223 * 3224 */ 3225 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 3226 eor(val, val, crc); 3227 andr(val, val, 0xff); 3228 ldrw(val, Address(table, val, Address::lsl(2))); 3229 eor(crc, val, crc, Assembler::LSR, 8); 3230 } 3231 3232 /** 3233 * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3 3234 * 3235 * @param [in,out]crc Register containing the crc. 3236 * @param [in]v Register containing the 32-bit to fold into the CRC. 3237 * @param [in]table0 Register containing table 0 of crc constants. 3238 * @param [in]table1 Register containing table 1 of crc constants. 3239 * @param [in]table2 Register containing table 2 of crc constants. 3240 * @param [in]table3 Register containing table 3 of crc constants. 3241 * 3242 * uint32_t crc; 3243 * v = crc ^ v 3244 * crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24] 3245 * 3246 */ 3247 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp, 3248 Register table0, Register table1, Register table2, Register table3, 3249 bool upper) { 3250 eor(v, crc, v, upper ? LSR:LSL, upper ? 
32:0); 3251 uxtb(tmp, v); 3252 ldrw(crc, Address(table3, tmp, Address::lsl(2))); 3253 ubfx(tmp, v, 8, 8); 3254 ldrw(tmp, Address(table2, tmp, Address::lsl(2))); 3255 eor(crc, crc, tmp); 3256 ubfx(tmp, v, 16, 8); 3257 ldrw(tmp, Address(table1, tmp, Address::lsl(2))); 3258 eor(crc, crc, tmp); 3259 ubfx(tmp, v, 24, 8); 3260 ldrw(tmp, Address(table0, tmp, Address::lsl(2))); 3261 eor(crc, crc, tmp); 3262 } 3263 3264 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf, 3265 Register len, Register tmp0, Register tmp1, Register tmp2, 3266 Register tmp3) { 3267 Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit; 3268 assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3); 3269 3270 mvnw(crc, crc); 3271 3272 subs(len, len, 128); 3273 br(Assembler::GE, CRC_by64_pre); 3274 BIND(CRC_less64); 3275 adds(len, len, 128-32); 3276 br(Assembler::GE, CRC_by32_loop); 3277 BIND(CRC_less32); 3278 adds(len, len, 32-4); 3279 br(Assembler::GE, CRC_by4_loop); 3280 adds(len, len, 4); 3281 br(Assembler::GT, CRC_by1_loop); 3282 b(L_exit); 3283 3284 BIND(CRC_by32_loop); 3285 ldp(tmp0, tmp1, Address(post(buf, 16))); 3286 subs(len, len, 32); 3287 crc32x(crc, crc, tmp0); 3288 ldr(tmp2, Address(post(buf, 8))); 3289 crc32x(crc, crc, tmp1); 3290 ldr(tmp3, Address(post(buf, 8))); 3291 crc32x(crc, crc, tmp2); 3292 crc32x(crc, crc, tmp3); 3293 br(Assembler::GE, CRC_by32_loop); 3294 cmn(len, 32); 3295 br(Assembler::NE, CRC_less32); 3296 b(L_exit); 3297 3298 BIND(CRC_by4_loop); 3299 ldrw(tmp0, Address(post(buf, 4))); 3300 subs(len, len, 4); 3301 crc32w(crc, crc, tmp0); 3302 br(Assembler::GE, CRC_by4_loop); 3303 adds(len, len, 4); 3304 br(Assembler::LE, L_exit); 3305 BIND(CRC_by1_loop); 3306 ldrb(tmp0, Address(post(buf, 1))); 3307 subs(len, len, 1); 3308 crc32b(crc, crc, tmp0); 3309 br(Assembler::GT, CRC_by1_loop); 3310 b(L_exit); 3311 3312 BIND(CRC_by64_pre); 3313 sub(buf, buf, 8); 3314 ldp(tmp0, tmp1, Address(buf, 
8)); 3315 crc32x(crc, crc, tmp0); 3316 ldr(tmp2, Address(buf, 24)); 3317 crc32x(crc, crc, tmp1); 3318 ldr(tmp3, Address(buf, 32)); 3319 crc32x(crc, crc, tmp2); 3320 ldr(tmp0, Address(buf, 40)); 3321 crc32x(crc, crc, tmp3); 3322 ldr(tmp1, Address(buf, 48)); 3323 crc32x(crc, crc, tmp0); 3324 ldr(tmp2, Address(buf, 56)); 3325 crc32x(crc, crc, tmp1); 3326 ldr(tmp3, Address(pre(buf, 64))); 3327 3328 b(CRC_by64_loop); 3329 3330 align(CodeEntryAlignment); 3331 BIND(CRC_by64_loop); 3332 subs(len, len, 64); 3333 crc32x(crc, crc, tmp2); 3334 ldr(tmp0, Address(buf, 8)); 3335 crc32x(crc, crc, tmp3); 3336 ldr(tmp1, Address(buf, 16)); 3337 crc32x(crc, crc, tmp0); 3338 ldr(tmp2, Address(buf, 24)); 3339 crc32x(crc, crc, tmp1); 3340 ldr(tmp3, Address(buf, 32)); 3341 crc32x(crc, crc, tmp2); 3342 ldr(tmp0, Address(buf, 40)); 3343 crc32x(crc, crc, tmp3); 3344 ldr(tmp1, Address(buf, 48)); 3345 crc32x(crc, crc, tmp0); 3346 ldr(tmp2, Address(buf, 56)); 3347 crc32x(crc, crc, tmp1); 3348 ldr(tmp3, Address(pre(buf, 64))); 3349 br(Assembler::GE, CRC_by64_loop); 3350 3351 // post-loop 3352 crc32x(crc, crc, tmp2); 3353 crc32x(crc, crc, tmp3); 3354 3355 sub(len, len, 64); 3356 add(buf, buf, 8); 3357 cmn(len, 128); 3358 br(Assembler::NE, CRC_less64); 3359 BIND(L_exit); 3360 mvnw(crc, crc); 3361 } 3362 3363 /** 3364 * @param crc register containing existing CRC (32-bit) 3365 * @param buf register pointing to input byte buffer (byte*) 3366 * @param len register containing number of bytes 3367 * @param table register that will contain address of CRC table 3368 * @param tmp scratch register 3369 */ 3370 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, 3371 Register table0, Register table1, Register table2, Register table3, 3372 Register tmp, Register tmp2, Register tmp3) { 3373 Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit; 3374 unsigned long offset; 3375 3376 if (UseCRC32) { 3377 kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, 
table3); 3378 return; 3379 } 3380 3381 mvnw(crc, crc); 3382 3383 adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset); 3384 if (offset) add(table0, table0, offset); 3385 add(table1, table0, 1*256*sizeof(juint)); 3386 add(table2, table0, 2*256*sizeof(juint)); 3387 add(table3, table0, 3*256*sizeof(juint)); 3388 3389 if (UseNeon) { 3390 cmp(len, (u1)64); 3391 br(Assembler::LT, L_by16); 3392 eor(v16, T16B, v16, v16); 3393 3394 Label L_fold; 3395 3396 add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants 3397 3398 ld1(v0, v1, T2D, post(buf, 32)); 3399 ld1r(v4, T2D, post(tmp, 8)); 3400 ld1r(v5, T2D, post(tmp, 8)); 3401 ld1r(v6, T2D, post(tmp, 8)); 3402 ld1r(v7, T2D, post(tmp, 8)); 3403 mov(v16, T4S, 0, crc); 3404 3405 eor(v0, T16B, v0, v16); 3406 sub(len, len, 64); 3407 3408 BIND(L_fold); 3409 pmull(v22, T8H, v0, v5, T8B); 3410 pmull(v20, T8H, v0, v7, T8B); 3411 pmull(v23, T8H, v0, v4, T8B); 3412 pmull(v21, T8H, v0, v6, T8B); 3413 3414 pmull2(v18, T8H, v0, v5, T16B); 3415 pmull2(v16, T8H, v0, v7, T16B); 3416 pmull2(v19, T8H, v0, v4, T16B); 3417 pmull2(v17, T8H, v0, v6, T16B); 3418 3419 uzp1(v24, T8H, v20, v22); 3420 uzp2(v25, T8H, v20, v22); 3421 eor(v20, T16B, v24, v25); 3422 3423 uzp1(v26, T8H, v16, v18); 3424 uzp2(v27, T8H, v16, v18); 3425 eor(v16, T16B, v26, v27); 3426 3427 ushll2(v22, T4S, v20, T8H, 8); 3428 ushll(v20, T4S, v20, T4H, 8); 3429 3430 ushll2(v18, T4S, v16, T8H, 8); 3431 ushll(v16, T4S, v16, T4H, 8); 3432 3433 eor(v22, T16B, v23, v22); 3434 eor(v18, T16B, v19, v18); 3435 eor(v20, T16B, v21, v20); 3436 eor(v16, T16B, v17, v16); 3437 3438 uzp1(v17, T2D, v16, v20); 3439 uzp2(v21, T2D, v16, v20); 3440 eor(v17, T16B, v17, v21); 3441 3442 ushll2(v20, T2D, v17, T4S, 16); 3443 ushll(v16, T2D, v17, T2S, 16); 3444 3445 eor(v20, T16B, v20, v22); 3446 eor(v16, T16B, v16, v18); 3447 3448 uzp1(v17, T2D, v20, v16); 3449 uzp2(v21, T2D, v20, v16); 3450 eor(v28, T16B, v17, v21); 3451 3452 pmull(v22, T8H, v1, v5, T8B); 3453 pmull(v20, T8H, 
v1, v7, T8B); 3454 pmull(v23, T8H, v1, v4, T8B); 3455 pmull(v21, T8H, v1, v6, T8B); 3456 3457 pmull2(v18, T8H, v1, v5, T16B); 3458 pmull2(v16, T8H, v1, v7, T16B); 3459 pmull2(v19, T8H, v1, v4, T16B); 3460 pmull2(v17, T8H, v1, v6, T16B); 3461 3462 ld1(v0, v1, T2D, post(buf, 32)); 3463 3464 uzp1(v24, T8H, v20, v22); 3465 uzp2(v25, T8H, v20, v22); 3466 eor(v20, T16B, v24, v25); 3467 3468 uzp1(v26, T8H, v16, v18); 3469 uzp2(v27, T8H, v16, v18); 3470 eor(v16, T16B, v26, v27); 3471 3472 ushll2(v22, T4S, v20, T8H, 8); 3473 ushll(v20, T4S, v20, T4H, 8); 3474 3475 ushll2(v18, T4S, v16, T8H, 8); 3476 ushll(v16, T4S, v16, T4H, 8); 3477 3478 eor(v22, T16B, v23, v22); 3479 eor(v18, T16B, v19, v18); 3480 eor(v20, T16B, v21, v20); 3481 eor(v16, T16B, v17, v16); 3482 3483 uzp1(v17, T2D, v16, v20); 3484 uzp2(v21, T2D, v16, v20); 3485 eor(v16, T16B, v17, v21); 3486 3487 ushll2(v20, T2D, v16, T4S, 16); 3488 ushll(v16, T2D, v16, T2S, 16); 3489 3490 eor(v20, T16B, v22, v20); 3491 eor(v16, T16B, v16, v18); 3492 3493 uzp1(v17, T2D, v20, v16); 3494 uzp2(v21, T2D, v20, v16); 3495 eor(v20, T16B, v17, v21); 3496 3497 shl(v16, T2D, v28, 1); 3498 shl(v17, T2D, v20, 1); 3499 3500 eor(v0, T16B, v0, v16); 3501 eor(v1, T16B, v1, v17); 3502 3503 subs(len, len, 32); 3504 br(Assembler::GE, L_fold); 3505 3506 mov(crc, 0); 3507 mov(tmp, v0, T1D, 0); 3508 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3509 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3510 mov(tmp, v0, T1D, 1); 3511 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3512 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3513 mov(tmp, v1, T1D, 0); 3514 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3515 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3516 mov(tmp, v1, T1D, 1); 3517 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3518 update_word_crc32(crc, tmp, 
tmp2, table0, table1, table2, table3, true); 3519 3520 add(len, len, 32); 3521 } 3522 3523 BIND(L_by16); 3524 subs(len, len, 16); 3525 br(Assembler::GE, L_by16_loop); 3526 adds(len, len, 16-4); 3527 br(Assembler::GE, L_by4_loop); 3528 adds(len, len, 4); 3529 br(Assembler::GT, L_by1_loop); 3530 b(L_exit); 3531 3532 BIND(L_by4_loop); 3533 ldrw(tmp, Address(post(buf, 4))); 3534 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3); 3535 subs(len, len, 4); 3536 br(Assembler::GE, L_by4_loop); 3537 adds(len, len, 4); 3538 br(Assembler::LE, L_exit); 3539 BIND(L_by1_loop); 3540 subs(len, len, 1); 3541 ldrb(tmp, Address(post(buf, 1))); 3542 update_byte_crc32(crc, tmp, table0); 3543 br(Assembler::GT, L_by1_loop); 3544 b(L_exit); 3545 3546 align(CodeEntryAlignment); 3547 BIND(L_by16_loop); 3548 subs(len, len, 16); 3549 ldp(tmp, tmp3, Address(post(buf, 16))); 3550 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3551 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3552 update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false); 3553 update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true); 3554 br(Assembler::GE, L_by16_loop); 3555 adds(len, len, 16-4); 3556 br(Assembler::GE, L_by4_loop); 3557 adds(len, len, 4); 3558 br(Assembler::GT, L_by1_loop); 3559 BIND(L_exit); 3560 mvnw(crc, crc); 3561 } 3562 3563 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf, 3564 Register len, Register tmp0, Register tmp1, Register tmp2, 3565 Register tmp3) { 3566 Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit; 3567 assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3); 3568 3569 subs(len, len, 128); 3570 br(Assembler::GE, CRC_by64_pre); 3571 BIND(CRC_less64); 3572 adds(len, len, 128-32); 3573 br(Assembler::GE, CRC_by32_loop); 3574 BIND(CRC_less32); 3575 adds(len, len, 32-4); 3576 br(Assembler::GE, 
CRC_by4_loop); 3577 adds(len, len, 4); 3578 br(Assembler::GT, CRC_by1_loop); 3579 b(L_exit); 3580 3581 BIND(CRC_by32_loop); 3582 ldp(tmp0, tmp1, Address(post(buf, 16))); 3583 subs(len, len, 32); 3584 crc32cx(crc, crc, tmp0); 3585 ldr(tmp2, Address(post(buf, 8))); 3586 crc32cx(crc, crc, tmp1); 3587 ldr(tmp3, Address(post(buf, 8))); 3588 crc32cx(crc, crc, tmp2); 3589 crc32cx(crc, crc, tmp3); 3590 br(Assembler::GE, CRC_by32_loop); 3591 cmn(len, 32); 3592 br(Assembler::NE, CRC_less32); 3593 b(L_exit); 3594 3595 BIND(CRC_by4_loop); 3596 ldrw(tmp0, Address(post(buf, 4))); 3597 subs(len, len, 4); 3598 crc32cw(crc, crc, tmp0); 3599 br(Assembler::GE, CRC_by4_loop); 3600 adds(len, len, 4); 3601 br(Assembler::LE, L_exit); 3602 BIND(CRC_by1_loop); 3603 ldrb(tmp0, Address(post(buf, 1))); 3604 subs(len, len, 1); 3605 crc32cb(crc, crc, tmp0); 3606 br(Assembler::GT, CRC_by1_loop); 3607 b(L_exit); 3608 3609 BIND(CRC_by64_pre); 3610 sub(buf, buf, 8); 3611 ldp(tmp0, tmp1, Address(buf, 8)); 3612 crc32cx(crc, crc, tmp0); 3613 ldr(tmp2, Address(buf, 24)); 3614 crc32cx(crc, crc, tmp1); 3615 ldr(tmp3, Address(buf, 32)); 3616 crc32cx(crc, crc, tmp2); 3617 ldr(tmp0, Address(buf, 40)); 3618 crc32cx(crc, crc, tmp3); 3619 ldr(tmp1, Address(buf, 48)); 3620 crc32cx(crc, crc, tmp0); 3621 ldr(tmp2, Address(buf, 56)); 3622 crc32cx(crc, crc, tmp1); 3623 ldr(tmp3, Address(pre(buf, 64))); 3624 3625 b(CRC_by64_loop); 3626 3627 align(CodeEntryAlignment); 3628 BIND(CRC_by64_loop); 3629 subs(len, len, 64); 3630 crc32cx(crc, crc, tmp2); 3631 ldr(tmp0, Address(buf, 8)); 3632 crc32cx(crc, crc, tmp3); 3633 ldr(tmp1, Address(buf, 16)); 3634 crc32cx(crc, crc, tmp0); 3635 ldr(tmp2, Address(buf, 24)); 3636 crc32cx(crc, crc, tmp1); 3637 ldr(tmp3, Address(buf, 32)); 3638 crc32cx(crc, crc, tmp2); 3639 ldr(tmp0, Address(buf, 40)); 3640 crc32cx(crc, crc, tmp3); 3641 ldr(tmp1, Address(buf, 48)); 3642 crc32cx(crc, crc, tmp0); 3643 ldr(tmp2, Address(buf, 56)); 3644 crc32cx(crc, crc, tmp1); 3645 ldr(tmp3, Address(pre(buf, 
      64)));
    br(Assembler::GE, CRC_by64_loop);

    // post-loop
    crc32cx(crc, crc, tmp2);
    crc32cx(crc, crc, tmp3);

    sub(len, len, 64);
    add(buf, buf, 8);
    cmn(len, 128);
    br(Assembler::NE, CRC_less64);
  BIND(L_exit);
}

/**
 * Emit code computing CRC32C over [buf, buf+len) into crc, using the
 * hardware crc32c instructions (delegates to kernel_crc32c_using_crc32c).
 * The table and tmp registers are accepted for interface compatibility
 * but are not used by this hardware-based implementation.
 *
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register that will contain address of CRC table
 * @param tmp   scratch register
 */
void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
        Register table0, Register table1, Register table2, Register table3,
        Register tmp, Register tmp2, Register tmp3) {
  kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
}


// Emits a load of the flag byte at flag_addr and a forward branch that
// skips the code generated while this object is live; the destructor
// binds the branch target.  Clobbers rscratch1.
// NOTE(review): the 'value' argument is currently ignored -- the emitted
// code always skips when the flag byte is zero.
SkipIfEqual::SkipIfEqual(
    MacroAssembler* masm, const bool* flag_addr, bool value) {
  _masm = masm;
  unsigned long offset;
  _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
  _masm->ldrb(rscratch1, Address(rscratch1, offset));
  _masm->cbzw(rscratch1, _label);
}

SkipIfEqual::~SkipIfEqual() {
  _masm->bind(_label);
}

// Add the 32-bit immediate src to the word at memory location dst.
// Clobbers rscratch1 (the loaded value) and rscratch2 (address formation).
void MacroAssembler::addptr(const Address &dst, int32_t src) {
  Address adr;
  switch(dst.getMode()) {
  case Address::base_plus_offset:
    // This is the expected mode, although we allow all the other
    // forms below.
    adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
    break;
  default:
    lea(rscratch2, dst);
    adr = Address(rscratch2);
    break;
  }
  ldr(rscratch1, adr);
  add(rscratch1, rscratch1, src);
  str(rscratch1, adr);
}

// Compare src1 against the word stored at the (literal) address src2.
// Clobbers rscratch1.
void MacroAssembler::cmpptr(Register src1, Address src2) {
  unsigned long offset;
  adrp(rscratch1, src2, offset);
  ldr(rscratch1, Address(rscratch1, offset));
  cmp(src1, rscratch1);
}

// Compare two oops; delegated to the active GC's barrier-set assembler,
// which may need to resolve forwarded objects before comparing.
void MacroAssembler::cmpoop(Register obj1, Register obj2) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->obj_equals(this, obj1, obj2);
}

// Load the Klass* of src into dst, decompressing it if compressed
// class pointers are in use.
void MacroAssembler::load_klass(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
    decode_klass_not_null(dst);
  } else {
    ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
  }
}

// ((OopHandle)result).resolve();
void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
  // OopHandle::resolve is an indirection.
  access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
}

// Load the java mirror (java.lang.Class instance) of the method's holder:
// method -> ConstMethod -> ConstantPool -> pool holder Klass -> mirror.
void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
  const int mirror_offset = in_bytes(Klass::java_mirror_offset());
  // NOTE(review): the 'method' parameter is ignored; the load below
  // hardwires rmethod, so callers must already have the method there.
  ldr(dst, Address(rmethod, Method::const_offset()));
  ldr(dst, Address(dst, ConstMethod::constants_offset()));
  ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
  ldr(dst, Address(dst, mirror_offset));
  resolve_oop_handle(dst, tmp);
}

// Compare trial_klass against the klass of oop, setting condition flags.
// Avoids a full decode of the compressed klass where the encoding allows.
// Clobbers tmp.
void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
  if (UseCompressedClassPointers) {
    ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
    if (Universe::narrow_klass_base() == NULL) {
      // No base: encoding is just a shift, so compare shifted.
      cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift());
      return;
    } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
               && Universe::narrow_klass_shift() == 0) {
      // Only the bottom 32 bits matter
      cmpw(trial_klass, tmp);
      return;
    }
    decode_klass_not_null(tmp);
  } else {
    ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
  }
  cmp(trial_klass, tmp);
}

// Load the prototype mark word of src's klass into dst (used by
// biased locking).
void MacroAssembler::load_prototype_header(Register dst, Register src) {
  load_klass(dst, src);
  ldr(dst, Address(dst, Klass::prototype_header_offset()));
}

// Store klass src into object dst's klass field, compressing it first
// if compressed class pointers are in use.  Note: destroys src in the
// compressed case (encode_klass_not_null is in-place).
void MacroAssembler::store_klass(Register dst, Register src) {
  // FIXME: Should this be a store release?  concurrent gcs assumes
  // klass length is valid if klass field is not null.
  if (UseCompressedClassPointers) {
    encode_klass_not_null(src);
    strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
  } else {
    str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
  }
}

// Zero/fill the 32-bit gap that follows a compressed klass field.
void MacroAssembler::store_klass_gap(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    // Store to klass gap in destination
    strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
  }
}

// Algorithm must match CompressedOops::encode.
// Encodes oop s into narrow-oop d; s may be null (null maps to 0).
void MacroAssembler::encode_heap_oop(Register d, Register s) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
#endif
  verify_oop(s, "broken oop in encode_heap_oop");
  if (Universe::narrow_oop_base() == NULL) {
    if (Universe::narrow_oop_shift() != 0) {
      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
      lsr(d, s, LogMinObjAlignmentInBytes);
    } else {
      mov(d, s);
    }
  } else {
    // Subtract the base; the csel maps null (which is below the base,
    // hence borrow/LO) back to zero.
    subs(d, s, rheapbase);
    csel(d, d, zr, Assembler::HS);
    lsr(d, d, LogMinObjAlignmentInBytes);

    /*  Old algorithm: is this any worse?
    Label nonnull;
    cbnz(r, nonnull);
    sub(r, r, rheapbase);
    bind(nonnull);
    lsr(r, r, LogMinObjAlignmentInBytes);
    */
  }
}

// In-place encode of a known-non-null oop in r.
void MacroAssembler::encode_heap_oop_not_null(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    cbnz(r, ok);
    stop("null oop passed to encode_heap_oop_not_null");
    bind(ok);
  }
#endif
  verify_oop(r, "broken oop in encode_heap_oop_not_null");
  if (Universe::narrow_oop_base() != NULL) {
    sub(r, r, rheapbase);
  }
  if (Universe::narrow_oop_shift() != 0) {
    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    lsr(r, r, LogMinObjAlignmentInBytes);
  }
}

// Two-register encode of a known-non-null oop: dst = encode(src).
// src is preserved.
void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    cbnz(src, ok);
    stop("null oop passed to encode_heap_oop_not_null2");
    bind(ok);
  }
#endif
  verify_oop(src, "broken oop in encode_heap_oop_not_null2");

  // 'data' tracks which register currently holds the partial result.
  Register data = src;
  if (Universe::narrow_oop_base() != NULL) {
    sub(dst, src, rheapbase);
    data = dst;
  }
  if (Universe::narrow_oop_shift() != 0) {
    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    lsr(dst, data, LogMinObjAlignmentInBytes);
    data = dst;
  }
  if (data == src)
    mov(dst, src);   // degenerate encoding: plain register move
}

// Decode narrow oop s into d; s may be the encoding of null (0 -> null).
void MacroAssembler::decode_heap_oop(Register d, Register s) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
#endif
  if (Universe::narrow_oop_base() == NULL) {
    if (Universe::narrow_oop_shift() != 0 || d != s) {
      lsl(d, s, Universe::narrow_oop_shift());
    }
  } else {
    // Null must stay null, so branch around the base addition.
    Label done;
    if (d != s)
      mov(d, s);
    cbz(s, done);
    add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
    bind(done);
  }
  verify_oop(d, "broken oop in decode_heap_oop");
}

// In-place decode of a known-non-null narrow oop in r.
void  MacroAssembler::decode_heap_oop_not_null(Register r) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (Universe::narrow_oop_shift() != 0) {
    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    if (Universe::narrow_oop_base() != NULL) {
      add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
    } else {
      add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
    }
  } else {
    assert (Universe::narrow_oop_base() == NULL, "sanity");
  }
}

// Two-register decode of a known-non-null narrow oop: dst = decode(src).
void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (Universe::narrow_oop_shift() != 0) {
    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    if (Universe::narrow_oop_base() != NULL) {
      add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
    } else {
      add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
    }
  } else {
    assert (Universe::narrow_oop_base() == NULL, "sanity");
    if (dst != src) {
      mov(dst, src);
    }
  }
}

// Compress a known-non-null Klass* from src into dst.  When dst == src
// and a materialized base register is needed, rheapbase is clobbered
// and restored afterwards via reinit_heapbase().
void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
  if (Universe::narrow_klass_base() == NULL) {
    if (Universe::narrow_klass_shift() != 0) {
      assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
      lsr(dst, src, LogKlassAlignmentInBytes);
    } else {
      if (dst != src) mov(dst, src);
    }
    return;
  }

  if (use_XOR_for_compressed_class_base) {
    // Base bits don't overlap the klass bits, so an eor both applies
    // and removes the base with one instruction.
    if (Universe::narrow_klass_shift() != 0) {
      eor(dst, src, (uint64_t)Universe::narrow_klass_base());
      lsr(dst, dst, LogKlassAlignmentInBytes);
    } else {
      eor(dst, src, (uint64_t)Universe::narrow_klass_base());
    }
    return;
  }

  if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
      && Universe::narrow_klass_shift() == 0) {
    // Base only affects the upper 32 bits; truncation encodes.
    movw(dst, src);
    return;
  }

#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
#endif

  Register rbase = dst;
  if (dst == src) rbase = rheapbase;
  mov(rbase, (uint64_t)Universe::narrow_klass_base());
  sub(dst, src, rbase);
  if (Universe::narrow_klass_shift() != 0) {
    assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
    lsr(dst, dst, LogKlassAlignmentInBytes);
  }
  if (dst == src) reinit_heapbase();
}

// In-place compress of a known-non-null Klass* in r.
void MacroAssembler::encode_klass_not_null(Register r) {
  encode_klass_not_null(r, r);
}

// Decompress a known-non-null narrow klass from src into dst.  When
// dst == src and a materialized base register is needed, rheapbase is
// clobbered and restored afterwards via reinit_heapbase().
void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
  Register rbase = dst;
  assert (UseCompressedClassPointers, "should only be used for compressed headers");

  if (Universe::narrow_klass_base() == NULL) {
    if (Universe::narrow_klass_shift() != 0) {
      assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
      lsl(dst, src, LogKlassAlignmentInBytes);
    } else {
      if (dst != src) mov(dst, src);
    }
    return;
  }

  if (use_XOR_for_compressed_class_base) {
    if (Universe::narrow_klass_shift() != 0) {
      lsl(dst, src, LogKlassAlignmentInBytes);
      eor(dst, dst, (uint64_t)Universe::narrow_klass_base());
    } else {
      eor(dst, src, (uint64_t)Universe::narrow_klass_base());
    }
    return;
  }

  if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
      && Universe::narrow_klass_shift() == 0) {
    // Reinstall just the upper 32 bits of the base.
    if (dst != src)
      movw(dst, src);
    movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32);
    return;
  }

  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (dst == src) rbase = rheapbase;
  mov(rbase, (uint64_t)Universe::narrow_klass_base());
  if (Universe::narrow_klass_shift() != 0) {
    assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
    add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
  } else {
    add(dst, rbase, src);
  }
  if (dst == src) reinit_heapbase();
}

// In-place decompress of a known-non-null narrow klass in r.
void  MacroAssembler::decode_klass_not_null(Register r) {
  decode_klass_not_null(r, r);
}

// Materialize a narrow oop constant in dst.  A movz/movk pair with a
// 0xDEADBEEF placeholder is emitted under an oop relocation, so the
// real narrow-oop bits can be patched in by the relocation machinery.
void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
#ifdef ASSERT
  {
    ThreadInVMfromUnknown tiv;
    assert (UseCompressedOops, "should only be used for compressed oops");
    assert (Universe::heap() != NULL, "java heap should be initialized");
    assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
    assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
  }
#endif
  int oop_index = oop_recorder()->find_index(obj);
  InstructionMark im(this);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  code_section()->relocate(inst_mark(), rspec);
  movz(dst, 0xDEAD, 16);
  movk(dst, 0xBEEF);
}

// Materialize a narrow klass constant in dst via movz/movk under a
// metadata relocation.
void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int index = oop_recorder()->find_index(k);
  assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");

  InstructionMark im(this);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  code_section()->relocate(inst_mark(), rspec);
  narrowKlass nk = Klass::encode_klass(k);
  movz(dst, (nk >> 16), 16);
  movk(dst, nk & 0xffff);
}

// GC-aware load: routes through the barrier-set assembler unless the
// (fixed-up) decorators request a raw access.
void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
                                    Register dst, Address src,
                                    Register tmp1, Register thread_tmp) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  decorators = AccessInternal::decorator_fixup(decorators);
  bool as_raw = (decorators & AS_RAW) != 0;
  if (as_raw) {
    bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  } else {
    bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  }
}

// GC-aware store: routes through the barrier-set assembler unless the
// (fixed-up) decorators request a raw access.
void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
                                     Address dst, Register src,
                                     Register tmp1, Register thread_tmp) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  decorators = AccessInternal::decorator_fixup(decorators);
  bool as_raw = (decorators & AS_RAW) != 0;
  if (as_raw) {
    bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  } else {
    bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  }
}

// Resolve obj for access as specified by decorators (barrier-set hook).
void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
  // Use stronger ACCESS_WRITE|ACCESS_READ by default.
  if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
    decorators |= ACCESS_READ | ACCESS_WRITE;
  }
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  return bs->resolve(this, decorators, obj);
}

// Load a (possibly null) oop from the heap with the proper GC barriers.
void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
                                   Register thread_tmp, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
}

// Load a known-non-null oop from the heap with the proper GC barriers.
void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
                                            Register thread_tmp, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
}

// Store an oop into the heap with the proper GC barriers.
void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
                                    Register thread_tmp, DecoratorSet decorators) {
  access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
}

// Used for storing NULLs.
void MacroAssembler::store_heap_oop_null(Address dst) {
  access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
}

// Build a literal Address for obj carrying a metadata relocation.
Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->allocate_metadata_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return Address((address)obj, rspec);
}

// Move an oop into a register.  immediate is true if we want
// immediate instructions, i.e. we are not going to patch this
// instruction while the code is being executed by another thread.  In
// that case we can use move immediates rather than the constant pool.
4108 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) { 4109 int oop_index; 4110 if (obj == NULL) { 4111 oop_index = oop_recorder()->allocate_oop_index(obj); 4112 } else { 4113 #ifdef ASSERT 4114 { 4115 ThreadInVMfromUnknown tiv; 4116 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop"); 4117 } 4118 #endif 4119 oop_index = oop_recorder()->find_index(obj); 4120 } 4121 RelocationHolder rspec = oop_Relocation::spec(oop_index); 4122 if (! immediate) { 4123 address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address 4124 ldr_constant(dst, Address(dummy, rspec)); 4125 } else 4126 mov(dst, Address((address)obj, rspec)); 4127 } 4128 4129 // Move a metadata address into a register. 4130 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) { 4131 int oop_index; 4132 if (obj == NULL) { 4133 oop_index = oop_recorder()->allocate_metadata_index(obj); 4134 } else { 4135 oop_index = oop_recorder()->find_index(obj); 4136 } 4137 RelocationHolder rspec = metadata_Relocation::spec(oop_index); 4138 mov(dst, Address((address)obj, rspec)); 4139 } 4140 4141 Address MacroAssembler::constant_oop_address(jobject obj) { 4142 #ifdef ASSERT 4143 { 4144 ThreadInVMfromUnknown tiv; 4145 assert(oop_recorder() != NULL, "this assembler needs an OopRecorder"); 4146 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop"); 4147 } 4148 #endif 4149 int oop_index = oop_recorder()->find_index(obj); 4150 return Address((address)obj, oop_Relocation::spec(oop_index)); 4151 } 4152 4153 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes. 
void MacroAssembler::tlab_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Register t2,
                                   Label& slow_case) {
  // TLAB allocation is delegated to the active GC's barrier-set assembler.
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
}

// Defines obj, preserves var_size_in_bytes
void MacroAssembler::eden_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Label& slow_case) {
  // Eden allocation is delegated to the active GC's barrier-set assembler.
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
}

// Zero words; len is in bytes
// Destroys all registers except addr
// len must be a nonzero multiple of wordSize
void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
  assert_different_registers(addr, len, t1, rscratch1, rscratch2);

#ifdef ASSERT
  { Label L;
    tst(len, BytesPerWord - 1);
    br(Assembler::EQ, L);
    stop("len is not a multiple of BytesPerWord");
    bind(L);
  }
#endif

#ifndef PRODUCT
  block_comment("zero memory");
#endif

  Label loop;
  Label entry;

//  Algorithm:
//
//    scratch1 = cnt & 7;
//    cnt -= scratch1;
//    p += scratch1;
//    switch (scratch1) {
//      do {
//        cnt -= 8;
//          p[-8] = 0;
//        case 7:
//          p[-7] = 0;
//        case 6:
//          p[-6] = 0;
//          // ...
//        case 1:
//          p[-1] = 0;
//        case 0:
//          p += 8;
//      } while (cnt);
//    }

  const int unroll = 8; // Number of str(zr) instructions we'll unroll

  lsr(len, len, LogBytesPerWord);              // len is now a word count
  andr(rscratch1, len, unroll - 1);  // tmp1 = cnt % unroll
  sub(len, len, rscratch1);      // cnt -= cnt % unroll
  // t1 always points to the end of the region we're about to zero
  add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
  // Compute a branch target inside the unrolled loop that skips
  // (unroll - remainder) stores; each skipped str is one 4-byte
  // instruction, hence the LSL 2.
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
  br(rscratch2);
  bind(loop);
  sub(len, len, unroll);
  for (int i = -unroll; i < 0; i++)
    Assembler::str(zr, Address(t1, i * wordSize));
  bind(entry);
  add(t1, t1, unroll * wordSize);
  cbnz(len, loop);
}

// Debug-only consistency check of the current thread's TLAB:
// start <= top <= end.  Preserves all registers (saves/restores
// rscratch1/rscratch2 on the stack).
void MacroAssembler::verify_tlab() {
#ifdef ASSERT
  if (UseTLAB && VerifyOops) {
    Label next, ok;

    stp(rscratch2, rscratch1, Address(pre(sp, -16)));

    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
    cmp(rscratch2, rscratch1);
    br(Assembler::HS, next);
    STOP("assert(top >= start)");
    should_not_reach_here();

    bind(next);
    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
    cmp(rscratch2, rscratch1);
    br(Assembler::HS, ok);
    STOP("assert(top <= end)");
    should_not_reach_here();

    bind(ok);
    ldp(rscratch2, rscratch1, Address(post(sp, 16)));
  }
#endif
}

// Writes to stack successive pages until offset reached to check for
// stack overflow + shadow pages.  This clobbers tmp.
void MacroAssembler::bang_stack_size(Register size, Register tmp) {
  assert_different_registers(tmp, size, rscratch1);
  mov(tmp, sp);
  // Bang stack for total size given plus shadow page size.
  // Bang one page at a time because large size can bang beyond yellow and
  // red zones.
  Label loop;
  mov(rscratch1, os::vm_page_size());
  bind(loop);
  lea(tmp, Address(tmp, -os::vm_page_size()));
  subsw(size, size, rscratch1);
  str(size, Address(tmp));
  br(Assembler::GT, loop);

  // Bang down shadow pages too.
  // At this point, (tmp-0) is the last address touched, so don't
  // touch it again.  (It was touched as (tmp-pagesize) but then tmp
  // was post-decremented.)  Skip this address by starting at i=1, and
  // touch a few more pages below.  N.B.  It is important to touch all
  // the way down to and including i=StackShadowPages.
  for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
    // this could be any sized move but this can be a debugging crumb
    // so the bigger the better.
    lea(tmp, Address(tmp, -os::vm_page_size()));
    str(size, Address(tmp));
  }
}

// Move the address of the polling page into dest.  With thread-local
// polling the page address is loaded from the current thread; otherwise
// the global page address is materialized with adrp.
void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ldr(dest, Address(rthread, Thread::polling_page_offset()));
  } else {
    unsigned long off;
    adrp(dest, Address(page, rtype), off);
    assert(off == 0, "polling page must be page aligned");
  }
}

// Move the address of the polling page into r, then read the polling
// page.
address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
  get_polling_page(r, page, rtype);
  return read_polling_page(r, rtype);
}

// Read the polling page.  The address of the polling page must
// already be in r.  Returns the address of the emitted (relocated)
// load so callers can associate it with a safepoint.
address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
  InstructionMark im(this);
  code_section()->relocate(inst_mark(), rtype);
  ldrw(zr, Address(r, 0));   // load discarded; only the access matters
  return inst_mark();
}

// Emit an adrp (or an adrp-equivalent movk sequence) targeting dest and
// return the residual low 12 bits in byte_offset for the follow-up
// memory instruction.
void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
  relocInfo::relocType rtype = dest.rspec().reloc()->type();
  unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
  unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
  unsigned long dest_page = (unsigned long)dest.target() >> 12;
  long offset_low = dest_page - low_page;
  long offset_high = dest_page - high_page;

  assert(is_valid_AArch64_address(dest.target()), "bad address");
  assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");

  InstructionMark im(this);
  code_section()->relocate(inst_mark(), dest.rspec());
  // 8143067: Ensure that the adrp can reach the dest from anywhere within
  // the code cache so that if it is relocated we know it will still reach
  if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
    _adrp(reg1, dest.target());
  } else {
    // Out of adrp range: synthesize the low 32 bits relative to pc and
    // install the high 32 bits with a movk.
    unsigned long target = (unsigned long)dest.target();
    unsigned long adrp_target
      = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);

    _adrp(reg1, (address)adrp_target);
    movk(reg1, target >> 32, 32);
  }
  byte_offset = (unsigned long)dest.target() & 0xfff;
}

// Load the card table's byte_map_base into reg.
void MacroAssembler::load_byte_map_base(Register reg) {
  jbyte *byte_map_base =
    ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();

  if (is_valid_AArch64_address((address)byte_map_base)) {
    // Strictly speaking the byte_map_base isn't an address at all,
    // and it might even be negative.
    unsigned long offset;
    adrp(reg, ExternalAddress((address)byte_map_base), offset);
    // We expect offset to be zero with most collectors.
    if (offset != 0) {
      add(reg, reg, offset);
    }
  } else {
    mov(reg, (uint64_t)byte_map_base);
  }
}

// Emit the method prologue frame setup: allocate framesize bytes and
// save rfp/lr, choosing an encoding based on the immediate-offset
// limits of stp/sub.
void MacroAssembler::build_frame(int framesize) {
  assert(framesize > 0, "framesize must be > 0");
  if (framesize < ((1 << 9) + 2 * wordSize)) {
    sub(sp, sp, framesize);
    stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
    if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
  } else {
    stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
    if (PreserveFramePointer) mov(rfp, sp);
    if (framesize < ((1 << 12) + 2 * wordSize))
      sub(sp, sp, framesize - 2 * wordSize);
    else {
      mov(rscratch1, framesize - 2 * wordSize);
      sub(sp, sp, rscratch1);
    }
  }
}

// Emit the matching epilogue: restore rfp/lr and release framesize
// bytes (mirror image of build_frame).
void MacroAssembler::remove_frame(int framesize) {
  assert(framesize > 0, "framesize must be > 0");
  if (framesize < ((1 << 9) + 2 * wordSize)) {
    ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
    add(sp, sp, framesize);
  } else {
    if (framesize < ((1 << 12) + 2 * wordSize))
      add(sp, sp, framesize - 2 * wordSize);
    else {
      mov(rscratch1, framesize - 2 * wordSize);
      add(sp, sp, rscratch1);
    }
    ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
  }
}


#ifdef COMPILER2
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

// Search for str1 in str2 and return index or -1
void MacroAssembler::string_indexof(Register str2, Register str1,
                                    Register cnt2, Register cnt1,
                                    Register tmp1, Register tmp2,
                                    Register tmp3, Register tmp4,
                                    Register tmp5, Register tmp6,
                                    int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
4414 4415 Register ch1 = rscratch1; 4416 Register ch2 = rscratch2; 4417 Register cnt1tmp = tmp1; 4418 Register cnt2tmp = tmp2; 4419 Register cnt1_neg = cnt1; 4420 Register cnt2_neg = cnt2; 4421 Register result_tmp = tmp4; 4422 4423 bool isL = ae == StrIntrinsicNode::LL; 4424 4425 bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL; 4426 bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU; 4427 int str1_chr_shift = str1_isL ? 0:1; 4428 int str2_chr_shift = str2_isL ? 0:1; 4429 int str1_chr_size = str1_isL ? 1:2; 4430 int str2_chr_size = str2_isL ? 1:2; 4431 chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb : 4432 (chr_insn)&MacroAssembler::ldrh; 4433 chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb : 4434 (chr_insn)&MacroAssembler::ldrh; 4435 chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw; 4436 chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr; 4437 4438 // Note, inline_string_indexOf() generates checks: 4439 // if (substr.count > string.count) return -1; 4440 // if (substr.count == 0) return 0; 4441 4442 // We have two strings, a source string in str2, cnt2 and a pattern string 4443 // in str1, cnt1. Find the 1st occurence of pattern in source or return -1. 4444 4445 // For larger pattern and source we use a simplified Boyer Moore algorithm. 4446 // With a small pattern and source we use linear scan. 4447 4448 if (icnt1 == -1) { 4449 sub(result_tmp, cnt2, cnt1); 4450 cmp(cnt1, (u1)8); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256 4451 br(LT, LINEARSEARCH); 4452 dup(v0, T16B, cnt1); // done in separate FPU pipeline. 
Almost no penalty 4453 subs(zr, cnt1, 256); 4454 lsr(tmp1, cnt2, 2); 4455 ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM 4456 br(GE, LINEARSTUB); 4457 } 4458 4459 // The Boyer Moore alogorithm is based on the description here:- 4460 // 4461 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm 4462 // 4463 // This describes and algorithm with 2 shift rules. The 'Bad Character' rule 4464 // and the 'Good Suffix' rule. 4465 // 4466 // These rules are essentially heuristics for how far we can shift the 4467 // pattern along the search string. 4468 // 4469 // The implementation here uses the 'Bad Character' rule only because of the 4470 // complexity of initialisation for the 'Good Suffix' rule. 4471 // 4472 // This is also known as the Boyer-Moore-Horspool algorithm:- 4473 // 4474 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm 4475 // 4476 // This particular implementation has few java-specific optimizations. 4477 // 4478 // #define ASIZE 256 4479 // 4480 // int bm(unsigned char *x, int m, unsigned char *y, int n) { 4481 // int i, j; 4482 // unsigned c; 4483 // unsigned char bc[ASIZE]; 4484 // 4485 // /* Preprocessing */ 4486 // for (i = 0; i < ASIZE; ++i) 4487 // bc[i] = m; 4488 // for (i = 0; i < m - 1; ) { 4489 // c = x[i]; 4490 // ++i; 4491 // // c < 256 for Latin1 string, so, no need for branch 4492 // #ifdef PATTERN_STRING_IS_LATIN1 4493 // bc[c] = m - i; 4494 // #else 4495 // if (c < ASIZE) bc[c] = m - i; 4496 // #endif 4497 // } 4498 // 4499 // /* Searching */ 4500 // j = 0; 4501 // while (j <= n - m) { 4502 // c = y[i+j]; 4503 // if (x[m-1] == c) 4504 // for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i); 4505 // if (i < 0) return j; 4506 // // c < 256 for Latin1 string, so, no need for branch 4507 // #ifdef SOURCE_STRING_IS_LATIN1 4508 // // LL case: (c< 256) always true. 
Remove branch 4509 // j += bc[y[j+m-1]]; 4510 // #endif 4511 // #ifndef PATTERN_STRING_IS_UTF 4512 // // UU case: need if (c<ASIZE) check. Skip 1 character if not. 4513 // if (c < ASIZE) 4514 // j += bc[y[j+m-1]]; 4515 // else 4516 // j += 1 4517 // #endif 4518 // #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF 4519 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not. 4520 // if (c < ASIZE) 4521 // j += bc[y[j+m-1]]; 4522 // else 4523 // j += m 4524 // #endif 4525 // } 4526 // } 4527 4528 if (icnt1 == -1) { 4529 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH, 4530 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP; 4531 Register cnt1end = tmp2; 4532 Register str2end = cnt2; 4533 Register skipch = tmp2; 4534 4535 // str1 length is >=8, so, we can read at least 1 register for cases when 4536 // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for 4537 // UL case. We'll re-read last character in inner pre-loop code to have 4538 // single outer pre-loop load 4539 const int firstStep = isL ? 
7 : 3; 4540 4541 const int ASIZE = 256; 4542 const int STORED_BYTES = 32; // amount of bytes stored per instruction 4543 sub(sp, sp, ASIZE); 4544 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations 4545 mov(ch1, sp); 4546 BIND(BM_INIT_LOOP); 4547 stpq(v0, v0, Address(post(ch1, STORED_BYTES))); 4548 subs(tmp5, tmp5, 1); 4549 br(GT, BM_INIT_LOOP); 4550 4551 sub(cnt1tmp, cnt1, 1); 4552 mov(tmp5, str2); 4553 add(str2end, str2, result_tmp, LSL, str2_chr_shift); 4554 sub(ch2, cnt1, 1); 4555 mov(tmp3, str1); 4556 BIND(BCLOOP); 4557 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size))); 4558 if (!str1_isL) { 4559 subs(zr, ch1, ASIZE); 4560 br(HS, BCSKIP); 4561 } 4562 strb(ch2, Address(sp, ch1)); 4563 BIND(BCSKIP); 4564 subs(ch2, ch2, 1); 4565 br(GT, BCLOOP); 4566 4567 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1 4568 if (str1_isL == str2_isL) { 4569 // load last 8 bytes (8LL/4UU symbols) 4570 ldr(tmp6, Address(tmp6, -wordSize)); 4571 } else { 4572 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols) 4573 // convert Latin1 to UTF. We'll have to wait until load completed, but 4574 // it's still faster than per-character loads+checks 4575 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1] 4576 ubfx(ch1, tmp6, 8, 8); // str1[N-2] 4577 ubfx(ch2, tmp6, 16, 8); // str1[N-3] 4578 andr(tmp6, tmp6, 0xFF); // str1[N-4] 4579 orr(ch2, ch1, ch2, LSL, 16); 4580 orr(tmp6, tmp6, tmp3, LSL, 48); 4581 orr(tmp6, tmp6, ch2, LSL, 16); 4582 } 4583 BIND(BMLOOPSTR2); 4584 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 4585 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8 4586 if (str1_isL == str2_isL) { 4587 // re-init tmp3. It's for free because it's executed in parallel with 4588 // load above. 
Alternative is to initialize it before loop, but it'll 4589 // affect performance on in-order systems with 2 or more ld/st pipelines 4590 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size)); 4591 } 4592 if (!isL) { // UU/UL case 4593 lsl(ch2, cnt1tmp, 1); // offset in bytes 4594 } 4595 cmp(tmp3, skipch); 4596 br(NE, BMSKIP); 4597 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2)); 4598 mov(ch1, tmp6); 4599 if (isL) { 4600 b(BMLOOPSTR1_AFTER_LOAD); 4601 } else { 4602 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8 4603 b(BMLOOPSTR1_CMP); 4604 } 4605 BIND(BMLOOPSTR1); 4606 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 4607 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 4608 BIND(BMLOOPSTR1_AFTER_LOAD); 4609 subs(cnt1tmp, cnt1tmp, 1); 4610 br(LT, BMLOOPSTR1_LASTCMP); 4611 BIND(BMLOOPSTR1_CMP); 4612 cmp(ch1, ch2); 4613 br(EQ, BMLOOPSTR1); 4614 BIND(BMSKIP); 4615 if (!isL) { 4616 // if we've met UTF symbol while searching Latin1 pattern, then we can 4617 // skip cnt1 symbols 4618 if (str1_isL != str2_isL) { 4619 mov(result_tmp, cnt1); 4620 } else { 4621 mov(result_tmp, 1); 4622 } 4623 subs(zr, skipch, ASIZE); 4624 br(HS, BMADV); 4625 } 4626 ldrb(result_tmp, Address(sp, skipch)); // load skip distance 4627 BIND(BMADV); 4628 sub(cnt1tmp, cnt1, 1); 4629 add(str2, str2, result_tmp, LSL, str2_chr_shift); 4630 cmp(str2, str2end); 4631 br(LE, BMLOOPSTR2); 4632 add(sp, sp, ASIZE); 4633 b(NOMATCH); 4634 BIND(BMLOOPSTR1_LASTCMP); 4635 cmp(ch1, ch2); 4636 br(NE, BMSKIP); 4637 BIND(BMMATCH); 4638 sub(result, str2, tmp5); 4639 if (!str2_isL) lsr(result, result, 1); 4640 add(sp, sp, ASIZE); 4641 b(DONE); 4642 4643 BIND(LINEARSTUB); 4644 cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm 4645 br(LT, LINEAR_MEDIUM); 4646 mov(result, zr); 4647 RuntimeAddress stub = NULL; 4648 if (isL) { 4649 stub = 
RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll()); 4650 assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated"); 4651 } else if (str1_isL) { 4652 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul()); 4653 assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated"); 4654 } else { 4655 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu()); 4656 assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated"); 4657 } 4658 trampoline_call(stub); 4659 b(DONE); 4660 } 4661 4662 BIND(LINEARSEARCH); 4663 { 4664 Label DO1, DO2, DO3; 4665 4666 Register str2tmp = tmp2; 4667 Register first = tmp3; 4668 4669 if (icnt1 == -1) 4670 { 4671 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT; 4672 4673 cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2)); 4674 br(LT, DOSHORT); 4675 BIND(LINEAR_MEDIUM); 4676 (this->*str1_load_1chr)(first, Address(str1)); 4677 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift))); 4678 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift); 4679 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4680 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4681 4682 BIND(FIRST_LOOP); 4683 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 4684 cmp(first, ch2); 4685 br(EQ, STR1_LOOP); 4686 BIND(STR2_NEXT); 4687 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4688 br(LE, FIRST_LOOP); 4689 b(NOMATCH); 4690 4691 BIND(STR1_LOOP); 4692 adds(cnt1tmp, cnt1_neg, str1_chr_size); 4693 add(cnt2tmp, cnt2_neg, str2_chr_size); 4694 br(GE, MATCH); 4695 4696 BIND(STR1_NEXT); 4697 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp)); 4698 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 4699 cmp(ch1, ch2); 4700 br(NE, STR2_NEXT); 4701 adds(cnt1tmp, cnt1tmp, str1_chr_size); 4702 add(cnt2tmp, cnt2tmp, str2_chr_size); 4703 br(LT, STR1_NEXT); 4704 b(MATCH); 4705 4706 BIND(DOSHORT); 4707 if (str1_isL == str2_isL) { 4708 cmp(cnt1, 
(u1)2); 4709 br(LT, DO1); 4710 br(GT, DO3); 4711 } 4712 } 4713 4714 if (icnt1 == 4) { 4715 Label CH1_LOOP; 4716 4717 (this->*load_4chr)(ch1, str1); 4718 sub(result_tmp, cnt2, 4); 4719 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4720 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4721 4722 BIND(CH1_LOOP); 4723 (this->*load_4chr)(ch2, Address(str2, cnt2_neg)); 4724 cmp(ch1, ch2); 4725 br(EQ, MATCH); 4726 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4727 br(LE, CH1_LOOP); 4728 b(NOMATCH); 4729 } 4730 4731 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) { 4732 Label CH1_LOOP; 4733 4734 BIND(DO2); 4735 (this->*load_2chr)(ch1, str1); 4736 if (icnt1 == 2) { 4737 sub(result_tmp, cnt2, 2); 4738 } 4739 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4740 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4741 BIND(CH1_LOOP); 4742 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 4743 cmp(ch1, ch2); 4744 br(EQ, MATCH); 4745 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4746 br(LE, CH1_LOOP); 4747 b(NOMATCH); 4748 } 4749 4750 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) { 4751 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP; 4752 4753 BIND(DO3); 4754 (this->*load_2chr)(first, str1); 4755 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size)); 4756 if (icnt1 == 3) { 4757 sub(result_tmp, cnt2, 3); 4758 } 4759 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4760 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4761 BIND(FIRST_LOOP); 4762 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 4763 cmpw(first, ch2); 4764 br(EQ, STR1_LOOP); 4765 BIND(STR2_NEXT); 4766 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4767 br(LE, FIRST_LOOP); 4768 b(NOMATCH); 4769 4770 BIND(STR1_LOOP); 4771 add(cnt2tmp, cnt2_neg, 2*str2_chr_size); 4772 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 4773 cmp(ch1, ch2); 4774 br(NE, STR2_NEXT); 4775 b(MATCH); 4776 } 4777 4778 if (icnt1 == -1 || icnt1 == 1) { 4779 Label CH1_LOOP, 
HAS_ZERO, DO1_SHORT, DO1_LOOP; 4780 4781 BIND(DO1); 4782 (this->*str1_load_1chr)(ch1, str1); 4783 cmp(cnt2, (u1)8); 4784 br(LT, DO1_SHORT); 4785 4786 sub(result_tmp, cnt2, 8/str2_chr_size); 4787 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4788 mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 4789 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4790 4791 if (str2_isL) { 4792 orr(ch1, ch1, ch1, LSL, 8); 4793 } 4794 orr(ch1, ch1, ch1, LSL, 16); 4795 orr(ch1, ch1, ch1, LSL, 32); 4796 BIND(CH1_LOOP); 4797 ldr(ch2, Address(str2, cnt2_neg)); 4798 eor(ch2, ch1, ch2); 4799 sub(tmp1, ch2, tmp3); 4800 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4801 bics(tmp1, tmp1, tmp2); 4802 br(NE, HAS_ZERO); 4803 adds(cnt2_neg, cnt2_neg, 8); 4804 br(LT, CH1_LOOP); 4805 4806 cmp(cnt2_neg, (u1)8); 4807 mov(cnt2_neg, 0); 4808 br(LT, CH1_LOOP); 4809 b(NOMATCH); 4810 4811 BIND(HAS_ZERO); 4812 rev(tmp1, tmp1); 4813 clz(tmp1, tmp1); 4814 add(cnt2_neg, cnt2_neg, tmp1, LSR, 3); 4815 b(MATCH); 4816 4817 BIND(DO1_SHORT); 4818 mov(result_tmp, cnt2); 4819 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 4820 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 4821 BIND(DO1_LOOP); 4822 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 4823 cmpw(ch1, ch2); 4824 br(EQ, MATCH); 4825 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4826 br(LT, DO1_LOOP); 4827 } 4828 } 4829 BIND(NOMATCH); 4830 mov(result, -1); 4831 b(DONE); 4832 BIND(MATCH); 4833 add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift); 4834 BIND(DONE); 4835 } 4836 4837 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 4838 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn); 4839 4840 void MacroAssembler::string_indexof_char(Register str1, Register cnt1, 4841 Register ch, Register result, 4842 Register tmp1, Register tmp2, Register tmp3) 4843 { 4844 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 4845 Register 
cnt1_neg = cnt1; 4846 Register ch1 = rscratch1; 4847 Register result_tmp = rscratch2; 4848 4849 cmp(cnt1, (u1)4); 4850 br(LT, DO1_SHORT); 4851 4852 orr(ch, ch, ch, LSL, 16); 4853 orr(ch, ch, ch, LSL, 32); 4854 4855 sub(cnt1, cnt1, 4); 4856 mov(result_tmp, cnt1); 4857 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 4858 sub(cnt1_neg, zr, cnt1, LSL, 1); 4859 4860 mov(tmp3, 0x0001000100010001); 4861 4862 BIND(CH1_LOOP); 4863 ldr(ch1, Address(str1, cnt1_neg)); 4864 eor(ch1, ch, ch1); 4865 sub(tmp1, ch1, tmp3); 4866 orr(tmp2, ch1, 0x7fff7fff7fff7fff); 4867 bics(tmp1, tmp1, tmp2); 4868 br(NE, HAS_ZERO); 4869 adds(cnt1_neg, cnt1_neg, 8); 4870 br(LT, CH1_LOOP); 4871 4872 cmp(cnt1_neg, (u1)8); 4873 mov(cnt1_neg, 0); 4874 br(LT, CH1_LOOP); 4875 b(NOMATCH); 4876 4877 BIND(HAS_ZERO); 4878 rev(tmp1, tmp1); 4879 clz(tmp1, tmp1); 4880 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 4881 b(MATCH); 4882 4883 BIND(DO1_SHORT); 4884 mov(result_tmp, cnt1); 4885 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 4886 sub(cnt1_neg, zr, cnt1, LSL, 1); 4887 BIND(DO1_LOOP); 4888 ldrh(ch1, Address(str1, cnt1_neg)); 4889 cmpw(ch, ch1); 4890 br(EQ, MATCH); 4891 adds(cnt1_neg, cnt1_neg, 2); 4892 br(LT, DO1_LOOP); 4893 BIND(NOMATCH); 4894 mov(result, -1); 4895 b(DONE); 4896 BIND(MATCH); 4897 add(result, result_tmp, cnt1_neg, ASR, 1); 4898 BIND(DONE); 4899 } 4900 4901 // Compare strings. 
// Compare two strings and return <0, 0 or >0 in `result` (Java
// String.compareTo semantics: difference of the first non-matching
// characters, or the length difference if one is a prefix of the other).
//
// str1/str2: addresses of the first characters.
// cnt1/cnt2: lengths, passed in BYTES (see note below); both clobbered.
// result:    receives the signed comparison result (in characters).
// tmp1/tmp2, rscratch1/rscratch2, vtmp1/vtmp2: clobbered scratch.
// ae:        StrIntrinsicNode encoding kind (LL/UU/LU/UL) selecting
//            Latin1 vs UTF-16 for each operand.
// NOTE(review): vtmp3 is accepted but not referenced in the code below.
void MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  // Above this many characters the comparison is delegated to a stub.
  const u1 STUB_THRESHOLD = 64 + 8;
  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  // Per-string element geometry: Latin1 = 1 byte/char, UTF-16 = 2 bytes/char.
  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  // vtmpZ holds zeroes for zip1-based Latin1->UTF-16 widening.
  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  // Per-string single-character load instruction (byte vs halfword).
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  // Character extraction width used when computing the final difference.
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference
  // in `result` (returned as-is when one string is a prefix of the other).
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string: compare character-by-character.
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      // Identical addresses compare equal; result already holds the
      // length difference.
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, STUB_THRESHOLD);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      // Point str1/str2 at their ends; cnt2 becomes a negative byte
      // offset that counts up to zero in the main loop.
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      // Latin1 str1 is widened to UTF-16 via zip1 with a zero register.
      ldrs(vtmp, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, STUB_THRESHOLD);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      // Separate negative offsets: cnt1 steps 4 bytes (Latin1 side),
      // cnt2 steps 8 bytes (UTF-16 side).
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldrs(vtmp, Address(str2));
      cmp(cnt2, STUB_THRESHOLD);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    // Check the words preloaded above before entering the loop.
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFFERENCE);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);  // widen 4 Latin1 chars to UTF-16
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFFERENCE);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFFERENCE);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFFERENCE);
    // rev+clz locate the lowest differing byte; round the bit index down
    // to a character boundary (8 bits for Latin1, 16 for UTF-16).
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
  // Long strings: delegate to the generated compare_long_string stubs.
  RuntimeAddress stub = NULL;
  switch(ae) {
    case StrIntrinsicNode::LL:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
      break;
    case StrIntrinsicNode::UU:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
      break;
    case StrIntrinsicNode::LU:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
      break;
    case StrIntrinsicNode::UL:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
      break;
    default:
      ShouldNotReachHere();
  }
  assert(stub.target() != NULL, "compare_long_string stub has not been generated");
  trampoline_call(stub);
  b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // arrange code to do most branches while loading and loading next characters
  // while comparing previous
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  // Two characters are in flight at a time: (tmp1,cnt1) is the pair being
  // compared, (tmp2,rscratch1) is the pair just loaded.
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}
#endif // COMPILER2

// This method checks if provided byte array contains byte with highest bit set.
// Emits code that sets `result` to a boolean (via the SET_RESULT cset below).
// ary1: address of the byte array; len: length in bytes; both clobbered
// along with rscratch1/rscratch2.
void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
  // Simple and most common case of aligned small array which is not at the
  // end of memory page is placed here. All other cases are in stub.
  Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
  // One such bit per byte: tst against this detects any byte >= 0x80.
  const uint64_t UPPER_BIT_MASK=0x8080808080808080;
  assert_different_registers(ary1, len, result);

  // Empty (or negative) length: no negative bytes.
  cmpw(len, 0);
  br(LE, SET_RESULT);
  cmpw(len, 4 * wordSize);
  br(GE, STUB_LONG); // size > 32 then go to stub

  // Detect whether reading 32 bytes from ary1 would cross a page
  // boundary: shift the page-offset bits to the top and add the read
  // size there; carry set means the read would run off the page.
  int shift = 64 - exact_log2(os::vm_page_size());
  lsl(rscratch1, ary1, shift);
  mov(rscratch2, (size_t)(4 * wordSize) << shift);
  adds(rscratch2, rscratch1, rscratch2);  // At end of page?
  br(CS, STUB); // at the end of page then go to stub
  subs(len, len, wordSize);
  br(LT, END);

  // Whole 8-byte words: test each against the sign-bit mask.
  BIND(LOOP);
  ldr(rscratch1, Address(post(ary1, wordSize)));
  tst(rscratch1, UPPER_BIT_MASK);
  br(NE, SET_RESULT);
  subs(len, len, wordSize);
  br(GE, LOOP);
  cmpw(len, -wordSize);
  br(EQ, SET_RESULT);

  BIND(END);
  // 1..7 trailing bytes: load a word and shift out the bytes that are
  // beyond the array before testing.
  ldr(result, Address(ary1));
  sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
  lslv(result, result, len);
  tst(result, UPPER_BIT_MASK);
  b(SET_RESULT);

  BIND(STUB);
  RuntimeAddress has_neg = RuntimeAddress(StubRoutines::aarch64::has_negatives());
  assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
  trampoline_call(has_neg);
  b(DONE);

  BIND(STUB_LONG);
  RuntimeAddress has_neg_long = RuntimeAddress(
      StubRoutines::aarch64::has_negatives_long());
  assert(has_neg_long.target() != NULL, "has_negatives stub has not been generated");
  trampoline_call(has_neg_long);
  b(DONE);

  BIND(SET_RESULT);
  cset(result, NE); // set true or false

  BIND(DONE);
}

// Emit code comparing two array OBJECTS (oops, including their length
// headers) for element-wise equality; `result` receives a boolean.
// a1/a2:  the two array oops (may be null); clobbered.
// cnt1, tmp3..tmp5, rscratch1/rscratch2: clobbered scratch.
// elem_size: 1 (byte arrays) or 2 (char arrays).
// Two code shapes are emitted depending on UseSimpleArrayEquals.
void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
                                   Register tmp4, Register tmp5, Register result,
                                   Register cnt1, int elem_size) {
  Label DONE, SAME;
  Register tmp1 = rscratch1;
  Register tmp2 = rscratch2;
  Register cnt2 = tmp2;  // cnt2 only used in array length compare
  int elem_per_word = wordSize/elem_size;
  int log_elem_size = exact_log2(elem_size);
  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset
    = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
  // Byte count above which comparison is delegated to the
  // large_array_equals stub.
  int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);

  assert(elem_size == 1 || elem_size == 2, "must be char or byte");
  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);

#ifndef PRODUCT
  {
    const char kind = (elem_size == 2) ? 'U' : 'L';
    char comment[64];
    snprintf(comment, sizeof comment, "array_equals%c{", kind);
    BLOCK_COMMENT(comment);
  }
#endif

  // if (a1 == a2)
  //     return true;
  cmpoop(a1, a2); // May have read barriers for a1 and a2.
  br(EQ, SAME);

  if (UseSimpleArrayEquals) {
    Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
    // if (a1 == null || a2 == null)
    //     return false;
    // a1 & a2 == 0 means (some-pointer is null) or
    // (very-rare-or-even-probably-impossible-pointer-values)
    // so, we can save one branch in most cases
    tst(a1, a2);
    mov(result, false);
    br(EQ, A_MIGHT_BE_NULL);
    // if (a1.length != a2.length)
    //      return false;
    bind(A_IS_NOT_NULL);
    ldrw(cnt1, Address(a1, length_offset));
    ldrw(cnt2, Address(a2, length_offset));
    eorw(tmp5, cnt1, cnt2);
    cbnzw(tmp5, DONE);
    lea(a1, Address(a1, base_offset));
    lea(a2, Address(a2, base_offset));
    // Check for short strings, i.e. smaller than wordSize.
    subs(cnt1, cnt1, elem_per_word);
    br(Assembler::LT, SHORT);
    // Main 8 byte comparison loop.
    bind(NEXT_WORD); {
      ldr(tmp1, Address(post(a1, wordSize)));
      ldr(tmp2, Address(post(a2, wordSize)));
      subs(cnt1, cnt1, elem_per_word);
      eor(tmp5, tmp1, tmp2);
      cbnz(tmp5, DONE);
    } br(GT, NEXT_WORD);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
    // length == 4.
    if (log_elem_size > 0)
      lsl(cnt1, cnt1, log_elem_size);
    ldr(tmp3, Address(a1, cnt1));
    ldr(tmp4, Address(a2, cnt1));
    eor(tmp5, tmp3, tmp4);
    cbnz(tmp5, DONE);
    b(SAME);
    bind(A_MIGHT_BE_NULL);
    // in case both a1 and a2 are not-null, proceed with loads
    cbz(a1, DONE);
    cbz(a2, DONE);
    b(A_IS_NOT_NULL);
    bind(SHORT);

    // Tail: peel off a word, a halfword and (byte arrays only) a byte,
    // driven by the low bits of the remaining element count.
    tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
    {
      ldrw(tmp1, Address(post(a1, 4)));
      ldrw(tmp2, Address(post(a2, 4)));
      eorw(tmp5, tmp1, tmp2);
      cbnzw(tmp5, DONE);
    }
    bind(TAIL03);
    tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
    {
      ldrh(tmp3, Address(post(a1, 2)));
      ldrh(tmp4, Address(post(a2, 2)));
      eorw(tmp5, tmp3, tmp4);
      cbnzw(tmp5, DONE);
    }
    bind(TAIL01);
    if (elem_size == 1) { // Only needed when comparing byte arrays.
      tbz(cnt1, 0, SAME); // 0-1 bytes left.
      {
        ldrb(tmp1, a1);
        ldrb(tmp2, a2);
        eorw(tmp5, tmp1, tmp2);
        cbnzw(tmp5, DONE);
      }
    }
  } else {
    Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
        CSET_EQ, LAST_CHECK;
    mov(result, false);
    cbz(a1, DONE);
    ldrw(cnt1, Address(a1, length_offset));
    cbz(a2, DONE);
    ldrw(cnt2, Address(a2, length_offset));
    // on most CPUs a2 is still "locked"(surprisingly) in ldrw and it's
    // faster to perform another branch before comparing a1 and a2
    cmp(cnt1, (u1)elem_per_word);
    br(LE, SHORT); // short or same
    ldr(tmp3, Address(pre(a1, base_offset)));
    subs(zr, cnt1, stubBytesThreshold);
    br(GE, STUB);
    ldr(tmp4, Address(pre(a2, base_offset)));
    // tmp5 = negated bit count of the data; used at LAST_CHECK to shift
    // out the bytes beyond the arrays before the final compare.
    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
    cmp(cnt2, cnt1);
    br(NE, DONE);

    // Main 16 byte comparison loop with 2 exits
    bind(NEXT_DWORD); {
      ldr(tmp1, Address(pre(a1, wordSize)));
      ldr(tmp2, Address(pre(a2, wordSize)));
      subs(cnt1, cnt1, 2 * elem_per_word);
      br(LE, TAIL);
      eor(tmp4, tmp3, tmp4);
      cbnz(tmp4, DONE);
      ldr(tmp3, Address(pre(a1, wordSize)));
      ldr(tmp4, Address(pre(a2, wordSize)));
      cmp(cnt1, (u1)elem_per_word);
      br(LE, TAIL2);
      cmp(tmp1, tmp2);
    } br(EQ, NEXT_DWORD);
    b(DONE);

    bind(TAIL);
    // Two pending word pairs: (tmp3,tmp4) fully valid, (tmp1,tmp2) may
    // overrun the arrays, so mask the overrun bytes via lslv with tmp5.
    eor(tmp4, tmp3, tmp4);
    eor(tmp2, tmp1, tmp2);
    lslv(tmp2, tmp2, tmp5);
    orr(tmp5, tmp4, tmp2);
    cmp(tmp5, zr);
    b(CSET_EQ);

    bind(TAIL2);
    eor(tmp2, tmp1, tmp2);
    cbnz(tmp2, DONE);
    b(LAST_CHECK);

    bind(STUB);
    ldr(tmp4, Address(pre(a2, base_offset)));
    cmp(cnt2, cnt1);
    br(NE, DONE);
    if (elem_size == 2) { // convert to byte counter
      lsl(cnt1, cnt1, 1);
    }
    // First words already compared here; the stub handles the rest.
    eor(tmp5, tmp3, tmp4);
    cbnz(tmp5, DONE);
    RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
    assert(stub.target() != NULL, "array_equals_long stub has not been generated");
    trampoline_call(stub);
    b(DONE);

    bind(EARLY_OUT);
    // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
    // so, if a2 == null => return false(0), else return true, so we can return a2
    mov(result, a2);
    b(DONE);
    bind(SHORT);
    cmp(cnt2, cnt1);
    br(NE, DONE);
    cbz(cnt1, SAME);
    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
    ldr(tmp3, Address(a1, base_offset));
    ldr(tmp4, Address(a2, base_offset));
    bind(LAST_CHECK);
    eor(tmp4, tmp3, tmp4);
    lslv(tmp5, tmp4, tmp5);
    cmp(tmp5, zr);
    bind(CSET_EQ);
    cset(result, EQ);
    b(DONE);
  }

  bind(SAME);
  mov(result, true);
  // That's it.
  bind(DONE);

  BLOCK_COMMENT("} array_equals");
}

// Compare Strings

// For Strings we're passed the address of the first characters in a1
// and a2 and the length in cnt1.
// elem_size is the element size in bytes: either 1 or 2.
// There are two implementations.
// For arrays >= 8 bytes, all
// comparisons (including the final one, which may overlap) are
// performed 8 bytes at a time.  For strings < 8 bytes, we compare a
// halfword, then a short, and then a byte.

// Emit code comparing cnt1 BYTES of raw character data at a1 and a2
// (addresses of first characters, no array headers); `result` receives
// a boolean.  a1, a2, cnt1 and rscratch1/rscratch2 are clobbered.
// NOTE(review): the count is handled in bytes throughout (see the
// wordSize subtraction and bit tests below), for both elem_size values.
void MacroAssembler::string_equals(Register a1, Register a2,
                                   Register result, Register cnt1, int elem_size)
{
  Label SAME, DONE, SHORT, NEXT_WORD;
  Register tmp1 = rscratch1;
  Register tmp2 = rscratch2;
  Register cnt2 = tmp2;  // cnt2 only used in array length compare

  assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);

#ifndef PRODUCT
  {
    const char kind = (elem_size == 2) ? 'U' : 'L';
    char comment[64];
    snprintf(comment, sizeof comment, "{string_equals%c", kind);
    BLOCK_COMMENT(comment);
  }
#endif

  mov(result, false);

  // Check for short strings, i.e. smaller than wordSize.
  subs(cnt1, cnt1, wordSize);
  br(Assembler::LT, SHORT);
  // Main 8 byte comparison loop.
  bind(NEXT_WORD); {
    ldr(tmp1, Address(post(a1, wordSize)));
    ldr(tmp2, Address(post(a2, wordSize)));
    subs(cnt1, cnt1, wordSize);
    eor(tmp1, tmp1, tmp2);
    cbnz(tmp1, DONE);
  } br(GT, NEXT_WORD);
  // Last longword.  In the case where length == 4 we compare the
  // same longword twice, but that's still faster than another
  // conditional branch.
  // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
  // length == 4.
  ldr(tmp1, Address(a1, cnt1));
  ldr(tmp2, Address(a2, cnt1));
  eor(tmp2, tmp1, tmp2);
  cbnz(tmp2, DONE);
  b(SAME);

  bind(SHORT);
  Label TAIL03, TAIL01;

  // Peel the tail one power of two at a time, keyed off the bits of the
  // remaining byte count.
  tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
  {
    ldrw(tmp1, Address(post(a1, 4)));
    ldrw(tmp2, Address(post(a2, 4)));
    eorw(tmp1, tmp1, tmp2);
    cbnzw(tmp1, DONE);
  }
  bind(TAIL03);
  tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
  {
    ldrh(tmp1, Address(post(a1, 2)));
    ldrh(tmp2, Address(post(a2, 2)));
    eorw(tmp1, tmp1, tmp2);
    cbnzw(tmp1, DONE);
  }
  bind(TAIL01);
  if (elem_size == 1) { // Only needed when comparing 1-byte elements
    tbz(cnt1, 0, SAME); // 0-1 bytes left.
    {
      ldrb(tmp1, a1);
      ldrb(tmp2, a2);
      eorw(tmp1, tmp1, tmp2);
      cbnzw(tmp1, DONE);
    }
  }
  // Arrays are equal.
  bind(SAME);
  mov(result, true);

  // That's it.
  bind(DONE);
  BLOCK_COMMENT("} string_equals");
}


// The size of the blocks erased by the zero_blocks stub.  We must
// handle anything smaller than this ourselves in zero_words().
const int MacroAssembler::zero_words_block_size = 8;

// zero_words() is used by C2 ClearArray patterns.  It is as small as
// possible, handling small word counts locally and delegating
// anything larger to the zero_blocks stub.  It is expanded many times
// in compiled code, so it is important to keep it short.

// ptr:   Address of a buffer to be zeroed.
// cnt:   Count in HeapWords.
//
// ptr, cnt, rscratch1, and rscratch2 are clobbered.
5490 void MacroAssembler::zero_words(Register ptr, Register cnt) 5491 { 5492 assert(is_power_of_2(zero_words_block_size), "adjust this"); 5493 assert(ptr == r10 && cnt == r11, "mismatch in register usage"); 5494 5495 BLOCK_COMMENT("zero_words {"); 5496 cmp(cnt, (u1)zero_words_block_size); 5497 Label around; 5498 br(LO, around); 5499 { 5500 RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks()); 5501 assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated"); 5502 if (StubRoutines::aarch64::complete()) { 5503 trampoline_call(zero_blocks); 5504 } else { 5505 bl(zero_blocks); 5506 } 5507 } 5508 bind(around); 5509 for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) { 5510 Label l; 5511 tbz(cnt, exact_log2(i), l); 5512 for (int j = 0; j < i; j += 2) { 5513 stp(zr, zr, post(ptr, 16)); 5514 } 5515 bind(l); 5516 } 5517 { 5518 Label l; 5519 tbz(cnt, 0, l); 5520 str(zr, Address(ptr)); 5521 bind(l); 5522 } 5523 BLOCK_COMMENT("} zero_words"); 5524 } 5525 5526 // base: Address of a buffer to be zeroed, 8 bytes aligned. 5527 // cnt: Immediate count in HeapWords. 
#define SmallArraySize (18 * BytesPerLong)
// Emit code to zero `cnt` (a compile-time constant) HeapWords starting
// at `base`.  Small counts are fully unrolled; larger counts use a
// 4x-unrolled stp loop after peeling the remainder.
// Clobbers rscratch1 and rscratch2 in the loop case.
void MacroAssembler::zero_words(Register base, u_int64_t cnt)
{
  BLOCK_COMMENT("zero_words {");
  int i = cnt & 1;  // store any odd word to start
  if (i) str(zr, Address(base));

  if (cnt <= SmallArraySize / BytesPerLong) {
    // Small enough to unroll completely.
    for (; i < (int)cnt; i += 2)
      stp(zr, zr, Address(base, i * wordSize));
  } else {
    const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
    int remainder = cnt % (2 * unroll);
    // Peel the words that don't fill a whole unrolled iteration.
    for (; i < remainder; i += 2)
      stp(zr, zr, Address(base, i * wordSize));

    Label loop;
    Register cnt_reg = rscratch1;
    Register loop_base = rscratch2;
    cnt = cnt - remainder;
    mov(cnt_reg, cnt);
    // adjust base and prebias by -2 * wordSize so we can pre-increment
    add(loop_base, base, (remainder - 2) * wordSize);
    bind(loop);
    sub(cnt_reg, cnt_reg, 2 * unroll);
    for (i = 1; i < unroll; i++)
      stp(zr, zr, Address(loop_base, 2 * i * wordSize));
    // The pre-indexed store advances loop_base for the next iteration.
    stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
    cbnz(cnt_reg, loop);
  }
  BLOCK_COMMENT("} zero_words");
}

// Zero blocks of memory by using DC ZVA.
//
// Aligns the base address first sufficently for DC ZVA, then uses
// DC ZVA repeatedly for every full block.  cnt is the size to be
// zeroed in HeapWords.  Returns the count of words left to be zeroed
// in cnt.
//
// NOTE: This is intended to be used in the zero_blocks() stub.  If
// you want to use it elsewhere, note that cnt must be >= 2*zva_length.
void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
  Register tmp = rscratch1;
  Register tmp2 = rscratch2;
  int zva_length = VM_Version::zva_length();
  Label initial_table_end, loop_zva;
  Label fini;

  // Base must be 16 byte aligned. If not just return and let caller handle it
  tst(base, 0x0f);
  br(Assembler::NE, fini);
  // Align base with ZVA length.
  neg(tmp, base);
  andr(tmp, tmp, zva_length - 1);

  // tmp: the number of bytes to be filled to align the base with ZVA length.
  add(base, base, tmp);
  sub(cnt, cnt, tmp, Assembler::ASR, 3);
  // Computed jump into the stp table below: skip exactly the stores
  // not needed for the alignment prefix (each stp is 4 bytes, covers
  // 16 bytes, hence the LSR 2 scaling of the byte count).
  adr(tmp2, initial_table_end);
  sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
  br(tmp2);

  for (int i = -zva_length + 16; i < 0; i += 16)
    stp(zr, zr, Address(base, i));
  bind(initial_table_end);

  sub(cnt, cnt, zva_length >> 3);
  bind(loop_zva);
  dc(Assembler::ZVA, base);
  subs(cnt, cnt, zva_length >> 3);
  add(base, base, zva_length);
  br(Assembler::GE, loop_zva);
  add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
  bind(fini);
}

// base:   Address of a buffer to be filled, 8 bytes aligned.
// cnt:    Count in 8-byte unit.
// value:  Value to be filled with.
// base will point to the end of the buffer after filling.
void MacroAssembler::fill_words(Register base, Register cnt, Register value)
{
//  Algorithm:
//
//    scratch1 = cnt & 7;
//    cnt -= scratch1;
//    p += scratch1;
//    switch (scratch1) {
//      do {
//        cnt -= 8;
//          p[-8] = v;
//        case 7:
//          p[-7] = v;
//        case 6:
//          p[-6] = v;
//          // ...
//        case 1:
//          p[-1] = v;
//        case 0:
//          p += 8;
//      } while (cnt);
//    }

  assert_different_registers(base, cnt, value, rscratch1, rscratch2);

  Label fini, skip, entry, loop;
  const int unroll = 8; // Number of stp instructions we'll unroll

  cbz(cnt, fini);
  // Align base to 16 bytes with a single str so the stp loop below
  // always stores pairs.
  tbz(base, 3, skip);
  str(value, Address(post(base, 8)));
  sub(cnt, cnt, 1);
  bind(skip);

  // Duff's-device-style entry: compute how many of the unrolled stp
  // slots to skip on the first pass and branch into the middle of the
  // table (each stp encodes in 4 bytes, hence the LSL 1 of the
  // doubleword remainder).
  andr(rscratch1, cnt, (unroll-1) * 2);
  sub(cnt, cnt, rscratch1);
  add(base, base, rscratch1, Assembler::LSL, 3);
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
  br(rscratch2);

  bind(loop);
  add(base, base, unroll * 16);
  for (int i = -unroll; i < 0; i++)
    stp(value, value, Address(base, i * 16));
  bind(entry);
  subs(cnt, cnt, unroll * 2);
  br(Assembler::GE, loop);

  // Trailing odd word, if any.
  tbz(cnt, 0, fini);
  str(value, Address(post(base, 8)));
  bind(fini);
}

// Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
// java/lang/StringUTF16.compress.
// Encode a char[] (UTF-16) region at src into a byte[] (ISO-8859-1)
// region at dst.  Stops at the first char with a non-zero high byte.
//
// src:    source char array (2 bytes per element)
// dst:    destination byte array (1 byte per element)
// len:    number of chars to encode; clobbered (chars remaining)
// result: set to the number of chars successfully encoded
// Vtmp1..Vtmp4: SIMD scratch; also clobbers v4, v5, rscratch1, rscratch2.
void MacroAssembler::encode_iso_array(Register src, Register dst,
                      Register len, Register result,
                      FloatRegister Vtmp1, FloatRegister Vtmp2,
                      FloatRegister Vtmp3, FloatRegister Vtmp4)
{
    Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
        NEXT_32_START, NEXT_32_PRFM_START;
    Register tmp1 = rscratch1, tmp2 = rscratch2;

      mov(result, len); // Save initial len

#ifndef BUILTIN_SIM
      cmp(len, (u1)8); // handle shortest strings first
      br(LT, LOOP_1);
      cmp(len, (u1)32);
      br(LT, NEXT_8);
      // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
      // to convert chars to bytes
      if (SoftwarePrefetchHintDistance >= 0) {
        // 32-chars-at-a-time loop with software prefetch while far
        // enough from the end of the array.
        ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
        br(LE, NEXT_32_START);
        b(NEXT_32_PRFM_START);
        BIND(NEXT_32_PRFM);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_PRFM_START);
          prfm(Address(src, SoftwarePrefetchHintDistance));
          orr(v4, T16B, Vtmp1, Vtmp2);
          orr(v5, T16B, Vtmp3, Vtmp4);
          uzp1(Vtmp1, T16B, Vtmp1, Vtmp2); // low bytes of all 32 chars
          uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
          uzp2(v5, T16B, v4, v5); // high bytes
          // Any non-zero high byte means a non-latin1 char: bail to
          // the slower loops to locate the exact stopping point.
          umov(tmp2, v5, D, 1);
          fmovd(tmp1, v5);
          orr(tmp1, tmp1, tmp2);
          cbnz(tmp1, LOOP_8);
          stpq(Vtmp1, Vtmp3, dst);
          sub(len, len, 32);
          add(dst, dst, 32);
          add(src, src, 64);
          subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
          br(GE, NEXT_32_PRFM);
          cmp(len, (u1)32);
          br(LT, LOOP_8);
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_START);
      } else {
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
      }
      // Plain (non-prefetching) 32-chars-at-a-time tail of the SIMD loop.
      prfm(Address(src, SoftwarePrefetchHintDistance));
      uzp1(v4, T16B, Vtmp1, Vtmp2);
      uzp1(v5, T16B, Vtmp3, Vtmp4);
      orr(Vtmp1, T16B, Vtmp1, Vtmp2);
      orr(Vtmp3, T16B, Vtmp3, Vtmp4);
      uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
      umov(tmp2, Vtmp1, D, 1);
      fmovd(tmp1, Vtmp1);
      orr(tmp1, tmp1, tmp2);
      cbnz(tmp1, LOOP_8);
      stpq(v4, v5, dst);
      sub(len, len, 32);
      add(dst, dst, 32);
      add(src, src, 64);
      cmp(len, (u1)32);
      br(GE, NEXT_32);
      cbz(len, DONE);

    BIND(LOOP_8);
      cmp(len, (u1)8);
      br(LT, LOOP_1);
    BIND(NEXT_8);
      // 8 chars at a time.
      ld1(Vtmp1, T8H, src);
      uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
      uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
      fmovd(tmp1, Vtmp3);
      cbnz(tmp1, NEXT_1); // non-latin1 char in this group: go char-by-char
      strd(Vtmp2, dst);

      sub(len, len, 8);
      add(dst, dst, 8);
      add(src, src, 16);
      cmp(len, (u1)8);
      br(GE, NEXT_8);

    BIND(LOOP_1);
#endif
    cbz(len, DONE);
    BIND(NEXT_1);
      // Scalar loop: one char at a time; stop at the first char whose
      // high byte is set.
      ldrh(tmp1, Address(post(src, 2)));
      tst(tmp1, 0xff00);
      br(NE, SET_RESULT);
      strb(tmp1, Address(post(dst, 1)));
      subs(len, len, 1);
      br(GT, NEXT_1);

    BIND(SET_RESULT);
      sub(result, result, len); // Return index where we stopped
                                // Return len == 0 if we processed all
                                // characters
    BIND(DONE);
}


// Inflate byte[] array to char[].
// src:  source byte array
// dst:  destination char array
// len:  number of bytes to inflate; clobbered
// vtmp1..vtmp3, tmp4: scratch; also clobbers rscratch1.
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
                                        FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                        Register tmp4) {
  Label big, done, after_init, to_stub;

  assert_different_registers(src, dst, len, tmp4, rscratch1);

  fmovd(vtmp1, zr); // zero vector to interleave with via zip1
  lsrw(tmp4, len, 3); // tmp4 = number of full 8-byte groups
  bind(after_init);
  cbnzw(tmp4, big);
  // Short string: less than 8 bytes.
  {
    Label loop, tiny;

    cmpw(len, 4);
    br(LT, tiny);
    // Use SIMD to do 4 bytes.
    ldrs(vtmp2, post(src, 4));
    zip1(vtmp3, T8B, vtmp2, vtmp1);
    subw(len, len, 4);
    strd(vtmp3, post(dst, 8));

    cbzw(len, done);

    // Do the remaining bytes by steam.
    bind(loop);
    ldrb(tmp4, post(src, 1));
    strh(tmp4, post(dst, 2)); // zero-extended byte becomes a char
    subw(len, len, 1);

    bind(tiny);
    cbnz(len, loop);

    b(done);
  }

  if (SoftwarePrefetchHintDistance >= 0) {
    // Large arrays are handed off to a stub; control returns to
    // after_init with the leftover length in len.
    bind(to_stub);
      RuntimeAddress stub =  RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
      assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
      trampoline_call(stub);
      b(after_init);
  }

  // Unpack the bytes 8 at a time.
  bind(big);
  {
    Label loop, around, loop_last, loop_start;

    if (SoftwarePrefetchHintDistance >= 0) {
      const int large_loop_threshold = (64 + 16)/8;
      ldrd(vtmp2, post(src, 8));
      andw(len, len, 7); // len = tail bytes left after the 8-byte groups
      cmp(tmp4, (u1)large_loop_threshold);
      br(GE, to_stub);
      b(loop_start);

      // 2x-unrolled loop: load of the next group overlaps the
      // zip1/store of the previous one.
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      bind(loop_start);
      subs(tmp4, tmp4, 1);
      br(EQ, loop_last); // odd number of groups: finish the last alone
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      ldrd(vtmp3, post(src, 8));
      st1(vtmp2, T8H, post(dst, 16));
      subs(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp3, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      br(NE, loop);
      b(around);
      bind(loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      st1(vtmp2, T8H, post(dst, 16));
      bind(around);
      cbz(len, done);
    } else {
      // Simple one-group-per-iteration loop.
      andw(len, len, 7);
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      sub(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp2, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      cbnz(tmp4, loop);
    }
  }

  // Do the tail of up to 8 bytes.  Re-reads the last 8 source bytes
  // ending at src+len and re-writes the last 16 destination bytes, so
  // part of the previous group's output may be stored again.
  add(src, src, len);
  ldrd(vtmp3, Address(src, -8));
  add(dst, dst, len, ext::uxtw, 1);
  zip1(vtmp3, T16B, vtmp3, vtmp1);
  strq(vtmp3, Address(dst, -16));

  bind(done);
}

// Compress char[] array to byte[].
// Compress a char[] (UTF-16) region into a byte[] (latin1) region.
// result is the number of chars copied if every char fit in one byte,
// or 0 if a char >= 0x100 was encountered (compression failed).
// All other registers are as for encode_iso_array, which this wraps.
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
                                         FloatRegister tmp1Reg, FloatRegister tmp2Reg,
                                         FloatRegister tmp3Reg, FloatRegister tmp4Reg,
                                         Register result) {
  encode_iso_array(src, dst, len, result,
                   tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
  // encode_iso_array leaves len == 0 iff all chars were processed;
  // otherwise report failure by forcing result to 0.
  cmp(len, zr);
  csel(result, result, zr, EQ);
}

// get_thread() can be called anywhere inside generated code so we
// need to save whatever non-callee save context might get clobbered
// by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
// the call setup code.
//
// aarch64_get_thread_helper() clobbers only r0, r1, and flags.
//
void MacroAssembler::get_thread(Register dst) {
  // Save r0, r1 and lr, except when one of them is the destination.
  RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
  push(saved_regs, sp);

  mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
  blrt(lr, 1, 0, 1);
  // The helper returns the current JavaThread* in c_rarg0 (r0).
  if (dst != c_rarg0) {
    mov(dst, c_rarg0);
  }

  pop(saved_regs, sp);
}

// DMS TODO ValueType MachVVEPNode support
// Placeholder: value-type argument unpacking for the verified value
// entry point is not implemented on aarch64 yet; always aborts.
void MacroAssembler::unpack_value_args(Compile* C) {
  // Not implemented
  guarantee(false, "Support for MachVVEPNode is not implemented");
}