1 /* 2 * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 
23 * 24 */ 25 26 #include <sys/types.h> 27 28 #include "precompiled.hpp" 29 #include "jvm.h" 30 #include "asm/assembler.hpp" 31 #include "asm/assembler.inline.hpp" 32 #include "gc/shared/barrierSet.hpp" 33 #include "gc/shared/cardTable.hpp" 34 #include "gc/shared/barrierSetAssembler.hpp" 35 #include "gc/shared/cardTableBarrierSet.hpp" 36 #include "interpreter/interpreter.hpp" 37 #include "compiler/disassembler.hpp" 38 #include "memory/resourceArea.hpp" 39 #include "nativeInst_aarch64.hpp" 40 #include "oops/accessDecorators.hpp" 41 #include "oops/compressedOops.inline.hpp" 42 #include "oops/klass.inline.hpp" 43 #include "runtime/biasedLocking.hpp" 44 #include "runtime/icache.hpp" 45 #include "runtime/interfaceSupport.inline.hpp" 46 #include "runtime/jniHandles.inline.hpp" 47 #include "runtime/sharedRuntime.hpp" 48 #include "runtime/thread.hpp" 49 #ifdef COMPILER1 50 #include "c1/c1_LIRAssembler.hpp" 51 #endif 52 #ifdef COMPILER2 53 #include "oops/oop.hpp" 54 #include "opto/compile.hpp" 55 #include "opto/intrinsicnode.hpp" 56 #include "opto/node.hpp" 57 #endif 58 59 #ifdef PRODUCT 60 #define BLOCK_COMMENT(str) /* nothing */ 61 #define STOP(error) stop(error) 62 #else 63 #define BLOCK_COMMENT(str) block_comment(str) 64 #define STOP(error) block_comment(error); stop(error) 65 #endif 66 67 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 68 69 // Patch any kind of instruction; there may be several instructions. 70 // Return the total length (in bytes) of the instructions. 
// Dispatches on the encoding of the instruction found at `branch` and
// rewrites its immediate field(s) so that the instruction (or the short
// instruction sequence starting there) refers to `target`.
//
//   branch - address of the first instruction to patch
//   target - address the patched code must refer to
//
// Returns the size in bytes of the patched sequence: the number of
// instructions touched (1, 2 or 3) times NativeInstruction::instruction_size.
// An encoding that matches none of the cases ends in ShouldNotReachHere().
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  // Default displacement: byte distance from branch to target, shifted
  // right by two.  The PC-relative case below recomputes it.
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target-branch;
    // Bit 31 selects the page-relative form (the adrp sequences handled
    // below); when it is clear the plain byte offset computed above is used.
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      u_int64_t dest = (u_int64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
      //
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        // The in-page offset is stored scaled down by the access size
        // taken from bits 31:30 of the second instruction.
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                    21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                   Instruction_aarch64::extract(insn, 4, 0) ==
                     Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32
        // The movk receives bits 32 and up of the target; the adrp page
        // delta is then recomputed from the low 32 bits of the target
        // combined with bits 47:32 of the address of this instruction.
        // (dest, pc_page and adr_page below shadow the outer locals.)
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    // The two low bits of the offset go into bits 30:29, the remaining
    // bits into bits 23:5.
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant
    // Three instructions, each receiving one 16-bit slice of the target
    // (bits 15:0, 31:16 and 47:32 respectively).
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}

// Rewrites the 16-bit immediates of the move sequence at insn_addr so that
// it materialises the oop `o`.
//
// The first instruction decides the form: if its bits 31:21 equal
// 0b11010010101 the oop is encoded with CompressedOops::encode() and written
// as two 16-bit halves (upper half first); otherwise the raw pointer value is
// written as three 16-bit slices.
//
// Returns the size in bytes of the patched sequence (2 or 3 instructions).
int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = CompressedOops::encode((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}

// Rewrites the two-instruction move sequence at insn_addr so that it
// materialises the narrow klass value `n`: the upper 16 bits go into the
// first instruction, the lower 16 bits into the second.
//
// Returns the size in bytes of the two patched instructions.
int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

// Decodes the address referred to by the instruction `insn` located at
// insn_addr; the inverse of pd_patch_instruction_size().
//
//   insn_addr - address of the instruction; following instructions are
//               also read for the multi-instruction forms
//   insn      - encoding of the instruction at insn_addr
//
// Returns the decoded target, or 0 for the polling page load form.
// An encoding that matches none of the cases ends in ShouldNotReachHere().
address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  // NOTE(review): pd_patch_instruction_size() tests this same encoding with
  // mask 0b111011; the mask used here (0b011011) additionally ignores bit 29,
  // so it accepts more encodings.  Confirm the looser match is intended.
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    // Reassemble the immediate: bits 30:29 are its two low bits,
    // bits 23:5 the (signed) remainder.
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
               Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
          // movk on the same register: its 16-bit immediate supplies
          // bits 32 and up of the result.
          target_page = (target_page & 0xffffffff) |
                         ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      // NOTE(review): pd_patch_instruction_size() does patch the form with
      // bit 31 clear, but it cannot be decoded here.  Confirm no caller
      // needs to read back such a target.
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    u_int32_t *insns = (u_int32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // Same encoding that pd_patch_instruction_size() treats as the
    // polling page load; it carries no target.
    return 0;
  } else {
    ShouldNotReachHere();
  }
  // Branch forms: the decoded offset is scaled by 4 (<< 2) and applied
  // relative to the instruction address.
  return address(((uint64_t)insn_addr + (offset << 2)));
}

// Emits a safepoint poll that branches to slow_path when a safepoint is
// pending.
//
// With thread-local polling the polling word is loaded from the thread and
// the poll bit is tested; otherwise the word at
// SafepointSynchronize::address_of_state() is loaded and compared against
// zero.  rscratch1 is overwritten in both cases.
void MacroAssembler::safepoint_poll(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    unsigned long offset;
    adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
    ldrw(rscratch1, Address(rscratch1, offset));
    // cbnz below relies on "not synchronized" being encoded as zero.
    assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
    cbnz(rscratch1, slow_path);
  }
}

// Just like safepoint_poll, but use an acquiring load for thread-
// local polling.
//
// We need an acquire here to ensure that any subsequent load of the
// global SafepointSynchronize::_state flag is ordered after this load
// of the local Thread::_polling page.  We don't want this poll to
// return false (i.e. not safepointing) and a later poll of the global
// SafepointSynchronize::_state spuriously to return true.
//
// This is to avoid a race when we're in a native->Java transition
// racing the code which wakes up from a safepoint.
315 // 316 void MacroAssembler::safepoint_poll_acquire(Label& slow_path) { 317 if (SafepointMechanism::uses_thread_local_poll()) { 318 lea(rscratch1, Address(rthread, Thread::polling_page_offset())); 319 ldar(rscratch1, rscratch1); 320 tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path); 321 } else { 322 safepoint_poll(slow_path); 323 } 324 } 325 326 void MacroAssembler::reset_last_Java_frame(bool clear_fp) { 327 // we must set sp to zero to clear frame 328 str(zr, Address(rthread, JavaThread::last_Java_sp_offset())); 329 330 // must clear fp, so that compiled frames are not confused; it is 331 // possible that we need it only for debugging 332 if (clear_fp) { 333 str(zr, Address(rthread, JavaThread::last_Java_fp_offset())); 334 } 335 336 // Always clear the pc because it could have been set by make_walkable() 337 str(zr, Address(rthread, JavaThread::last_Java_pc_offset())); 338 } 339 340 // Calls to C land 341 // 342 // When entering C land, the rfp, & resp of the last Java frame have to be recorded 343 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp 344 // has to be reset to 0. This is required to allow proper stack traversal. 
// Records the last Java frame in the current thread (rthread).
//
//   last_java_sp - register holding the sp to record.  If it is `sp` the
//                  value is first copied into `scratch`; if it is not a
//                  valid register, esp is recorded instead.
//   last_java_fp - recorded only if it is a valid register
//   last_java_pc - recorded only if it is a valid register
//   scratch      - temporary, written only when last_java_sp == sp
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
      str(last_java_pc, Address(rthread,
                                JavaThread::frame_anchor_offset()
                                + JavaFrameAnchor::last_Java_pc_offset()));
    }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

// As above, but the pc to record is given as a code address.  The address
// (or the current pc() when last_java_pc is NULL) is formed in `scratch`
// with adr and stored, then sp and fp are recorded through the register
// overload with no pc register.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  if (last_java_pc != NULL) {
    adr(scratch, last_java_pc);
  } else {
    // FIXME: This is almost never correct.  We should delete all
    // cases of set_last_Java_frame with last_java_pc=NULL and use the
    // correct return address instead.
    adr(scratch, pc());
  }

  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

// As above, but the pc to record is given as a label.  A bound label is
// resolved to its target address immediately.  For an unbound label a patch
// site is registered on the label at the current position and the address
// overload is invoked with a NULL pc.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch);
  }
}

// Emits a call to `entry`, which must lie inside the code cache.
//
// When far_branches() is true the target address is built in `tmp` with
// adrp + add and called with blr; otherwise a direct bl is emitted.
// If `cbuf` is non-NULL its instruction mark is set immediately before the
// call instruction.
void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    blr(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}

// Emits a jump to `entry`, which must lie inside the code cache.  Same
// structure as far_call(), using br / b instead of blr / bl.
void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  // NOTE(review): the message below says "far call" although this is
  // far_jump - looks copied from far_call(); confirm before changing.
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    br(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}

// Emits the reserved stack zone check.  sp is compared (unsigned) against
// the thread's reserved_stack_activation value; if sp is lower the emitted
// code skips everything else.  Otherwise it calls
// SharedRuntime::enable_stack_reserved_zone(rthread) inside a temporary
// frame and then jumps to the delayed StackOverflowError stub.
// rscratch1 and c_rarg0 are overwritten on that path.
void MacroAssembler::reserved_stack_check() {
    // testing if reserved zone needs to be enabled
    Label no_reserved_zone_enabling;

    ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
    cmp(sp, rscratch1);
    br(Assembler::LO, no_reserved_zone_enabling);

    enter();   // LR and FP are live.
    lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
    mov(c_rarg0, rthread);
    blr(rscratch1);
    leave();

    // We have already removed our own frame.
    // throw_delayed_StackOverflowError will think that it's been
    // called by our caller.
    lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
    br(rscratch1);
    should_not_reach_here();

    bind(no_reserved_zone_enabling);
}

// Emits the biased-locking fast path for monitor enter.
//
//   lock_reg               - only used for register-distinctness checks and
//                            the (unused) saved_mark_addr in this function
//   obj_reg                - object being locked
//   swap_reg               - holds (or receives) the object's mark word
//   tmp_reg                - temporary
//   swap_reg_contains_mark - if false, the mark word is loaded into swap_reg
//                            here
//   done                   - branched to when the lock has been acquired via
//                            biasing
//   slow_case              - passed to cmpxchg_obj_header() as the failure
//                            target of the bias / rebias attempts
//   counters               - optional statistics counters; defaults to
//                            BiasedLocking::counters() when
//                            PrintBiasedLockingStatistics is set
//
// Execution falls through at the end (label cas_label) when the object is
// not biasable or its bias was revoked, so the caller continues with the
// normal CAS-based locking scheme.  rscratch1 and rscratch2 are used as
// temporaries.
//
// Returns the code offset of the mark word load, or -1 if
// swap_reg_contains_mark is true and no load was emitted.
int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(tmp_reg, (u1)markOopDesc::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1, rscratch2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

// Emits the biased-locking check for monitor exit: loads the mark word of
// obj_reg into temp_reg and branches to `done` when its lock bits show the
// biased pattern.  temp_reg is overwritten.
void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(temp_reg, (u1)markOopDesc::biased_lock_pattern);
  br(Assembler::EQ, done);
}

// Moves arg into c_rarg0 unless it is already there.
static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg ) {
    masm->mov(c_rarg0, arg);
  }
}

// Moves arg into c_rarg1 unless it is already there.
static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg ) {
    masm->mov(c_rarg1, arg);
  }
}

// Moves arg into c_rarg2 unless it is already there.
static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg ) {
    masm->mov(c_rarg2, arg);
  }
}

// Moves arg into c_rarg3 unless it is already there.
static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg ) {
    masm->mov(c_rarg3, arg);
  }
}

// Common implementation of the call_VM family.
//
//   oop_result          - if valid, receives the VM result after the call
//   java_thread         - thread register; rthread is used when invalid, and
//                         anything other than rthread trips an assert
//   last_java_sp        - sp to record for the last Java frame; esp is used
//                         when invalid.  Must not be rfp.
//   entry_point         - runtime entry to call
//   number_of_arguments - argument count, forwarded to call_VM_leaf_base()
//   check_exceptions    - if true, emit a pending-exception check that jumps
//                         to the forward-exception stub
//
// The thread is passed in c_rarg0.  The last Java frame is set before the
// call and reset (including fp) afterwards.  rscratch1 is used as a
// temporary.
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
   // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  // The label is unbound here; it is handed to call_VM_leaf_base() below.
  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

   // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

// call_VM_base() with the default thread and last_java_sp registers.
void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline.  If the code cache is small
// trampolines won't be emitted.
//
// Returns NULL if a required trampoline stub could not be emitted,
// otherwise a non-NULL address (the pc after the call instruction).
// With far branches the emitted bl targets its own pc; presumably the call
// is redirected later via the relocation - TODO confirm.

address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  // We need a trampoline if branches are far.
  if (far_branches()) {
    bool in_scratch_emit_size = false;
#ifdef COMPILER2
    // We don't want to emit a trampoline if C2 is generating dummy
    // code during its branch shortening phase.
    CompileTask* task = ciEnv::current()->task();
    in_scratch_emit_size =
      (task != NULL && is_c2_compile(task->comp_level()) &&
       Compile::current()->in_scratch_emit_size());
#endif
    if (!in_scratch_emit_size) {
      address stub = emit_trampoline_stub(offset(), entry.target());
      if (stub == NULL) {
        return NULL; // CodeCache is full
      }
    }
  }

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
  if (!far_branches()) {
    bl(entry.target());
  } else {
    bl(pc());
  }
  // just need to return a non-null address
  return pc();
}


// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)
//
//   insts_call_instruction_offset - offset of the call instruction within
//                                   the instructions section
//   dest                          - call target stored in the stub's data word
//
// Returns the start address of the stub, or NULL if no stub space could be
// obtained.

address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  // Max stub size: alignment nop, TrampolineStub.
  address stub = start_a_stub(NativeInstruction::instruction_size
                   + NativeCallTrampolineStub::instruction_size);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call
  // - call
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  // 64-bit data word read by the ldr above.
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}

// Normalises a C-style boolean held in x to 0 or 1, in place.
void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
  tst(x, 0xff);
  cset(x, Assembler::NE);
}

// Emits an inline-cache call to `entry`: loads Universe::non_oop_word()
// into rscratch2 and then emits a trampoline call carrying a virtual call
// relocation for the current pc and method_index.
//
// Returns whatever trampoline_call() returns (NULL on failure).
address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions
//
// In the overloads below, arguments are moved into their c_rarg registers
// highest-numbered first.  Each assert checks that an argument which is
// still to be moved does not live in a register that is about to be
// overwritten.

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

// Variants taking an explicit last_java_sp; these go through call_VM_base()
// with rthread as the thread register.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result,
last_java_sp, entry_point, 3, check_exceptions); 927 } 928 929 930 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) { 931 ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset())); 932 str(zr, Address(java_thread, JavaThread::vm_result_offset())); 933 verify_oop(oop_result, "broken oop in call_VM_base"); 934 } 935 936 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) { 937 ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset())); 938 str(zr, Address(java_thread, JavaThread::vm_result_2_offset())); 939 } 940 941 void MacroAssembler::align(int modulus) { 942 while (offset() % modulus != 0) nop(); 943 } 944 945 // these are no-ops overridden by InterpreterMacroAssembler 946 947 void MacroAssembler::check_and_handle_earlyret(Register java_thread) { } 948 949 void MacroAssembler::check_and_handle_popframe(Register java_thread) { } 950 951 952 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr, 953 Register tmp, 954 int offset) { 955 intptr_t value = *delayed_value_addr; 956 if (value != 0) 957 return RegisterOrConstant(value + offset); 958 959 // load indirectly to solve generation ordering problem 960 ldr(tmp, ExternalAddress((address) delayed_value_addr)); 961 962 if (offset != 0) 963 add(tmp, tmp, offset); 964 965 return RegisterOrConstant(tmp); 966 } 967 968 969 void MacroAssembler:: notify(int type) { 970 if (type == bytecode_start) { 971 // set_last_Java_frame(esp, rfp, (address)NULL); 972 Assembler:: notify(type); 973 // reset_last_Java_frame(true); 974 } 975 else 976 Assembler:: notify(type); 977 } 978 979 // Look up the method for a megamorphic invokeinterface call. 980 // The target method is determined by <intf_klass, itable_index>. 981 // The receiver klass is in recv_klass. 982 // On success, the result will be in method_result, and execution falls through. 983 // On failure, execution transfers to the given label. 
984 void MacroAssembler::lookup_interface_method(Register recv_klass, 985 Register intf_klass, 986 RegisterOrConstant itable_index, 987 Register method_result, 988 Register scan_temp, 989 Label& L_no_such_interface, 990 bool return_method) { 991 assert_different_registers(recv_klass, intf_klass, scan_temp); 992 assert_different_registers(method_result, intf_klass, scan_temp); 993 assert(recv_klass != method_result || !return_method, 994 "recv_klass can be destroyed when method isn't needed"); 995 assert(itable_index.is_constant() || itable_index.as_register() == method_result, 996 "caller must use same register for non-constant itable index as for method"); 997 998 // Compute start of first itableOffsetEntry (which is at the end of the vtable) 999 int vtable_base = in_bytes(Klass::vtable_start_offset()); 1000 int itentry_off = itableMethodEntry::method_offset_in_bytes(); 1001 int scan_step = itableOffsetEntry::size() * wordSize; 1002 int vte_size = vtableEntry::size_in_bytes(); 1003 assert(vte_size == wordSize, "else adjust times_vte_scale"); 1004 1005 ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset())); 1006 1007 // %%% Could store the aligned, prescaled offset in the klassoop. 1008 // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base)); 1009 lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3))); 1010 add(scan_temp, scan_temp, vtable_base); 1011 1012 if (return_method) { 1013 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 
1014 assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 1015 // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off)); 1016 lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3))); 1017 if (itentry_off) 1018 add(recv_klass, recv_klass, itentry_off); 1019 } 1020 1021 // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) { 1022 // if (scan->interface() == intf) { 1023 // result = (klass + scan->offset() + itable_index); 1024 // } 1025 // } 1026 Label search, found_method; 1027 1028 for (int peel = 1; peel >= 0; peel--) { 1029 ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes())); 1030 cmp(intf_klass, method_result); 1031 1032 if (peel) { 1033 br(Assembler::EQ, found_method); 1034 } else { 1035 br(Assembler::NE, search); 1036 // (invert the test to fall through to found_method...) 1037 } 1038 1039 if (!peel) break; 1040 1041 bind(search); 1042 1043 // Check that the previous entry is non-null. A null entry means that 1044 // the receiver class doesn't implement the interface, and wasn't the 1045 // same as when the caller was compiled. 1046 cbz(method_result, L_no_such_interface); 1047 add(scan_temp, scan_temp, scan_step); 1048 } 1049 1050 bind(found_method); 1051 1052 // Got a hit. 
1053 if (return_method) { 1054 ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes())); 1055 ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0))); 1056 } 1057 } 1058 1059 // virtual method calling 1060 void MacroAssembler::lookup_virtual_method(Register recv_klass, 1061 RegisterOrConstant vtable_index, 1062 Register method_result) { 1063 const int base = in_bytes(Klass::vtable_start_offset()); 1064 assert(vtableEntry::size() * wordSize == 8, 1065 "adjust the scaling in the code below"); 1066 int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes(); 1067 1068 if (vtable_index.is_register()) { 1069 lea(method_result, Address(recv_klass, 1070 vtable_index.as_register(), 1071 Address::lsl(LogBytesPerWord))); 1072 ldr(method_result, Address(method_result, vtable_offset_in_bytes)); 1073 } else { 1074 vtable_offset_in_bytes += vtable_index.as_constant() * wordSize; 1075 ldr(method_result, 1076 form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0)); 1077 } 1078 } 1079 1080 void MacroAssembler::check_klass_subtype(Register sub_klass, 1081 Register super_klass, 1082 Register temp_reg, 1083 Label& L_success) { 1084 Label L_failure; 1085 check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL); 1086 check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL); 1087 bind(L_failure); 1088 } 1089 1090 1091 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 1092 Register super_klass, 1093 Register temp_reg, 1094 Label* L_success, 1095 Label* L_failure, 1096 Label* L_slow_path, 1097 RegisterOrConstant super_check_offset) { 1098 assert_different_registers(sub_klass, super_klass, temp_reg); 1099 bool must_load_sco = (super_check_offset.constant_or_zero() == -1); 1100 if (super_check_offset.is_register()) { 1101 assert_different_registers(sub_klass, super_klass, 1102 super_check_offset.as_register()); 1103 } else if (must_load_sco) { 
1104 assert(temp_reg != noreg, "supply either a temp or a register offset"); 1105 } 1106 1107 Label L_fallthrough; 1108 int label_nulls = 0; 1109 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 1110 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 1111 if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; } 1112 assert(label_nulls <= 1, "at most one NULL in the batch"); 1113 1114 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1115 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1116 Address super_check_offset_addr(super_klass, sco_offset); 1117 1118 // Hacked jmp, which may only be used just before L_fallthrough. 1119 #define final_jmp(label) \ 1120 if (&(label) == &L_fallthrough) { /*do nothing*/ } \ 1121 else b(label) /*omit semi*/ 1122 1123 // If the pointers are equal, we are done (e.g., String[] elements). 1124 // This self-check enables sharing of secondary supertype arrays among 1125 // non-primary types such as array-of-interface. Otherwise, each such 1126 // type would need its own customized SSA. 1127 // We move this check to the front of the fast path because many 1128 // type checks are in fact trivially successful in this manner, 1129 // so we get a nicely predicted branch right at the start of the check. 1130 cmp(sub_klass, super_klass); 1131 br(Assembler::EQ, *L_success); 1132 1133 // Check the supertype display: 1134 if (must_load_sco) { 1135 ldrw(temp_reg, super_check_offset_addr); 1136 super_check_offset = RegisterOrConstant(temp_reg); 1137 } 1138 Address super_check_addr(sub_klass, super_check_offset); 1139 ldr(rscratch1, super_check_addr); 1140 cmp(super_klass, rscratch1); // load displayed supertype 1141 1142 // This check has worked decisively for primary supers. 1143 // Secondary supers are sought in the super_cache ('super_cache_addr'). 1144 // (Secondary supers are interfaces and very deeply nested subtypes.) 
1145 // This works in the same check above because of a tricky aliasing 1146 // between the super_cache and the primary super display elements. 1147 // (The 'super_check_addr' can address either, as the case requires.) 1148 // Note that the cache is updated below if it does not help us find 1149 // what we need immediately. 1150 // So if it was a primary super, we can just fail immediately. 1151 // Otherwise, it's the slow path for us (no success at this point). 1152 1153 if (super_check_offset.is_register()) { 1154 br(Assembler::EQ, *L_success); 1155 subs(zr, super_check_offset.as_register(), sc_offset); 1156 if (L_failure == &L_fallthrough) { 1157 br(Assembler::EQ, *L_slow_path); 1158 } else { 1159 br(Assembler::NE, *L_failure); 1160 final_jmp(*L_slow_path); 1161 } 1162 } else if (super_check_offset.as_constant() == sc_offset) { 1163 // Need a slow path; fast failure is impossible. 1164 if (L_slow_path == &L_fallthrough) { 1165 br(Assembler::EQ, *L_success); 1166 } else { 1167 br(Assembler::NE, *L_slow_path); 1168 final_jmp(*L_success); 1169 } 1170 } else { 1171 // No slow path; it's a fast decision. 
1172 if (L_failure == &L_fallthrough) { 1173 br(Assembler::EQ, *L_success); 1174 } else { 1175 br(Assembler::NE, *L_failure); 1176 final_jmp(*L_success); 1177 } 1178 } 1179 1180 bind(L_fallthrough); 1181 1182 #undef final_jmp 1183 } 1184 1185 // These two are taken from x86, but they look generally useful 1186 1187 // scans count pointer sized words at [addr] for occurence of value, 1188 // generic 1189 void MacroAssembler::repne_scan(Register addr, Register value, Register count, 1190 Register scratch) { 1191 Label Lloop, Lexit; 1192 cbz(count, Lexit); 1193 bind(Lloop); 1194 ldr(scratch, post(addr, wordSize)); 1195 cmp(value, scratch); 1196 br(EQ, Lexit); 1197 sub(count, count, 1); 1198 cbnz(count, Lloop); 1199 bind(Lexit); 1200 } 1201 1202 // scans count 4 byte words at [addr] for occurence of value, 1203 // generic 1204 void MacroAssembler::repne_scanw(Register addr, Register value, Register count, 1205 Register scratch) { 1206 Label Lloop, Lexit; 1207 cbz(count, Lexit); 1208 bind(Lloop); 1209 ldrw(scratch, post(addr, wordSize)); 1210 cmpw(value, scratch); 1211 br(EQ, Lexit); 1212 sub(count, count, 1); 1213 cbnz(count, Lloop); 1214 bind(Lexit); 1215 } 1216 1217 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 1218 Register super_klass, 1219 Register temp_reg, 1220 Register temp2_reg, 1221 Label* L_success, 1222 Label* L_failure, 1223 bool set_cond_codes) { 1224 assert_different_registers(sub_klass, super_klass, temp_reg); 1225 if (temp2_reg != noreg) 1226 assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1); 1227 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg) 1228 1229 Label L_fallthrough; 1230 int label_nulls = 0; 1231 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 1232 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 1233 assert(label_nulls <= 1, "at most one NULL in the batch"); 1234 1235 // a couple of useful fields in sub_klass: 1236 int 
ss_offset = in_bytes(Klass::secondary_supers_offset()); 1237 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1238 Address secondary_supers_addr(sub_klass, ss_offset); 1239 Address super_cache_addr( sub_klass, sc_offset); 1240 1241 BLOCK_COMMENT("check_klass_subtype_slow_path"); 1242 1243 // Do a linear scan of the secondary super-klass chain. 1244 // This code is rarely used, so simplicity is a virtue here. 1245 // The repne_scan instruction uses fixed registers, which we must spill. 1246 // Don't worry too much about pre-existing connections with the input regs. 1247 1248 assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super) 1249 assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter) 1250 1251 RegSet pushed_registers; 1252 if (!IS_A_TEMP(r2)) pushed_registers += r2; 1253 if (!IS_A_TEMP(r5)) pushed_registers += r5; 1254 1255 if (super_klass != r0 || UseCompressedOops) { 1256 if (!IS_A_TEMP(r0)) pushed_registers += r0; 1257 } 1258 1259 push(pushed_registers, sp); 1260 1261 // Get super_klass value into r0 (even if it was in r5 or r2). 1262 if (super_klass != r0) { 1263 mov(r0, super_klass); 1264 } 1265 1266 #ifndef PRODUCT 1267 mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr); 1268 Address pst_counter_addr(rscratch2); 1269 ldr(rscratch1, pst_counter_addr); 1270 add(rscratch1, rscratch1, 1); 1271 str(rscratch1, pst_counter_addr); 1272 #endif //PRODUCT 1273 1274 // We will consult the secondary-super array. 1275 ldr(r5, secondary_supers_addr); 1276 // Load the array length. 1277 ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes())); 1278 // Skip to start of data. 1279 add(r5, r5, Array<Klass*>::base_offset_in_bytes()); 1280 1281 cmp(sp, zr); // Clear Z flag; SP is never zero 1282 // Scan R2 words at [R5] for an occurrence of R0. 1283 // Set NZ/Z based on last compare. 1284 repne_scan(r5, r0, r2, rscratch1); 1285 1286 // Unspill the temp. 
registers: 1287 pop(pushed_registers, sp); 1288 1289 br(Assembler::NE, *L_failure); 1290 1291 // Success. Cache the super we found and proceed in triumph. 1292 str(super_klass, super_cache_addr); 1293 1294 if (L_success != &L_fallthrough) { 1295 b(*L_success); 1296 } 1297 1298 #undef IS_A_TEMP 1299 1300 bind(L_fallthrough); 1301 } 1302 1303 1304 void MacroAssembler::verify_oop(Register reg, const char* s) { 1305 if (!VerifyOops) return; 1306 1307 // Pass register number to verify_oop_subroutine 1308 const char* b = NULL; 1309 { 1310 ResourceMark rm; 1311 stringStream ss; 1312 ss.print("verify_oop: %s: %s", reg->name(), s); 1313 b = code_string(ss.as_string()); 1314 } 1315 BLOCK_COMMENT("verify_oop {"); 1316 1317 stp(r0, rscratch1, Address(pre(sp, -2 * wordSize))); 1318 stp(rscratch2, lr, Address(pre(sp, -2 * wordSize))); 1319 1320 mov(r0, reg); 1321 mov(rscratch1, (address)b); 1322 1323 // call indirectly to solve generation ordering problem 1324 lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address())); 1325 ldr(rscratch2, Address(rscratch2)); 1326 blr(rscratch2); 1327 1328 ldp(rscratch2, lr, Address(post(sp, 2 * wordSize))); 1329 ldp(r0, rscratch1, Address(post(sp, 2 * wordSize))); 1330 1331 BLOCK_COMMENT("} verify_oop"); 1332 } 1333 1334 void MacroAssembler::verify_oop_addr(Address addr, const char* s) { 1335 if (!VerifyOops) return; 1336 1337 const char* b = NULL; 1338 { 1339 ResourceMark rm; 1340 stringStream ss; 1341 ss.print("verify_oop_addr: %s", s); 1342 b = code_string(ss.as_string()); 1343 } 1344 BLOCK_COMMENT("verify_oop_addr {"); 1345 1346 stp(r0, rscratch1, Address(pre(sp, -2 * wordSize))); 1347 stp(rscratch2, lr, Address(pre(sp, -2 * wordSize))); 1348 1349 // addr may contain sp so we will have to adjust it based on the 1350 // pushes that we just did. 
1351 if (addr.uses(sp)) { 1352 lea(r0, addr); 1353 ldr(r0, Address(r0, 4 * wordSize)); 1354 } else { 1355 ldr(r0, addr); 1356 } 1357 mov(rscratch1, (address)b); 1358 1359 // call indirectly to solve generation ordering problem 1360 lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address())); 1361 ldr(rscratch2, Address(rscratch2)); 1362 blr(rscratch2); 1363 1364 ldp(rscratch2, lr, Address(post(sp, 2 * wordSize))); 1365 ldp(r0, rscratch1, Address(post(sp, 2 * wordSize))); 1366 1367 BLOCK_COMMENT("} verify_oop_addr"); 1368 } 1369 1370 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot, 1371 int extra_slot_offset) { 1372 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 1373 int stackElementSize = Interpreter::stackElementSize; 1374 int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0); 1375 #ifdef ASSERT 1376 int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1); 1377 assert(offset1 - offset == stackElementSize, "correct arithmetic"); 1378 #endif 1379 if (arg_slot.is_constant()) { 1380 return Address(esp, arg_slot.as_constant() * stackElementSize 1381 + offset); 1382 } else { 1383 add(rscratch1, esp, arg_slot.as_register(), 1384 ext::uxtx, exact_log2(stackElementSize)); 1385 return Address(rscratch1, offset); 1386 } 1387 } 1388 1389 void MacroAssembler::call_VM_leaf_base(address entry_point, 1390 int number_of_arguments, 1391 Label *retaddr) { 1392 call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr); 1393 } 1394 1395 void MacroAssembler::call_VM_leaf_base1(address entry_point, 1396 int number_of_gp_arguments, 1397 int number_of_fp_arguments, 1398 ret_type type, 1399 Label *retaddr) { 1400 Label E, L; 1401 1402 stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize))); 1403 1404 // We add 1 to number_of_arguments because the thread in arg0 is 1405 // not counted 1406 mov(rscratch1, entry_point); 1407 blrt(rscratch1, number_of_gp_arguments + 1, 
number_of_fp_arguments, type); 1408 if (retaddr) 1409 bind(*retaddr); 1410 1411 ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize))); 1412 maybe_isb(); 1413 } 1414 1415 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) { 1416 call_VM_leaf_base(entry_point, number_of_arguments); 1417 } 1418 1419 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) { 1420 pass_arg0(this, arg_0); 1421 call_VM_leaf_base(entry_point, 1); 1422 } 1423 1424 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) { 1425 pass_arg0(this, arg_0); 1426 pass_arg1(this, arg_1); 1427 call_VM_leaf_base(entry_point, 2); 1428 } 1429 1430 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, 1431 Register arg_1, Register arg_2) { 1432 pass_arg0(this, arg_0); 1433 pass_arg1(this, arg_1); 1434 pass_arg2(this, arg_2); 1435 call_VM_leaf_base(entry_point, 3); 1436 } 1437 1438 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) { 1439 pass_arg0(this, arg_0); 1440 MacroAssembler::call_VM_leaf_base(entry_point, 1); 1441 } 1442 1443 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) { 1444 1445 assert(arg_0 != c_rarg1, "smashed arg"); 1446 pass_arg1(this, arg_1); 1447 pass_arg0(this, arg_0); 1448 MacroAssembler::call_VM_leaf_base(entry_point, 2); 1449 } 1450 1451 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) { 1452 assert(arg_0 != c_rarg2, "smashed arg"); 1453 assert(arg_1 != c_rarg2, "smashed arg"); 1454 pass_arg2(this, arg_2); 1455 assert(arg_0 != c_rarg1, "smashed arg"); 1456 pass_arg1(this, arg_1); 1457 pass_arg0(this, arg_0); 1458 MacroAssembler::call_VM_leaf_base(entry_point, 3); 1459 } 1460 1461 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) { 1462 assert(arg_0 != c_rarg3, "smashed arg"); 1463 
assert(arg_1 != c_rarg3, "smashed arg"); 1464 assert(arg_2 != c_rarg3, "smashed arg"); 1465 pass_arg3(this, arg_3); 1466 assert(arg_0 != c_rarg2, "smashed arg"); 1467 assert(arg_1 != c_rarg2, "smashed arg"); 1468 pass_arg2(this, arg_2); 1469 assert(arg_0 != c_rarg1, "smashed arg"); 1470 pass_arg1(this, arg_1); 1471 pass_arg0(this, arg_0); 1472 MacroAssembler::call_VM_leaf_base(entry_point, 4); 1473 } 1474 1475 void MacroAssembler::null_check(Register reg, int offset) { 1476 if (needs_explicit_null_check(offset)) { 1477 // provoke OS NULL exception if reg = NULL by 1478 // accessing M[reg] w/o changing any registers 1479 // NOTE: this is plenty to provoke a segv 1480 ldr(zr, Address(reg)); 1481 } else { 1482 // nothing to do, (later) access of M[reg + offset] 1483 // will provoke OS NULL exception if reg = NULL 1484 } 1485 } 1486 1487 // MacroAssembler protected routines needed to implement 1488 // public methods 1489 1490 void MacroAssembler::mov(Register r, Address dest) { 1491 code_section()->relocate(pc(), dest.rspec()); 1492 u_int64_t imm64 = (u_int64_t)dest.target(); 1493 movptr(r, imm64); 1494 } 1495 1496 // Move a constant pointer into r. In AArch64 mode the virtual 1497 // address space is 48 bits in size, so we only need three 1498 // instructions to create a patchable instruction sequence that can 1499 // reach anywhere. 1500 void MacroAssembler::movptr(Register r, uintptr_t imm64) { 1501 #ifndef PRODUCT 1502 { 1503 char buffer[64]; 1504 snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64); 1505 block_comment(buffer); 1506 } 1507 #endif 1508 assert(imm64 < (1ul << 48), "48-bit overflow in address constant"); 1509 movz(r, imm64 & 0xffff); 1510 imm64 >>= 16; 1511 movk(r, imm64 & 0xffff, 16); 1512 imm64 >>= 16; 1513 movk(r, imm64 & 0xffff, 32); 1514 } 1515 1516 // Macro to mov replicated immediate to vector register. 
1517 // Vd will get the following values for different arrangements in T 1518 // imm32 == hex 000000gh T8B: Vd = ghghghghghghghgh 1519 // imm32 == hex 000000gh T16B: Vd = ghghghghghghghghghghghghghghghgh 1520 // imm32 == hex 0000efgh T4H: Vd = efghefghefghefgh 1521 // imm32 == hex 0000efgh T8H: Vd = efghefghefghefghefghefghefghefgh 1522 // imm32 == hex abcdefgh T2S: Vd = abcdefghabcdefgh 1523 // imm32 == hex abcdefgh T4S: Vd = abcdefghabcdefghabcdefghabcdefgh 1524 // T1D/T2D: invalid 1525 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) { 1526 assert(T != T1D && T != T2D, "invalid arrangement"); 1527 if (T == T8B || T == T16B) { 1528 assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)"); 1529 movi(Vd, T, imm32 & 0xff, 0); 1530 return; 1531 } 1532 u_int32_t nimm32 = ~imm32; 1533 if (T == T4H || T == T8H) { 1534 assert((imm32 & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)"); 1535 imm32 &= 0xffff; 1536 nimm32 &= 0xffff; 1537 } 1538 u_int32_t x = imm32; 1539 int movi_cnt = 0; 1540 int movn_cnt = 0; 1541 while (x) { if (x & 0xff) movi_cnt++; x >>= 8; } 1542 x = nimm32; 1543 while (x) { if (x & 0xff) movn_cnt++; x >>= 8; } 1544 if (movn_cnt < movi_cnt) imm32 = nimm32; 1545 unsigned lsl = 0; 1546 while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; } 1547 if (movn_cnt < movi_cnt) 1548 mvni(Vd, T, imm32 & 0xff, lsl); 1549 else 1550 movi(Vd, T, imm32 & 0xff, lsl); 1551 imm32 >>= 8; lsl += 8; 1552 while (imm32) { 1553 while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; } 1554 if (movn_cnt < movi_cnt) 1555 bici(Vd, T, imm32 & 0xff, lsl); 1556 else 1557 orri(Vd, T, imm32 & 0xff, lsl); 1558 lsl += 8; imm32 >>= 8; 1559 } 1560 } 1561 1562 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64) 1563 { 1564 #ifndef PRODUCT 1565 { 1566 char buffer[64]; 1567 snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64); 1568 block_comment(buffer); 1569 } 1570 #endif 1571 if 
(operand_valid_for_logical_immediate(false, imm64)) { 1572 orr(dst, zr, imm64); 1573 } else { 1574 // we can use a combination of MOVZ or MOVN with 1575 // MOVK to build up the constant 1576 u_int64_t imm_h[4]; 1577 int zero_count = 0; 1578 int neg_count = 0; 1579 int i; 1580 for (i = 0; i < 4; i++) { 1581 imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL); 1582 if (imm_h[i] == 0) { 1583 zero_count++; 1584 } else if (imm_h[i] == 0xffffL) { 1585 neg_count++; 1586 } 1587 } 1588 if (zero_count == 4) { 1589 // one MOVZ will do 1590 movz(dst, 0); 1591 } else if (neg_count == 4) { 1592 // one MOVN will do 1593 movn(dst, 0); 1594 } else if (zero_count == 3) { 1595 for (i = 0; i < 4; i++) { 1596 if (imm_h[i] != 0L) { 1597 movz(dst, (u_int32_t)imm_h[i], (i << 4)); 1598 break; 1599 } 1600 } 1601 } else if (neg_count == 3) { 1602 // one MOVN will do 1603 for (int i = 0; i < 4; i++) { 1604 if (imm_h[i] != 0xffffL) { 1605 movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4)); 1606 break; 1607 } 1608 } 1609 } else if (zero_count == 2) { 1610 // one MOVZ and one MOVK will do 1611 for (i = 0; i < 3; i++) { 1612 if (imm_h[i] != 0L) { 1613 movz(dst, (u_int32_t)imm_h[i], (i << 4)); 1614 i++; 1615 break; 1616 } 1617 } 1618 for (;i < 4; i++) { 1619 if (imm_h[i] != 0L) { 1620 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1621 } 1622 } 1623 } else if (neg_count == 2) { 1624 // one MOVN and one MOVK will do 1625 for (i = 0; i < 4; i++) { 1626 if (imm_h[i] != 0xffffL) { 1627 movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4)); 1628 i++; 1629 break; 1630 } 1631 } 1632 for (;i < 4; i++) { 1633 if (imm_h[i] != 0xffffL) { 1634 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1635 } 1636 } 1637 } else if (zero_count == 1) { 1638 // one MOVZ and two MOVKs will do 1639 for (i = 0; i < 4; i++) { 1640 if (imm_h[i] != 0L) { 1641 movz(dst, (u_int32_t)imm_h[i], (i << 4)); 1642 i++; 1643 break; 1644 } 1645 } 1646 for (;i < 4; i++) { 1647 if (imm_h[i] != 0x0L) { 1648 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1649 } 1650 } 
1651 } else if (neg_count == 1) { 1652 // one MOVN and two MOVKs will do 1653 for (i = 0; i < 4; i++) { 1654 if (imm_h[i] != 0xffffL) { 1655 movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4)); 1656 i++; 1657 break; 1658 } 1659 } 1660 for (;i < 4; i++) { 1661 if (imm_h[i] != 0xffffL) { 1662 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1663 } 1664 } 1665 } else { 1666 // use a MOVZ and 3 MOVKs (makes it easier to debug) 1667 movz(dst, (u_int32_t)imm_h[0], 0); 1668 for (i = 1; i < 4; i++) { 1669 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1670 } 1671 } 1672 } 1673 } 1674 1675 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32) 1676 { 1677 #ifndef PRODUCT 1678 { 1679 char buffer[64]; 1680 snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32); 1681 block_comment(buffer); 1682 } 1683 #endif 1684 if (operand_valid_for_logical_immediate(true, imm32)) { 1685 orrw(dst, zr, imm32); 1686 } else { 1687 // we can use MOVZ, MOVN or two calls to MOVK to build up the 1688 // constant 1689 u_int32_t imm_h[2]; 1690 imm_h[0] = imm32 & 0xffff; 1691 imm_h[1] = ((imm32 >> 16) & 0xffff); 1692 if (imm_h[0] == 0) { 1693 movzw(dst, imm_h[1], 16); 1694 } else if (imm_h[0] == 0xffff) { 1695 movnw(dst, imm_h[1] ^ 0xffff, 16); 1696 } else if (imm_h[1] == 0) { 1697 movzw(dst, imm_h[0], 0); 1698 } else if (imm_h[1] == 0xffff) { 1699 movnw(dst, imm_h[0] ^ 0xffff, 0); 1700 } else { 1701 // use a MOVZ and MOVK (makes it easier to debug) 1702 movzw(dst, imm_h[0], 0); 1703 movkw(dst, imm_h[1], 16); 1704 } 1705 } 1706 } 1707 1708 // Form an address from base + offset in Rd. Rd may or may 1709 // not actually be used: you must use the Address that is returned. 1710 // It is up to you to ensure that the shift provided matches the size 1711 // of your data. 
1712 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) { 1713 if (Address::offset_ok_for_immed(byte_offset, shift)) 1714 // It fits; no need for any heroics 1715 return Address(base, byte_offset); 1716 1717 // Don't do anything clever with negative or misaligned offsets 1718 unsigned mask = (1 << shift) - 1; 1719 if (byte_offset < 0 || byte_offset & mask) { 1720 mov(Rd, byte_offset); 1721 add(Rd, base, Rd); 1722 return Address(Rd); 1723 } 1724 1725 // See if we can do this with two 12-bit offsets 1726 { 1727 unsigned long word_offset = byte_offset >> shift; 1728 unsigned long masked_offset = word_offset & 0xfff000; 1729 if (Address::offset_ok_for_immed(word_offset - masked_offset) 1730 && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) { 1731 add(Rd, base, masked_offset << shift); 1732 word_offset -= masked_offset; 1733 return Address(Rd, word_offset << shift); 1734 } 1735 } 1736 1737 // Do it the hard way 1738 mov(Rd, byte_offset); 1739 add(Rd, base, Rd); 1740 return Address(Rd); 1741 } 1742 1743 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) { 1744 if (UseLSE) { 1745 mov(tmp, 1); 1746 ldadd(Assembler::word, tmp, zr, counter_addr); 1747 return; 1748 } 1749 Label retry_load; 1750 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) 1751 prfm(Address(counter_addr), PSTL1STRM); 1752 bind(retry_load); 1753 // flush and load exclusive from the memory location 1754 ldxrw(tmp, counter_addr); 1755 addw(tmp, tmp, 1); 1756 // if we store+flush with no intervening write tmp wil be zero 1757 stxrw(tmp2, tmp, counter_addr); 1758 cbnzw(tmp2, retry_load); 1759 } 1760 1761 1762 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb, 1763 bool want_remainder, Register scratch) 1764 { 1765 // Full implementation of Java idiv and irem. 
The function 1766 // returns the (pc) offset of the div instruction - may be needed 1767 // for implicit exceptions. 1768 // 1769 // constraint : ra/rb =/= scratch 1770 // normal case 1771 // 1772 // input : ra: dividend 1773 // rb: divisor 1774 // 1775 // result: either 1776 // quotient (= ra idiv rb) 1777 // remainder (= ra irem rb) 1778 1779 assert(ra != scratch && rb != scratch, "reg cannot be scratch"); 1780 1781 int idivl_offset = offset(); 1782 if (! want_remainder) { 1783 sdivw(result, ra, rb); 1784 } else { 1785 sdivw(scratch, ra, rb); 1786 Assembler::msubw(result, scratch, rb, ra); 1787 } 1788 1789 return idivl_offset; 1790 } 1791 1792 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb, 1793 bool want_remainder, Register scratch) 1794 { 1795 // Full implementation of Java ldiv and lrem. The function 1796 // returns the (pc) offset of the div instruction - may be needed 1797 // for implicit exceptions. 1798 // 1799 // constraint : ra/rb =/= scratch 1800 // normal case 1801 // 1802 // input : ra: dividend 1803 // rb: divisor 1804 // 1805 // result: either 1806 // quotient (= ra idiv rb) 1807 // remainder (= ra irem rb) 1808 1809 assert(ra != scratch && rb != scratch, "reg cannot be scratch"); 1810 1811 int idivq_offset = offset(); 1812 if (! want_remainder) { 1813 sdiv(result, ra, rb); 1814 } else { 1815 sdiv(scratch, ra, rb); 1816 Assembler::msub(result, scratch, rb, ra); 1817 } 1818 1819 return idivq_offset; 1820 } 1821 1822 void MacroAssembler::membar(Membar_mask_bits order_constraint) { 1823 address prev = pc() - NativeMembar::instruction_size; 1824 address last = code()->last_insn(); 1825 if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) { 1826 NativeMembar *bar = NativeMembar_at(prev); 1827 // We are merging two memory barrier instructions. On AArch64 we 1828 // can do this simply by ORing them together. 
1829 bar->set_kind(bar->get_kind() | order_constraint); 1830 BLOCK_COMMENT("merged membar"); 1831 } else { 1832 code()->set_last_insn(pc()); 1833 dmb(Assembler::barrier(order_constraint)); 1834 } 1835 } 1836 1837 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) { 1838 if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) { 1839 merge_ldst(rt, adr, size_in_bytes, is_store); 1840 code()->clear_last_insn(); 1841 return true; 1842 } else { 1843 assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported."); 1844 const unsigned mask = size_in_bytes - 1; 1845 if (adr.getMode() == Address::base_plus_offset && 1846 (adr.offset() & mask) == 0) { // only supports base_plus_offset. 1847 code()->set_last_insn(pc()); 1848 } 1849 return false; 1850 } 1851 } 1852 1853 void MacroAssembler::ldr(Register Rx, const Address &adr) { 1854 // We always try to merge two adjacent loads into one ldp. 1855 if (!try_merge_ldst(Rx, adr, 8, false)) { 1856 Assembler::ldr(Rx, adr); 1857 } 1858 } 1859 1860 void MacroAssembler::ldrw(Register Rw, const Address &adr) { 1861 // We always try to merge two adjacent loads into one ldp. 1862 if (!try_merge_ldst(Rw, adr, 4, false)) { 1863 Assembler::ldrw(Rw, adr); 1864 } 1865 } 1866 1867 void MacroAssembler::str(Register Rx, const Address &adr) { 1868 // We always try to merge two adjacent stores into one stp. 1869 if (!try_merge_ldst(Rx, adr, 8, true)) { 1870 Assembler::str(Rx, adr); 1871 } 1872 } 1873 1874 void MacroAssembler::strw(Register Rw, const Address &adr) { 1875 // We always try to merge two adjacent stores into one stp. 
1876 if (!try_merge_ldst(Rw, adr, 4, true)) { 1877 Assembler::strw(Rw, adr); 1878 } 1879 } 1880 1881 // MacroAssembler routines found actually to be needed 1882 1883 void MacroAssembler::push(Register src) 1884 { 1885 str(src, Address(pre(esp, -1 * wordSize))); 1886 } 1887 1888 void MacroAssembler::pop(Register dst) 1889 { 1890 ldr(dst, Address(post(esp, 1 * wordSize))); 1891 } 1892 1893 // Note: load_unsigned_short used to be called load_unsigned_word. 1894 int MacroAssembler::load_unsigned_short(Register dst, Address src) { 1895 int off = offset(); 1896 ldrh(dst, src); 1897 return off; 1898 } 1899 1900 int MacroAssembler::load_unsigned_byte(Register dst, Address src) { 1901 int off = offset(); 1902 ldrb(dst, src); 1903 return off; 1904 } 1905 1906 int MacroAssembler::load_signed_short(Register dst, Address src) { 1907 int off = offset(); 1908 ldrsh(dst, src); 1909 return off; 1910 } 1911 1912 int MacroAssembler::load_signed_byte(Register dst, Address src) { 1913 int off = offset(); 1914 ldrsb(dst, src); 1915 return off; 1916 } 1917 1918 int MacroAssembler::load_signed_short32(Register dst, Address src) { 1919 int off = offset(); 1920 ldrshw(dst, src); 1921 return off; 1922 } 1923 1924 int MacroAssembler::load_signed_byte32(Register dst, Address src) { 1925 int off = offset(); 1926 ldrsbw(dst, src); 1927 return off; 1928 } 1929 1930 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) { 1931 switch (size_in_bytes) { 1932 case 8: ldr(dst, src); break; 1933 case 4: ldrw(dst, src); break; 1934 case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break; 1935 case 1: is_signed ? 
load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break; 1936 default: ShouldNotReachHere(); 1937 } 1938 } 1939 1940 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) { 1941 switch (size_in_bytes) { 1942 case 8: str(src, dst); break; 1943 case 4: strw(src, dst); break; 1944 case 2: strh(src, dst); break; 1945 case 1: strb(src, dst); break; 1946 default: ShouldNotReachHere(); 1947 } 1948 } 1949 1950 void MacroAssembler::decrementw(Register reg, int value) 1951 { 1952 if (value < 0) { incrementw(reg, -value); return; } 1953 if (value == 0) { return; } 1954 if (value < (1 << 12)) { subw(reg, reg, value); return; } 1955 /* else */ { 1956 guarantee(reg != rscratch2, "invalid dst for register decrement"); 1957 movw(rscratch2, (unsigned)value); 1958 subw(reg, reg, rscratch2); 1959 } 1960 } 1961 1962 void MacroAssembler::decrement(Register reg, int value) 1963 { 1964 if (value < 0) { increment(reg, -value); return; } 1965 if (value == 0) { return; } 1966 if (value < (1 << 12)) { sub(reg, reg, value); return; } 1967 /* else */ { 1968 assert(reg != rscratch2, "invalid dst for register decrement"); 1969 mov(rscratch2, (unsigned long)value); 1970 sub(reg, reg, rscratch2); 1971 } 1972 } 1973 1974 void MacroAssembler::decrementw(Address dst, int value) 1975 { 1976 assert(!dst.uses(rscratch1), "invalid dst for address decrement"); 1977 if (dst.getMode() == Address::literal) { 1978 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 1979 lea(rscratch2, dst); 1980 dst = Address(rscratch2); 1981 } 1982 ldrw(rscratch1, dst); 1983 decrementw(rscratch1, value); 1984 strw(rscratch1, dst); 1985 } 1986 1987 void MacroAssembler::decrement(Address dst, int value) 1988 { 1989 assert(!dst.uses(rscratch1), "invalid address for decrement"); 1990 if (dst.getMode() == Address::literal) { 1991 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 1992 lea(rscratch2, dst); 1993 dst = 
Address(rscratch2); 1994 } 1995 ldr(rscratch1, dst); 1996 decrement(rscratch1, value); 1997 str(rscratch1, dst); 1998 } 1999 2000 void MacroAssembler::incrementw(Register reg, int value) 2001 { 2002 if (value < 0) { decrementw(reg, -value); return; } 2003 if (value == 0) { return; } 2004 if (value < (1 << 12)) { addw(reg, reg, value); return; } 2005 /* else */ { 2006 assert(reg != rscratch2, "invalid dst for register increment"); 2007 movw(rscratch2, (unsigned)value); 2008 addw(reg, reg, rscratch2); 2009 } 2010 } 2011 2012 void MacroAssembler::increment(Register reg, int value) 2013 { 2014 if (value < 0) { decrement(reg, -value); return; } 2015 if (value == 0) { return; } 2016 if (value < (1 << 12)) { add(reg, reg, value); return; } 2017 /* else */ { 2018 assert(reg != rscratch2, "invalid dst for register increment"); 2019 movw(rscratch2, (unsigned)value); 2020 add(reg, reg, rscratch2); 2021 } 2022 } 2023 2024 void MacroAssembler::incrementw(Address dst, int value) 2025 { 2026 assert(!dst.uses(rscratch1), "invalid dst for address increment"); 2027 if (dst.getMode() == Address::literal) { 2028 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 2029 lea(rscratch2, dst); 2030 dst = Address(rscratch2); 2031 } 2032 ldrw(rscratch1, dst); 2033 incrementw(rscratch1, value); 2034 strw(rscratch1, dst); 2035 } 2036 2037 void MacroAssembler::increment(Address dst, int value) 2038 { 2039 assert(!dst.uses(rscratch1), "invalid dst for address increment"); 2040 if (dst.getMode() == Address::literal) { 2041 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 2042 lea(rscratch2, dst); 2043 dst = Address(rscratch2); 2044 } 2045 ldr(rscratch1, dst); 2046 increment(rscratch1, value); 2047 str(rscratch1, dst); 2048 } 2049 2050 2051 void MacroAssembler::pusha() { 2052 push(0x7fffffff, sp); 2053 } 2054 2055 void MacroAssembler::popa() { 2056 pop(0x7fffffff, sp); 2057 } 2058 2059 // Push lots of registers in the bit set supplied. 
Don't push sp. 2060 // Return the number of words pushed 2061 int MacroAssembler::push(unsigned int bitset, Register stack) { 2062 int words_pushed = 0; 2063 2064 // Scan bitset to accumulate register pairs 2065 unsigned char regs[32]; 2066 int count = 0; 2067 for (int reg = 0; reg <= 30; reg++) { 2068 if (1 & bitset) 2069 regs[count++] = reg; 2070 bitset >>= 1; 2071 } 2072 regs[count++] = zr->encoding_nocheck(); 2073 count &= ~1; // Only push an even nuber of regs 2074 2075 if (count) { 2076 stp(as_Register(regs[0]), as_Register(regs[1]), 2077 Address(pre(stack, -count * wordSize))); 2078 words_pushed += 2; 2079 } 2080 for (int i = 2; i < count; i += 2) { 2081 stp(as_Register(regs[i]), as_Register(regs[i+1]), 2082 Address(stack, i * wordSize)); 2083 words_pushed += 2; 2084 } 2085 2086 assert(words_pushed == count, "oops, pushed != count"); 2087 2088 return count; 2089 } 2090 2091 int MacroAssembler::pop(unsigned int bitset, Register stack) { 2092 int words_pushed = 0; 2093 2094 // Scan bitset to accumulate register pairs 2095 unsigned char regs[32]; 2096 int count = 0; 2097 for (int reg = 0; reg <= 30; reg++) { 2098 if (1 & bitset) 2099 regs[count++] = reg; 2100 bitset >>= 1; 2101 } 2102 regs[count++] = zr->encoding_nocheck(); 2103 count &= ~1; 2104 2105 for (int i = 2; i < count; i += 2) { 2106 ldp(as_Register(regs[i]), as_Register(regs[i+1]), 2107 Address(stack, i * wordSize)); 2108 words_pushed += 2; 2109 } 2110 if (count) { 2111 ldp(as_Register(regs[0]), as_Register(regs[1]), 2112 Address(post(stack, count * wordSize))); 2113 words_pushed += 2; 2114 } 2115 2116 assert(words_pushed == count, "oops, pushed != count"); 2117 2118 return count; 2119 } 2120 #ifdef ASSERT 2121 void MacroAssembler::verify_heapbase(const char* msg) { 2122 #if 0 2123 assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed"); 2124 assert (Universe::heap() != NULL, "java heap should be initialized"); 2125 if (CheckCompressedOops) { 2126 Label ok; 2127 push(1 << 
rscratch1->encoding(), sp); // cmpptr trashes rscratch1 2128 cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr())); 2129 br(Assembler::EQ, ok); 2130 stop(msg); 2131 bind(ok); 2132 pop(1 << rscratch1->encoding(), sp); 2133 } 2134 #endif 2135 } 2136 #endif 2137 2138 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) { 2139 Label done, not_weak; 2140 cbz(value, done); // Use NULL as-is. 2141 2142 STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u); 2143 tbz(r0, 0, not_weak); // Test for jweak tag. 2144 2145 // Resolve jweak. 2146 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value, 2147 Address(value, -JNIHandles::weak_tag_value), tmp, thread); 2148 verify_oop(value); 2149 b(done); 2150 2151 bind(not_weak); 2152 // Resolve (untagged) jobject. 2153 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread); 2154 verify_oop(value); 2155 bind(done); 2156 } 2157 2158 void MacroAssembler::stop(const char* msg) { 2159 address ip = pc(); 2160 pusha(); 2161 mov(c_rarg0, (address)msg); 2162 mov(c_rarg1, (address)ip); 2163 mov(c_rarg2, sp); 2164 mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 2165 // call(c_rarg3); 2166 blrt(c_rarg3, 3, 0, 1); 2167 hlt(0); 2168 } 2169 2170 void MacroAssembler::unimplemented(const char* what) { 2171 const char* buf = NULL; 2172 { 2173 ResourceMark rm; 2174 stringStream ss; 2175 ss.print("unimplemented: %s", what); 2176 buf = code_string(ss.as_string()); 2177 } 2178 stop(buf); 2179 } 2180 2181 // If a constant does not fit in an immediate field, generate some 2182 // number of MOV instructions and then perform the operation. 
2183 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm, 2184 add_sub_imm_insn insn1, 2185 add_sub_reg_insn insn2) { 2186 assert(Rd != zr, "Rd = zr and not setting flags?"); 2187 if (operand_valid_for_add_sub_immediate((int)imm)) { 2188 (this->*insn1)(Rd, Rn, imm); 2189 } else { 2190 if (uabs(imm) < (1 << 24)) { 2191 (this->*insn1)(Rd, Rn, imm & -(1 << 12)); 2192 (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1)); 2193 } else { 2194 assert_different_registers(Rd, Rn); 2195 mov(Rd, (uint64_t)imm); 2196 (this->*insn2)(Rd, Rn, Rd, LSL, 0); 2197 } 2198 } 2199 } 2200 2201 // Seperate vsn which sets the flags. Optimisations are more restricted 2202 // because we must set the flags correctly. 2203 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm, 2204 add_sub_imm_insn insn1, 2205 add_sub_reg_insn insn2) { 2206 if (operand_valid_for_add_sub_immediate((int)imm)) { 2207 (this->*insn1)(Rd, Rn, imm); 2208 } else { 2209 assert_different_registers(Rd, Rn); 2210 assert(Rd != zr, "overflow in immediate operand"); 2211 mov(Rd, (uint64_t)imm); 2212 (this->*insn2)(Rd, Rn, Rd, LSL, 0); 2213 } 2214 } 2215 2216 2217 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) { 2218 if (increment.is_register()) { 2219 add(Rd, Rn, increment.as_register()); 2220 } else { 2221 add(Rd, Rn, increment.as_constant()); 2222 } 2223 } 2224 2225 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) { 2226 if (increment.is_register()) { 2227 addw(Rd, Rn, increment.as_register()); 2228 } else { 2229 addw(Rd, Rn, increment.as_constant()); 2230 } 2231 } 2232 2233 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) { 2234 if (decrement.is_register()) { 2235 sub(Rd, Rn, decrement.as_register()); 2236 } else { 2237 sub(Rd, Rn, decrement.as_constant()); 2238 } 2239 } 2240 2241 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) { 2242 if 
(decrement.is_register()) { 2243 subw(Rd, Rn, decrement.as_register()); 2244 } else { 2245 subw(Rd, Rn, decrement.as_constant()); 2246 } 2247 } 2248 2249 void MacroAssembler::reinit_heapbase() 2250 { 2251 if (UseCompressedOops) { 2252 if (Universe::is_fully_initialized()) { 2253 mov(rheapbase, Universe::narrow_ptrs_base()); 2254 } else { 2255 lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr())); 2256 ldr(rheapbase, Address(rheapbase)); 2257 } 2258 } 2259 } 2260 2261 // this simulates the behaviour of the x86 cmpxchg instruction using a 2262 // load linked/store conditional pair. we use the acquire/release 2263 // versions of these instructions so that we flush pending writes as 2264 // per Java semantics. 2265 2266 // n.b the x86 version assumes the old value to be compared against is 2267 // in rax and updates rax with the value located in memory if the 2268 // cmpxchg fails. we supply a register for the old value explicitly 2269 2270 // the aarch64 load linked/store conditional instructions do not 2271 // accept an offset. so, unlike x86, we must provide a plain register 2272 // to identify the memory word to be compared/exchanged rather than a 2273 // register+offset Address. 
2274 2275 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp, 2276 Label &succeed, Label *fail) { 2277 // oldv holds comparison value 2278 // newv holds value to write in exchange 2279 // addr identifies memory word to compare against/update 2280 if (UseLSE) { 2281 mov(tmp, oldv); 2282 casal(Assembler::xword, oldv, newv, addr); 2283 cmp(tmp, oldv); 2284 br(Assembler::EQ, succeed); 2285 membar(AnyAny); 2286 } else { 2287 Label retry_load, nope; 2288 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) 2289 prfm(Address(addr), PSTL1STRM); 2290 bind(retry_load); 2291 // flush and load exclusive from the memory location 2292 // and fail if it is not what we expect 2293 ldaxr(tmp, addr); 2294 cmp(tmp, oldv); 2295 br(Assembler::NE, nope); 2296 // if we store+flush with no intervening write tmp wil be zero 2297 stlxr(tmp, newv, addr); 2298 cbzw(tmp, succeed); 2299 // retry so we only ever return after a load fails to compare 2300 // ensures we don't return a stale value after a failed write. 
2301 b(retry_load); 2302 // if the memory word differs we return it in oldv and signal a fail 2303 bind(nope); 2304 membar(AnyAny); 2305 mov(oldv, tmp); 2306 } 2307 if (fail) 2308 b(*fail); 2309 } 2310 2311 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp, 2312 Label &succeed, Label *fail) { 2313 assert(oopDesc::mark_offset_in_bytes() == 0, "assumption"); 2314 cmpxchgptr(oldv, newv, obj, tmp, succeed, fail); 2315 } 2316 2317 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp, 2318 Label &succeed, Label *fail) { 2319 // oldv holds comparison value 2320 // newv holds value to write in exchange 2321 // addr identifies memory word to compare against/update 2322 // tmp returns 0/1 for success/failure 2323 if (UseLSE) { 2324 mov(tmp, oldv); 2325 casal(Assembler::word, oldv, newv, addr); 2326 cmp(tmp, oldv); 2327 br(Assembler::EQ, succeed); 2328 membar(AnyAny); 2329 } else { 2330 Label retry_load, nope; 2331 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) 2332 prfm(Address(addr), PSTL1STRM); 2333 bind(retry_load); 2334 // flush and load exclusive from the memory location 2335 // and fail if it is not what we expect 2336 ldaxrw(tmp, addr); 2337 cmp(tmp, oldv); 2338 br(Assembler::NE, nope); 2339 // if we store+flush with no intervening write tmp wil be zero 2340 stlxrw(tmp, newv, addr); 2341 cbzw(tmp, succeed); 2342 // retry so we only ever return after a load fails to compare 2343 // ensures we don't return a stale value after a failed write. 2344 b(retry_load); 2345 // if the memory word differs we return it in oldv and signal a fail 2346 bind(nope); 2347 membar(AnyAny); 2348 mov(oldv, tmp); 2349 } 2350 if (fail) 2351 b(*fail); 2352 } 2353 2354 // A generic CAS; success or failure is in the EQ flag. A weak CAS 2355 // doesn't retry and may fail spuriously. If the oldval is wanted, 2356 // Pass a register for the result, otherwise pass noreg. 
2357 2358 // Clobbers rscratch1 2359 void MacroAssembler::cmpxchg(Register addr, Register expected, 2360 Register new_val, 2361 enum operand_size size, 2362 bool acquire, bool release, 2363 bool weak, 2364 Register result) { 2365 if (result == noreg) result = rscratch1; 2366 BLOCK_COMMENT("cmpxchg {"); 2367 if (UseLSE) { 2368 mov(result, expected); 2369 lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true); 2370 compare_eq(result, expected, size); 2371 } else { 2372 Label retry_load, done; 2373 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) 2374 prfm(Address(addr), PSTL1STRM); 2375 bind(retry_load); 2376 load_exclusive(result, addr, size, acquire); 2377 compare_eq(result, expected, size); 2378 br(Assembler::NE, done); 2379 store_exclusive(rscratch1, new_val, addr, size, release); 2380 if (weak) { 2381 cmpw(rscratch1, 0u); // If the store fails, return NE to our caller. 2382 } else { 2383 cbnzw(rscratch1, retry_load); 2384 } 2385 bind(done); 2386 } 2387 BLOCK_COMMENT("} cmpxchg"); 2388 } 2389 2390 // A generic comparison. Only compares for equality, clobbers rscratch1. 2391 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) { 2392 if (size == xword) { 2393 cmp(rm, rn); 2394 } else if (size == word) { 2395 cmpw(rm, rn); 2396 } else if (size == halfword) { 2397 eorw(rscratch1, rm, rn); 2398 ands(zr, rscratch1, 0xffff); 2399 } else if (size == byte) { 2400 eorw(rscratch1, rm, rn); 2401 ands(zr, rscratch1, 0xff); 2402 } else { 2403 ShouldNotReachHere(); 2404 } 2405 } 2406 2407 2408 static bool different(Register a, RegisterOrConstant b, Register c) { 2409 if (b.is_constant()) 2410 return a != c; 2411 else 2412 return a != b.as_register() && a != c && b.as_register() != c; 2413 } 2414 2415 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz) \ 2416 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \ 2417 if (UseLSE) { \ 2418 prev = prev->is_valid() ? 
prev : zr; \ 2419 if (incr.is_register()) { \ 2420 AOP(sz, incr.as_register(), prev, addr); \ 2421 } else { \ 2422 mov(rscratch2, incr.as_constant()); \ 2423 AOP(sz, rscratch2, prev, addr); \ 2424 } \ 2425 return; \ 2426 } \ 2427 Register result = rscratch2; \ 2428 if (prev->is_valid()) \ 2429 result = different(prev, incr, addr) ? prev : rscratch2; \ 2430 \ 2431 Label retry_load; \ 2432 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) \ 2433 prfm(Address(addr), PSTL1STRM); \ 2434 bind(retry_load); \ 2435 LDXR(result, addr); \ 2436 OP(rscratch1, result, incr); \ 2437 STXR(rscratch2, rscratch1, addr); \ 2438 cbnzw(rscratch2, retry_load); \ 2439 if (prev->is_valid() && prev != result) { \ 2440 IOP(prev, rscratch1, incr); \ 2441 } \ 2442 } 2443 2444 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword) 2445 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word) 2446 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword) 2447 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word) 2448 2449 #undef ATOMIC_OP 2450 2451 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz) \ 2452 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \ 2453 if (UseLSE) { \ 2454 prev = prev->is_valid() ? prev : zr; \ 2455 AOP(sz, newv, prev, addr); \ 2456 return; \ 2457 } \ 2458 Register result = rscratch2; \ 2459 if (prev->is_valid()) \ 2460 result = different(prev, newv, addr) ? 
prev : rscratch2; \ 2461 \ 2462 Label retry_load; \ 2463 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) \ 2464 prfm(Address(addr), PSTL1STRM); \ 2465 bind(retry_load); \ 2466 LDXR(result, addr); \ 2467 STXR(rscratch1, newv, addr); \ 2468 cbnzw(rscratch1, retry_load); \ 2469 if (prev->is_valid() && prev != result) \ 2470 mov(prev, result); \ 2471 } 2472 2473 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword) 2474 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word) 2475 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword) 2476 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word) 2477 2478 #undef ATOMIC_XCHG 2479 2480 #ifndef PRODUCT 2481 extern "C" void findpc(intptr_t x); 2482 #endif 2483 2484 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) 2485 { 2486 // In order to get locks to work, we need to fake a in_VM state 2487 if (ShowMessageBoxOnError ) { 2488 JavaThread* thread = JavaThread::current(); 2489 JavaThreadState saved_state = thread->thread_state(); 2490 thread->set_thread_state(_thread_in_vm); 2491 #ifndef PRODUCT 2492 if (CountBytecodes || TraceBytecodes || StopInterpreterAt) { 2493 ttyLocker ttyl; 2494 BytecodeCounter::print(); 2495 } 2496 #endif 2497 if (os::message_box(msg, "Execution stopped, print registers?")) { 2498 ttyLocker ttyl; 2499 tty->print_cr(" pc = 0x%016lx", pc); 2500 #ifndef PRODUCT 2501 tty->cr(); 2502 findpc(pc); 2503 tty->cr(); 2504 #endif 2505 tty->print_cr(" r0 = 0x%016lx", regs[0]); 2506 tty->print_cr(" r1 = 0x%016lx", regs[1]); 2507 tty->print_cr(" r2 = 0x%016lx", regs[2]); 2508 tty->print_cr(" r3 = 0x%016lx", regs[3]); 2509 tty->print_cr(" r4 = 0x%016lx", regs[4]); 2510 tty->print_cr(" r5 = 0x%016lx", regs[5]); 2511 tty->print_cr(" r6 = 0x%016lx", regs[6]); 2512 tty->print_cr(" r7 = 0x%016lx", regs[7]); 2513 tty->print_cr(" r8 = 0x%016lx", regs[8]); 2514 tty->print_cr(" r9 = 0x%016lx", regs[9]); 2515 tty->print_cr("r10 = 0x%016lx", regs[10]); 2516 tty->print_cr("r11 = 0x%016lx", 
regs[11]); 2517 tty->print_cr("r12 = 0x%016lx", regs[12]); 2518 tty->print_cr("r13 = 0x%016lx", regs[13]); 2519 tty->print_cr("r14 = 0x%016lx", regs[14]); 2520 tty->print_cr("r15 = 0x%016lx", regs[15]); 2521 tty->print_cr("r16 = 0x%016lx", regs[16]); 2522 tty->print_cr("r17 = 0x%016lx", regs[17]); 2523 tty->print_cr("r18 = 0x%016lx", regs[18]); 2524 tty->print_cr("r19 = 0x%016lx", regs[19]); 2525 tty->print_cr("r20 = 0x%016lx", regs[20]); 2526 tty->print_cr("r21 = 0x%016lx", regs[21]); 2527 tty->print_cr("r22 = 0x%016lx", regs[22]); 2528 tty->print_cr("r23 = 0x%016lx", regs[23]); 2529 tty->print_cr("r24 = 0x%016lx", regs[24]); 2530 tty->print_cr("r25 = 0x%016lx", regs[25]); 2531 tty->print_cr("r26 = 0x%016lx", regs[26]); 2532 tty->print_cr("r27 = 0x%016lx", regs[27]); 2533 tty->print_cr("r28 = 0x%016lx", regs[28]); 2534 tty->print_cr("r30 = 0x%016lx", regs[30]); 2535 tty->print_cr("r31 = 0x%016lx", regs[31]); 2536 BREAKPOINT; 2537 } 2538 ThreadStateTransition::transition(thread, _thread_in_vm, saved_state); 2539 } else { 2540 ttyLocker ttyl; 2541 ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", 2542 msg); 2543 assert(false, "DEBUG MESSAGE: %s", msg); 2544 } 2545 } 2546 2547 #ifdef BUILTIN_SIM 2548 // routine to generate an x86 prolog for a stub function which 2549 // bootstraps into the generated ARM code which directly follows the 2550 // stub 2551 // 2552 // the argument encodes the number of general and fp registers 2553 // passed by the caller and the callng convention (currently just 2554 // the number of general registers and assumes C argument passing) 2555 2556 extern "C" { 2557 int aarch64_stub_prolog_size(); 2558 void aarch64_stub_prolog(); 2559 void aarch64_prolog(); 2560 } 2561 2562 void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type, 2563 address *prolog_ptr) 2564 { 2565 int calltype = (((ret_type & 0x3) << 8) | 2566 ((fp_arg_count & 0xf) << 4) | 2567 (gp_arg_count & 0xf)); 2568 2569 // the 
addresses for the x86 to ARM entry code we need to use 2570 address start = pc(); 2571 // printf("start = %lx\n", start); 2572 int byteCount = aarch64_stub_prolog_size(); 2573 // printf("byteCount = %x\n", byteCount); 2574 int instructionCount = (byteCount + 3)/ 4; 2575 // printf("instructionCount = %x\n", instructionCount); 2576 for (int i = 0; i < instructionCount; i++) { 2577 nop(); 2578 } 2579 2580 memcpy(start, (void*)aarch64_stub_prolog, byteCount); 2581 2582 // write the address of the setup routine and the call format at the 2583 // end of into the copied code 2584 u_int64_t *patch_end = (u_int64_t *)(start + byteCount); 2585 if (prolog_ptr) 2586 patch_end[-2] = (u_int64_t)prolog_ptr; 2587 patch_end[-1] = calltype; 2588 } 2589 #endif 2590 2591 void MacroAssembler::push_call_clobbered_fp_registers() { 2592 int step = 4 * wordSize; 2593 sub(sp, sp, step); 2594 mov(rscratch1, -step); 2595 // Push v0-v7, v16-v31. 2596 for (int i = 31; i>= 4; i -= 4) { 2597 if (i <= v7->encoding() || i >= v16->encoding()) 2598 st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1), 2599 as_FloatRegister(i), T1D, Address(post(sp, rscratch1))); 2600 } 2601 st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2), 2602 as_FloatRegister(3), T1D, Address(sp)); 2603 } 2604 2605 void MacroAssembler::pop_call_clobbered_fp_registers() { 2606 for (int i = 0; i < 32; i += 4) { 2607 if (i <= v7->encoding() || i >= v16->encoding()) 2608 ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), 2609 as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize))); 2610 } 2611 } 2612 2613 void MacroAssembler::push_call_clobbered_registers() { 2614 push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp); 2615 push_call_clobbered_fp_registers(); 2616 } 2617 2618 void MacroAssembler::pop_call_clobbered_registers() { 2619 pop_call_clobbered_fp_registers(); 2620 pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp); 2621 } 2622 2623 void 
MacroAssembler::push_CPU_state(bool save_vectors) { 2624 int step = (save_vectors ? 8 : 4) * wordSize; 2625 push(0x3fffffff, sp); // integer registers except lr & sp 2626 mov(rscratch1, -step); 2627 sub(sp, sp, step); 2628 for (int i = 28; i >= 4; i -= 4) { 2629 st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), 2630 as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1))); 2631 } 2632 st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp); 2633 } 2634 2635 void MacroAssembler::pop_CPU_state(bool restore_vectors) { 2636 int step = (restore_vectors ? 8 : 4) * wordSize; 2637 for (int i = 0; i <= 28; i += 4) 2638 ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), 2639 as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step))); 2640 pop(0x3fffffff, sp); // integer registers except lr & sp 2641 } 2642 2643 /** 2644 * Helpers for multiply_to_len(). 2645 */ 2646 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo, 2647 Register src1, Register src2) { 2648 adds(dest_lo, dest_lo, src1); 2649 adc(dest_hi, dest_hi, zr); 2650 adds(dest_lo, dest_lo, src2); 2651 adc(final_dest_hi, dest_hi, zr); 2652 } 2653 2654 // Generate an address from (r + r1 extend offset). "size" is the 2655 // size of the operand. The result may be in rscratch2. 2656 Address MacroAssembler::offsetted_address(Register r, Register r1, 2657 Address::extend ext, int offset, int size) { 2658 if (offset || (ext.shift() % size != 0)) { 2659 lea(rscratch2, Address(r, r1, ext)); 2660 return Address(rscratch2, offset); 2661 } else { 2662 return Address(r, r1, ext); 2663 } 2664 } 2665 2666 Address MacroAssembler::spill_address(int size, int offset, Register tmp) 2667 { 2668 assert(offset >= 0, "spill to negative address?"); 2669 // Offset reachable ? 
2670 // Not aligned - 9 bits signed offset 2671 // Aligned - 12 bits unsigned offset shifted 2672 Register base = sp; 2673 if ((offset & (size-1)) && offset >= (1<<8)) { 2674 add(tmp, base, offset & ((1<<12)-1)); 2675 base = tmp; 2676 offset &= -1<<12; 2677 } 2678 2679 if (offset >= (1<<12) * size) { 2680 add(tmp, base, offset & (((1<<12)-1)<<12)); 2681 base = tmp; 2682 offset &= ~(((1<<12)-1)<<12); 2683 } 2684 2685 return Address(base, offset); 2686 } 2687 2688 // Checks whether offset is aligned. 2689 // Returns true if it is, else false. 2690 bool MacroAssembler::merge_alignment_check(Register base, 2691 size_t size, 2692 long cur_offset, 2693 long prev_offset) const { 2694 if (AvoidUnalignedAccesses) { 2695 if (base == sp) { 2696 // Checks whether low offset if aligned to pair of registers. 2697 long pair_mask = size * 2 - 1; 2698 long offset = prev_offset > cur_offset ? cur_offset : prev_offset; 2699 return (offset & pair_mask) == 0; 2700 } else { // If base is not sp, we can't guarantee the access is aligned. 2701 return false; 2702 } 2703 } else { 2704 long mask = size - 1; 2705 // Load/store pair instruction only supports element size aligned offset. 2706 return (cur_offset & mask) == 0 && (prev_offset & mask) == 0; 2707 } 2708 } 2709 2710 // Checks whether current and previous loads/stores can be merged. 2711 // Returns true if it can be merged, else false. 
2712 bool MacroAssembler::ldst_can_merge(Register rt, 2713 const Address &adr, 2714 size_t cur_size_in_bytes, 2715 bool is_store) const { 2716 address prev = pc() - NativeInstruction::instruction_size; 2717 address last = code()->last_insn(); 2718 2719 if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) { 2720 return false; 2721 } 2722 2723 if (adr.getMode() != Address::base_plus_offset || prev != last) { 2724 return false; 2725 } 2726 2727 NativeLdSt* prev_ldst = NativeLdSt_at(prev); 2728 size_t prev_size_in_bytes = prev_ldst->size_in_bytes(); 2729 2730 assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging."); 2731 assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging."); 2732 2733 if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) { 2734 return false; 2735 } 2736 2737 long max_offset = 63 * prev_size_in_bytes; 2738 long min_offset = -64 * prev_size_in_bytes; 2739 2740 assert(prev_ldst->is_not_pre_post_index(), "pre-index or post-index is not supported to be merged."); 2741 2742 // Only same base can be merged. 2743 if (adr.base() != prev_ldst->base()) { 2744 return false; 2745 } 2746 2747 long cur_offset = adr.offset(); 2748 long prev_offset = prev_ldst->offset(); 2749 size_t diff = abs(cur_offset - prev_offset); 2750 if (diff != prev_size_in_bytes) { 2751 return false; 2752 } 2753 2754 // Following cases can not be merged: 2755 // ldr x2, [x2, #8] 2756 // ldr x3, [x2, #16] 2757 // or: 2758 // ldr x2, [x3, #8] 2759 // ldr x2, [x3, #16] 2760 // If t1 and t2 is the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL. 2761 if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) { 2762 return false; 2763 } 2764 2765 long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset; 2766 // Offset range must be in ldp/stp instruction's range. 
2767 if (low_offset > max_offset || low_offset < min_offset) { 2768 return false; 2769 } 2770 2771 if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) { 2772 return true; 2773 } 2774 2775 return false; 2776 } 2777 2778 // Merge current load/store with previous load/store into ldp/stp. 2779 void MacroAssembler::merge_ldst(Register rt, 2780 const Address &adr, 2781 size_t cur_size_in_bytes, 2782 bool is_store) { 2783 2784 assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store) == true, "cur and prev must be able to be merged."); 2785 2786 Register rt_low, rt_high; 2787 address prev = pc() - NativeInstruction::instruction_size; 2788 NativeLdSt* prev_ldst = NativeLdSt_at(prev); 2789 2790 long offset; 2791 2792 if (adr.offset() < prev_ldst->offset()) { 2793 offset = adr.offset(); 2794 rt_low = rt; 2795 rt_high = prev_ldst->target(); 2796 } else { 2797 offset = prev_ldst->offset(); 2798 rt_low = prev_ldst->target(); 2799 rt_high = rt; 2800 } 2801 2802 Address adr_p = Address(prev_ldst->base(), offset); 2803 // Overwrite previous generated binary. 2804 code_section()->set_end(prev); 2805 2806 const int sz = prev_ldst->size_in_bytes(); 2807 assert(sz == 8 || sz == 4, "only supports 64/32bit merging."); 2808 if (!is_store) { 2809 BLOCK_COMMENT("merged ldr pair"); 2810 if (sz == 8) { 2811 ldp(rt_low, rt_high, adr_p); 2812 } else { 2813 ldpw(rt_low, rt_high, adr_p); 2814 } 2815 } else { 2816 BLOCK_COMMENT("merged str pair"); 2817 if (sz == 8) { 2818 stp(rt_low, rt_high, adr_p); 2819 } else { 2820 stpw(rt_low, rt_high, adr_p); 2821 } 2822 } 2823 } 2824 2825 /** 2826 * Multiply 64 bit by 64 bit first loop. 
*/
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
                                           Register y, Register y_idx, Register z,
                                           Register carry, Register product,
                                           Register idx, Register kdx) {
  //
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;
  //

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  // If only one 32-bit word of x is left, load it alone at L_one_x.
  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_one_x);

  lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
  ldr(x_xstart, Address(rscratch1));
  ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian

  bind(L_first_loop);
  // idx is decremented twice per iteration: the first check ends the loop,
  // the second detects a single remaining 32-bit word of y (L_one_y).
  subsw(idx, idx, 1);
  br(Assembler::MI, L_first_loop_exit);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_one_y);
  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(y_idx, Address(rscratch1));
  ror(y_idx, y_idx, 32); // convert big-endian to little-endian
  bind(L_multiply);

  // AArch64 has a multiply-accumulate instruction that we can't use
  // here because it has no way to process carries, so we have to use
  // separate add and adc instructions.  Bah.
  umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
  mul(product, x_xstart, y_idx);
  adds(product, product, carry);
  adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product

  subw(kdx, kdx, 2);
  ror(product, product, 32); // back to big-endian
  str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));

  b(L_first_loop);

  bind(L_one_y);
  ldrw(y_idx, Address(y,  0));
  b(L_multiply);

  bind(L_one_x);
  ldrw(x_xstart, Address(x,  0));
  b(L_first_loop);

  bind(L_first_loop_exit);
}

/**
 * Multiply 128 bit by 128. Unrolled inner loop.
 *
 */
void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
                                             Register carry, Register carry2,
                                             Register idx, Register jdx,
                                             Register yz_idx1, Register yz_idx2,
                                             Register tmp, Register tmp3, Register tmp4,
                                             Register tmp6, Register product_hi) {

  //   jlong carry, x[], y[], z[];
  //   int kdx = ystart+1;
  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
  //     jlong carry2  = (jlong)(tmp3 >>> 64);
  //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
  //     carry  = (jlong)(tmp4 >>> 64);
  //     z[kdx+idx+1] = (jlong)tmp3;
  //     z[kdx+idx] = (jlong)tmp4;
  //   }
  //   idx += 2;
  //   if (idx > 0) {
  //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
  //     z[kdx+idx] = (jlong)yz_idx1;
  //     carry  = (jlong)(yz_idx1 >>> 64);
  //   }
  //

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;

  // jdx = idx / 4: each unrolled iteration consumes four 32-bit words.
  lsrw(jdx, idx, 2);

  bind(L_third_loop);

  subsw(jdx, jdx, 1);
  br(Assembler::MI, L_third_loop_exit);
  subw(idx, idx, 4);

  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));

  ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));

  lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
  ror(yz_idx2, yz_idx2, 32);

  ldp(rscratch2, rscratch1, Address(tmp6, 0));

  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
  umulh(tmp4, product_hi, yz_idx1);

  ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
  ror(rscratch2, rscratch2, 32);

  mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
  umulh(carry2, product_hi, yz_idx2);

  // propagate sum of both multiplications into carry:tmp4:tmp3
  adds(tmp3, tmp3, carry);
  adc(tmp4, tmp4, zr);
  adds(tmp3, tmp3, rscratch1);
  adcs(tmp4, tmp4, tmp);
  adc(carry, carry2, zr);
  adds(tmp4, tmp4, rscratch2);
  adc(carry, carry, zr);

  ror(tmp3, tmp3, 32); // convert little-endian to big-endian
  ror(tmp4, tmp4, 32);
  stp(tmp4, tmp3, Address(tmp6, 0));

  b(L_third_loop);
  bind (L_third_loop_exit);

  // Handle the 0..3 words left over by the unrolled loop.
  andw (idx, idx, 0x3);
  cbz(idx, L_post_third_loop_done);

  Label L_check_1;
  subsw(idx, idx, 2);
  br(Assembler::MI, L_check_1);

  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(yz_idx1, Address(rscratch1, 0));
  ror(yz_idx1, yz_idx1, 32);
  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
  umulh(tmp4, product_hi, yz_idx1);
  lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
  ldr(yz_idx2, Address(rscratch1, 0));
  ror(yz_idx2, yz_idx2, 32);

  add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);

  ror(tmp3, tmp3, 32);
  str(tmp3, Address(rscratch1, 0));

  bind (L_check_1);

  andw (idx, idx, 0x1);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_post_third_loop_done);
  ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
  umulh(carry2, tmp4, product_hi);
  ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  add2_with_carry(carry2, tmp3, tmp4, carry);

  strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
  extr(carry, carry2, tmp3, 32);

  bind(L_post_third_loop_done);
}

/**
 * Code for BigInteger::multiplyToLen() intrinsic.
 *
 * r0: x
 * r1: xlen
 * r2: y
 * r3: ylen
 * r4: z
 * r5: zlen
 * r10: tmp1
 * r11: tmp2
 * r12: tmp3
 * r13: tmp4
 * r14: tmp5
 * r15: tmp6
 * r16: tmp7
 *
 */
void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6, Register product_hi) {

  assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product  = xlen;
  const Register x_xstart = zlen;  // reuse register

  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;
  //

  movw(idx, ylen);      // idx = ylen;
  movw(kdx, zlen);      // kdx = xlen+ylen;
  mov(carry, zr);       // carry = 0;

  Label L_done;

  movw(xstart, xlen);
  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_done);

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);

  Label L_second_loop;
  cbzw(kdx, L_second_loop);

  Label L_carry;
  subw(kdx, kdx, 1);
  cbzw(kdx, L_carry);

  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
  lsr(carry, carry, 32);
  subw(kdx, kdx, 1);

  bind(L_carry);
  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));

  // Second and third (nested) loops.
  //
  // for (int i = xstart-1; i >= 0; i--) { // Second loop
  //   carry = 0;
  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                    (z[k] & LONG_MASK) + carry;
  //     z[k] = (int)product;
  //     carry = product >>> 32;
  //   }
  //   z[i] = (int)carry;
  // }
  //
  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi

  const Register jdx = tmp1;

  bind(L_second_loop);
  mov(carry, zr);                // carry = 0;
  movw(jdx, ylen);               // j = ystart+1

  subsw(xstart, xstart, 1);      // i = xstart-1;
  br(Assembler::MI, L_done);

  // Pre-decrement sp by four words and save z in the first slot; ylen, x and
  // xstart are stored into the remaining slots below and all four are
  // reloaded by the two post-indexed ldp instructions after the inner loop.
  str(z, Address(pre(sp, -4 * wordSize)));

  Label L_last_x;
  lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
  subsw(xstart, xstart, 1);       // i = xstart-1;
  br(Assembler::MI, L_last_x);

  lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
  ldr(product_hi, Address(rscratch1));
  ror(product_hi, product_hi, 32);  // convert big-endian to little-endian

  Label L_third_loop_prologue;
  bind(L_third_loop_prologue);

  str(ylen, Address(sp, wordSize));
  stp(x, xstart, Address(sp, 2 * wordSize));
  multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
                          tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
  ldp(z, ylen, Address(post(sp, 2 * wordSize)));
  ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen

  addw(tmp3, xlen, 1);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  subsw(tmp3, tmp3, 1);
  br(Assembler::MI, L_done);

  lsr(carry, carry, 32);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  b(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  ldrw(product_hi, Address(x, 0));
  b(L_third_loop_prologue);

  bind(L_done);
}

// Code for BigInteger::mulAdd intrinsic
// out     = r0
// in      = r1
// offset  = r2  (already out.length-offset)
// len     = r3
// k       = r4
//
// pseudo code from java implementation:
// carry = 0;
// offset = out.length-offset - 1;
// for (int j=len-1; j >= 0; j--) {
//     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
//     out[offset--] = (int)product;
//     carry = product >>> 32;
// }
// return (int)carry;
//
// The carry is left in 'out' when the emitted code finishes; 'out' is set to
// zero when len is zero.
void MacroAssembler::mul_add(Register out, Register in, Register offset,
      Register len, Register k) {
    Label LOOP, END;
    // pre-loop
    cmp(len, zr); // cmp, not cbz/cbnz: to use condition twice => less branches
    csel(out, zr, out, Assembler::EQ);
    br(Assembler::EQ, END);
    add(in, in, len, LSL, 2); // in[j+1] address
    add(offset, out, offset, LSL, 2); // out[offset + 1] address
    mov(out, zr); // used to keep carry now
    BIND(LOOP);
    ldrw(rscratch1, Address(pre(in, -4)));
    madd(rscratch1, rscratch1, k, out);
    ldrw(rscratch2, Address(pre(offset, -4)));
    add(rscratch1, rscratch1, rscratch2);
    strw(rscratch1, Address(offset));
    lsr(out, rscratch1, 32);
    subs(len, len, 1);
    br(Assembler::NE, LOOP);
    BIND(END);
}

/**
 * Emits code to update CRC-32 with a byte value according to constants in table
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]val       Register containing the byte to fold into the CRC.
 * @param [in]table     Register containing the table of crc constants.
3181 * 3182 * uint32_t crc; 3183 * val = crc_table[(val ^ crc) & 0xFF]; 3184 * crc = val ^ (crc >> 8); 3185 * 3186 */ 3187 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 3188 eor(val, val, crc); 3189 andr(val, val, 0xff); 3190 ldrw(val, Address(table, val, Address::lsl(2))); 3191 eor(crc, val, crc, Assembler::LSR, 8); 3192 } 3193 3194 /** 3195 * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3 3196 * 3197 * @param [in,out]crc Register containing the crc. 3198 * @param [in]v Register containing the 32-bit to fold into the CRC. 3199 * @param [in]table0 Register containing table 0 of crc constants. 3200 * @param [in]table1 Register containing table 1 of crc constants. 3201 * @param [in]table2 Register containing table 2 of crc constants. 3202 * @param [in]table3 Register containing table 3 of crc constants. 3203 * 3204 * uint32_t crc; 3205 * v = crc ^ v 3206 * crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24] 3207 * 3208 */ 3209 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp, 3210 Register table0, Register table1, Register table2, Register table3, 3211 bool upper) { 3212 eor(v, crc, v, upper ? LSR:LSL, upper ? 
32:0); 3213 uxtb(tmp, v); 3214 ldrw(crc, Address(table3, tmp, Address::lsl(2))); 3215 ubfx(tmp, v, 8, 8); 3216 ldrw(tmp, Address(table2, tmp, Address::lsl(2))); 3217 eor(crc, crc, tmp); 3218 ubfx(tmp, v, 16, 8); 3219 ldrw(tmp, Address(table1, tmp, Address::lsl(2))); 3220 eor(crc, crc, tmp); 3221 ubfx(tmp, v, 24, 8); 3222 ldrw(tmp, Address(table0, tmp, Address::lsl(2))); 3223 eor(crc, crc, tmp); 3224 } 3225 3226 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf, 3227 Register len, Register tmp0, Register tmp1, Register tmp2, 3228 Register tmp3) { 3229 Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit; 3230 assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3); 3231 3232 mvnw(crc, crc); 3233 3234 subs(len, len, 128); 3235 br(Assembler::GE, CRC_by64_pre); 3236 BIND(CRC_less64); 3237 adds(len, len, 128-32); 3238 br(Assembler::GE, CRC_by32_loop); 3239 BIND(CRC_less32); 3240 adds(len, len, 32-4); 3241 br(Assembler::GE, CRC_by4_loop); 3242 adds(len, len, 4); 3243 br(Assembler::GT, CRC_by1_loop); 3244 b(L_exit); 3245 3246 BIND(CRC_by32_loop); 3247 ldp(tmp0, tmp1, Address(post(buf, 16))); 3248 subs(len, len, 32); 3249 crc32x(crc, crc, tmp0); 3250 ldr(tmp2, Address(post(buf, 8))); 3251 crc32x(crc, crc, tmp1); 3252 ldr(tmp3, Address(post(buf, 8))); 3253 crc32x(crc, crc, tmp2); 3254 crc32x(crc, crc, tmp3); 3255 br(Assembler::GE, CRC_by32_loop); 3256 cmn(len, 32); 3257 br(Assembler::NE, CRC_less32); 3258 b(L_exit); 3259 3260 BIND(CRC_by4_loop); 3261 ldrw(tmp0, Address(post(buf, 4))); 3262 subs(len, len, 4); 3263 crc32w(crc, crc, tmp0); 3264 br(Assembler::GE, CRC_by4_loop); 3265 adds(len, len, 4); 3266 br(Assembler::LE, L_exit); 3267 BIND(CRC_by1_loop); 3268 ldrb(tmp0, Address(post(buf, 1))); 3269 subs(len, len, 1); 3270 crc32b(crc, crc, tmp0); 3271 br(Assembler::GT, CRC_by1_loop); 3272 b(L_exit); 3273 3274 BIND(CRC_by64_pre); 3275 sub(buf, buf, 8); 3276 ldp(tmp0, tmp1, Address(buf, 
8)); 3277 crc32x(crc, crc, tmp0); 3278 ldr(tmp2, Address(buf, 24)); 3279 crc32x(crc, crc, tmp1); 3280 ldr(tmp3, Address(buf, 32)); 3281 crc32x(crc, crc, tmp2); 3282 ldr(tmp0, Address(buf, 40)); 3283 crc32x(crc, crc, tmp3); 3284 ldr(tmp1, Address(buf, 48)); 3285 crc32x(crc, crc, tmp0); 3286 ldr(tmp2, Address(buf, 56)); 3287 crc32x(crc, crc, tmp1); 3288 ldr(tmp3, Address(pre(buf, 64))); 3289 3290 b(CRC_by64_loop); 3291 3292 align(CodeEntryAlignment); 3293 BIND(CRC_by64_loop); 3294 subs(len, len, 64); 3295 crc32x(crc, crc, tmp2); 3296 ldr(tmp0, Address(buf, 8)); 3297 crc32x(crc, crc, tmp3); 3298 ldr(tmp1, Address(buf, 16)); 3299 crc32x(crc, crc, tmp0); 3300 ldr(tmp2, Address(buf, 24)); 3301 crc32x(crc, crc, tmp1); 3302 ldr(tmp3, Address(buf, 32)); 3303 crc32x(crc, crc, tmp2); 3304 ldr(tmp0, Address(buf, 40)); 3305 crc32x(crc, crc, tmp3); 3306 ldr(tmp1, Address(buf, 48)); 3307 crc32x(crc, crc, tmp0); 3308 ldr(tmp2, Address(buf, 56)); 3309 crc32x(crc, crc, tmp1); 3310 ldr(tmp3, Address(pre(buf, 64))); 3311 br(Assembler::GE, CRC_by64_loop); 3312 3313 // post-loop 3314 crc32x(crc, crc, tmp2); 3315 crc32x(crc, crc, tmp3); 3316 3317 sub(len, len, 64); 3318 add(buf, buf, 8); 3319 cmn(len, 128); 3320 br(Assembler::NE, CRC_less64); 3321 BIND(L_exit); 3322 mvnw(crc, crc); 3323 } 3324 3325 /** 3326 * @param crc register containing existing CRC (32-bit) 3327 * @param buf register pointing to input byte buffer (byte*) 3328 * @param len register containing number of bytes 3329 * @param table register that will contain address of CRC table 3330 * @param tmp scratch register 3331 */ 3332 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, 3333 Register table0, Register table1, Register table2, Register table3, 3334 Register tmp, Register tmp2, Register tmp3) { 3335 Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit; 3336 unsigned long offset; 3337 3338 if (UseCRC32) { 3339 kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, 
table3); 3340 return; 3341 } 3342 3343 mvnw(crc, crc); 3344 3345 adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset); 3346 if (offset) add(table0, table0, offset); 3347 add(table1, table0, 1*256*sizeof(juint)); 3348 add(table2, table0, 2*256*sizeof(juint)); 3349 add(table3, table0, 3*256*sizeof(juint)); 3350 3351 if (UseNeon) { 3352 cmp(len, (u1)64); 3353 br(Assembler::LT, L_by16); 3354 eor(v16, T16B, v16, v16); 3355 3356 Label L_fold; 3357 3358 add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants 3359 3360 ld1(v0, v1, T2D, post(buf, 32)); 3361 ld1r(v4, T2D, post(tmp, 8)); 3362 ld1r(v5, T2D, post(tmp, 8)); 3363 ld1r(v6, T2D, post(tmp, 8)); 3364 ld1r(v7, T2D, post(tmp, 8)); 3365 mov(v16, T4S, 0, crc); 3366 3367 eor(v0, T16B, v0, v16); 3368 sub(len, len, 64); 3369 3370 BIND(L_fold); 3371 pmull(v22, T8H, v0, v5, T8B); 3372 pmull(v20, T8H, v0, v7, T8B); 3373 pmull(v23, T8H, v0, v4, T8B); 3374 pmull(v21, T8H, v0, v6, T8B); 3375 3376 pmull2(v18, T8H, v0, v5, T16B); 3377 pmull2(v16, T8H, v0, v7, T16B); 3378 pmull2(v19, T8H, v0, v4, T16B); 3379 pmull2(v17, T8H, v0, v6, T16B); 3380 3381 uzp1(v24, T8H, v20, v22); 3382 uzp2(v25, T8H, v20, v22); 3383 eor(v20, T16B, v24, v25); 3384 3385 uzp1(v26, T8H, v16, v18); 3386 uzp2(v27, T8H, v16, v18); 3387 eor(v16, T16B, v26, v27); 3388 3389 ushll2(v22, T4S, v20, T8H, 8); 3390 ushll(v20, T4S, v20, T4H, 8); 3391 3392 ushll2(v18, T4S, v16, T8H, 8); 3393 ushll(v16, T4S, v16, T4H, 8); 3394 3395 eor(v22, T16B, v23, v22); 3396 eor(v18, T16B, v19, v18); 3397 eor(v20, T16B, v21, v20); 3398 eor(v16, T16B, v17, v16); 3399 3400 uzp1(v17, T2D, v16, v20); 3401 uzp2(v21, T2D, v16, v20); 3402 eor(v17, T16B, v17, v21); 3403 3404 ushll2(v20, T2D, v17, T4S, 16); 3405 ushll(v16, T2D, v17, T2S, 16); 3406 3407 eor(v20, T16B, v20, v22); 3408 eor(v16, T16B, v16, v18); 3409 3410 uzp1(v17, T2D, v20, v16); 3411 uzp2(v21, T2D, v20, v16); 3412 eor(v28, T16B, v17, v21); 3413 3414 pmull(v22, T8H, v1, v5, T8B); 3415 pmull(v20, T8H, 
v1, v7, T8B); 3416 pmull(v23, T8H, v1, v4, T8B); 3417 pmull(v21, T8H, v1, v6, T8B); 3418 3419 pmull2(v18, T8H, v1, v5, T16B); 3420 pmull2(v16, T8H, v1, v7, T16B); 3421 pmull2(v19, T8H, v1, v4, T16B); 3422 pmull2(v17, T8H, v1, v6, T16B); 3423 3424 ld1(v0, v1, T2D, post(buf, 32)); 3425 3426 uzp1(v24, T8H, v20, v22); 3427 uzp2(v25, T8H, v20, v22); 3428 eor(v20, T16B, v24, v25); 3429 3430 uzp1(v26, T8H, v16, v18); 3431 uzp2(v27, T8H, v16, v18); 3432 eor(v16, T16B, v26, v27); 3433 3434 ushll2(v22, T4S, v20, T8H, 8); 3435 ushll(v20, T4S, v20, T4H, 8); 3436 3437 ushll2(v18, T4S, v16, T8H, 8); 3438 ushll(v16, T4S, v16, T4H, 8); 3439 3440 eor(v22, T16B, v23, v22); 3441 eor(v18, T16B, v19, v18); 3442 eor(v20, T16B, v21, v20); 3443 eor(v16, T16B, v17, v16); 3444 3445 uzp1(v17, T2D, v16, v20); 3446 uzp2(v21, T2D, v16, v20); 3447 eor(v16, T16B, v17, v21); 3448 3449 ushll2(v20, T2D, v16, T4S, 16); 3450 ushll(v16, T2D, v16, T2S, 16); 3451 3452 eor(v20, T16B, v22, v20); 3453 eor(v16, T16B, v16, v18); 3454 3455 uzp1(v17, T2D, v20, v16); 3456 uzp2(v21, T2D, v20, v16); 3457 eor(v20, T16B, v17, v21); 3458 3459 shl(v16, T2D, v28, 1); 3460 shl(v17, T2D, v20, 1); 3461 3462 eor(v0, T16B, v0, v16); 3463 eor(v1, T16B, v1, v17); 3464 3465 subs(len, len, 32); 3466 br(Assembler::GE, L_fold); 3467 3468 mov(crc, 0); 3469 mov(tmp, v0, T1D, 0); 3470 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3471 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3472 mov(tmp, v0, T1D, 1); 3473 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3474 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3475 mov(tmp, v1, T1D, 0); 3476 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3477 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3478 mov(tmp, v1, T1D, 1); 3479 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3480 update_word_crc32(crc, tmp, 
tmp2, table0, table1, table2, table3, true); 3481 3482 add(len, len, 32); 3483 } 3484 3485 BIND(L_by16); 3486 subs(len, len, 16); 3487 br(Assembler::GE, L_by16_loop); 3488 adds(len, len, 16-4); 3489 br(Assembler::GE, L_by4_loop); 3490 adds(len, len, 4); 3491 br(Assembler::GT, L_by1_loop); 3492 b(L_exit); 3493 3494 BIND(L_by4_loop); 3495 ldrw(tmp, Address(post(buf, 4))); 3496 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3); 3497 subs(len, len, 4); 3498 br(Assembler::GE, L_by4_loop); 3499 adds(len, len, 4); 3500 br(Assembler::LE, L_exit); 3501 BIND(L_by1_loop); 3502 subs(len, len, 1); 3503 ldrb(tmp, Address(post(buf, 1))); 3504 update_byte_crc32(crc, tmp, table0); 3505 br(Assembler::GT, L_by1_loop); 3506 b(L_exit); 3507 3508 align(CodeEntryAlignment); 3509 BIND(L_by16_loop); 3510 subs(len, len, 16); 3511 ldp(tmp, tmp3, Address(post(buf, 16))); 3512 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3513 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3514 update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false); 3515 update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true); 3516 br(Assembler::GE, L_by16_loop); 3517 adds(len, len, 16-4); 3518 br(Assembler::GE, L_by4_loop); 3519 adds(len, len, 4); 3520 br(Assembler::GT, L_by1_loop); 3521 BIND(L_exit); 3522 mvnw(crc, crc); 3523 } 3524 3525 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf, 3526 Register len, Register tmp0, Register tmp1, Register tmp2, 3527 Register tmp3) { 3528 Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit; 3529 assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3); 3530 3531 subs(len, len, 128); 3532 br(Assembler::GE, CRC_by64_pre); 3533 BIND(CRC_less64); 3534 adds(len, len, 128-32); 3535 br(Assembler::GE, CRC_by32_loop); 3536 BIND(CRC_less32); 3537 adds(len, len, 32-4); 3538 br(Assembler::GE, 
CRC_by4_loop); 3539 adds(len, len, 4); 3540 br(Assembler::GT, CRC_by1_loop); 3541 b(L_exit); 3542 3543 BIND(CRC_by32_loop); 3544 ldp(tmp0, tmp1, Address(post(buf, 16))); 3545 subs(len, len, 32); 3546 crc32cx(crc, crc, tmp0); 3547 ldr(tmp2, Address(post(buf, 8))); 3548 crc32cx(crc, crc, tmp1); 3549 ldr(tmp3, Address(post(buf, 8))); 3550 crc32cx(crc, crc, tmp2); 3551 crc32cx(crc, crc, tmp3); 3552 br(Assembler::GE, CRC_by32_loop); 3553 cmn(len, 32); 3554 br(Assembler::NE, CRC_less32); 3555 b(L_exit); 3556 3557 BIND(CRC_by4_loop); 3558 ldrw(tmp0, Address(post(buf, 4))); 3559 subs(len, len, 4); 3560 crc32cw(crc, crc, tmp0); 3561 br(Assembler::GE, CRC_by4_loop); 3562 adds(len, len, 4); 3563 br(Assembler::LE, L_exit); 3564 BIND(CRC_by1_loop); 3565 ldrb(tmp0, Address(post(buf, 1))); 3566 subs(len, len, 1); 3567 crc32cb(crc, crc, tmp0); 3568 br(Assembler::GT, CRC_by1_loop); 3569 b(L_exit); 3570 3571 BIND(CRC_by64_pre); 3572 sub(buf, buf, 8); 3573 ldp(tmp0, tmp1, Address(buf, 8)); 3574 crc32cx(crc, crc, tmp0); 3575 ldr(tmp2, Address(buf, 24)); 3576 crc32cx(crc, crc, tmp1); 3577 ldr(tmp3, Address(buf, 32)); 3578 crc32cx(crc, crc, tmp2); 3579 ldr(tmp0, Address(buf, 40)); 3580 crc32cx(crc, crc, tmp3); 3581 ldr(tmp1, Address(buf, 48)); 3582 crc32cx(crc, crc, tmp0); 3583 ldr(tmp2, Address(buf, 56)); 3584 crc32cx(crc, crc, tmp1); 3585 ldr(tmp3, Address(pre(buf, 64))); 3586 3587 b(CRC_by64_loop); 3588 3589 align(CodeEntryAlignment); 3590 BIND(CRC_by64_loop); 3591 subs(len, len, 64); 3592 crc32cx(crc, crc, tmp2); 3593 ldr(tmp0, Address(buf, 8)); 3594 crc32cx(crc, crc, tmp3); 3595 ldr(tmp1, Address(buf, 16)); 3596 crc32cx(crc, crc, tmp0); 3597 ldr(tmp2, Address(buf, 24)); 3598 crc32cx(crc, crc, tmp1); 3599 ldr(tmp3, Address(buf, 32)); 3600 crc32cx(crc, crc, tmp2); 3601 ldr(tmp0, Address(buf, 40)); 3602 crc32cx(crc, crc, tmp3); 3603 ldr(tmp1, Address(buf, 48)); 3604 crc32cx(crc, crc, tmp0); 3605 ldr(tmp2, Address(buf, 56)); 3606 crc32cx(crc, crc, tmp1); 3607 ldr(tmp3, Address(pre(buf, 
64))); 3608 br(Assembler::GE, CRC_by64_loop); 3609 3610 // post-loop 3611 crc32cx(crc, crc, tmp2); 3612 crc32cx(crc, crc, tmp3); 3613 3614 sub(len, len, 64); 3615 add(buf, buf, 8); 3616 cmn(len, 128); 3617 br(Assembler::NE, CRC_less64); 3618 BIND(L_exit); 3619 } 3620 3621 /** 3622 * @param crc register containing existing CRC (32-bit) 3623 * @param buf register pointing to input byte buffer (byte*) 3624 * @param len register containing number of bytes 3625 * @param table register that will contain address of CRC table 3626 * @param tmp scratch register 3627 */ 3628 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len, 3629 Register table0, Register table1, Register table2, Register table3, 3630 Register tmp, Register tmp2, Register tmp3) { 3631 kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3); 3632 } 3633 3634 3635 SkipIfEqual::SkipIfEqual( 3636 MacroAssembler* masm, const bool* flag_addr, bool value) { 3637 _masm = masm; 3638 unsigned long offset; 3639 _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset); 3640 _masm->ldrb(rscratch1, Address(rscratch1, offset)); 3641 _masm->cbzw(rscratch1, _label); 3642 } 3643 3644 SkipIfEqual::~SkipIfEqual() { 3645 _masm->bind(_label); 3646 } 3647 3648 void MacroAssembler::addptr(const Address &dst, int32_t src) { 3649 Address adr; 3650 switch(dst.getMode()) { 3651 case Address::base_plus_offset: 3652 // This is the expected mode, although we allow all the other 3653 // forms below. 
3654 adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord); 3655 break; 3656 default: 3657 lea(rscratch2, dst); 3658 adr = Address(rscratch2); 3659 break; 3660 } 3661 ldr(rscratch1, adr); 3662 add(rscratch1, rscratch1, src); 3663 str(rscratch1, adr); 3664 } 3665 3666 void MacroAssembler::cmpptr(Register src1, Address src2) { 3667 unsigned long offset; 3668 adrp(rscratch1, src2, offset); 3669 ldr(rscratch1, Address(rscratch1, offset)); 3670 cmp(src1, rscratch1); 3671 } 3672 3673 void MacroAssembler::cmpoop(Register obj1, Register obj2) { 3674 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3675 bs->obj_equals(this, obj1, obj2); 3676 } 3677 3678 void MacroAssembler::load_klass(Register dst, Register src) { 3679 if (UseCompressedClassPointers) { 3680 ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes())); 3681 decode_klass_not_null(dst); 3682 } else { 3683 ldr(dst, Address(src, oopDesc::klass_offset_in_bytes())); 3684 } 3685 } 3686 3687 // ((OopHandle)result).resolve(); 3688 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) { 3689 // OopHandle::resolve is an indirection. 
3690 access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg); 3691 } 3692 3693 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) { 3694 const int mirror_offset = in_bytes(Klass::java_mirror_offset()); 3695 ldr(dst, Address(rmethod, Method::const_offset())); 3696 ldr(dst, Address(dst, ConstMethod::constants_offset())); 3697 ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes())); 3698 ldr(dst, Address(dst, mirror_offset)); 3699 resolve_oop_handle(dst, tmp); 3700 } 3701 3702 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) { 3703 if (UseCompressedClassPointers) { 3704 ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes())); 3705 if (Universe::narrow_klass_base() == NULL) { 3706 cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift()); 3707 return; 3708 } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0 3709 && Universe::narrow_klass_shift() == 0) { 3710 // Only the bottom 32 bits matter 3711 cmpw(trial_klass, tmp); 3712 return; 3713 } 3714 decode_klass_not_null(tmp); 3715 } else { 3716 ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes())); 3717 } 3718 cmp(trial_klass, tmp); 3719 } 3720 3721 void MacroAssembler::load_prototype_header(Register dst, Register src) { 3722 load_klass(dst, src); 3723 ldr(dst, Address(dst, Klass::prototype_header_offset())); 3724 } 3725 3726 void MacroAssembler::store_klass(Register dst, Register src) { 3727 // FIXME: Should this be a store release? concurrent gcs assumes 3728 // klass length is valid if klass field is not null. 
  if (UseCompressedClassPointers) {
    encode_klass_not_null(src);
    strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
  } else {
    str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
  }
}

// Store the 32-bit value in src into the klass-gap slot of the object at dst.
// Emits nothing unless UseCompressedClassPointers is set.
void MacroAssembler::store_klass_gap(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    // Store to klass gap in destination
    strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
  }
}

// Compress the (possibly null) oop in s into d.
// Algorithm must match CompressedOops::encode.
void MacroAssembler::encode_heap_oop(Register d, Register s) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
#endif
  verify_oop(s, "broken oop in encode_heap_oop");
  if (Universe::narrow_oop_base() == NULL) {
    // Zero-based heap: only the shift (if any) is needed.
    if (Universe::narrow_oop_shift() != 0) {
      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
      lsr(d, s, LogMinObjAlignmentInBytes);
    } else {
      mov(d, s);
    }
  } else {
    // d = s - rheapbase, setting flags; csel then keeps that difference only
    // when the HS condition holds and substitutes zr otherwise, so the
    // shifted result is 0 in that case.
    subs(d, s, rheapbase);
    csel(d, d, zr, Assembler::HS);
    lsr(d, d, LogMinObjAlignmentInBytes);

    /*  Old algorithm: is this any worse?
    Label nonnull;
    cbnz(r, nonnull);
    sub(r, r, rheapbase);
    bind(nonnull);
    lsr(r, r, LogMinObjAlignmentInBytes);
    */
  }
}

// Compress, in place, an oop in r that the caller guarantees is not null.
// In ASSERT builds with CheckCompressedOops, a null value stops the VM.
void MacroAssembler::encode_heap_oop_not_null(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    cbnz(r, ok);
    stop("null oop passed to encode_heap_oop_not_null");
    bind(ok);
  }
#endif
  verify_oop(r, "broken oop in encode_heap_oop_not_null");
  if (Universe::narrow_oop_base() != NULL) {
    sub(r, r, rheapbase);
  }
  if (Universe::narrow_oop_shift() != 0) {
    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    lsr(r, r, LogMinObjAlignmentInBytes);
  }
}

// Two-register form of encode_heap_oop_not_null: compress the non-null oop in
// src into dst.
void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    cbnz(src, ok);
    stop("null oop passed to encode_heap_oop_not_null2");
    bind(ok);
  }
#endif
  verify_oop(src, "broken oop in encode_heap_oop_not_null2");

  // 'data' tracks which register currently holds the value being encoded, so
  // the first emitted instruction reads src and any later one reads dst.
  Register data = src;
  if (Universe::narrow_oop_base() != NULL) {
    sub(dst, src, rheapbase);
    data = dst;
  }
  if (Universe::narrow_oop_shift() != 0) {
    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    lsr(dst, data, LogMinObjAlignmentInBytes);
    data = dst;
  }
  // Neither step above emitted anything: fall back to a plain copy.
  if (data == src)
    mov(dst, src);
}

// Decompress the (possibly null) narrow oop in s into d.
void MacroAssembler::decode_heap_oop(Register d, Register s) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
#endif
  if (Universe::narrow_oop_base() == NULL) {
    // Zero-based heap: a single shift both moves and decodes. Nothing is
    // emitted when the shift is 0 and d == s.
    if (Universe::narrow_oop_shift() != 0 || d != s) {
      lsl(d, s, Universe::narrow_oop_shift());
    }
  } else {
    Label done;
    if (d != s)
      mov(d, s);
    // A zero narrow oop skips the base addition and stays zero.
    cbz(s, done);
    add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
    bind(done);
  }
  verify_oop(d, "broken oop in decode_heap_oop");
}

// Decompress, in place, a narrow oop in r that is known not to be null.
void MacroAssembler::decode_heap_oop_not_null(Register r) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (Universe::narrow_oop_shift() != 0) {
    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    if (Universe::narrow_oop_base() != NULL) {
      add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
    } else {
      add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
    }
  } else {
    // No shift: nothing is emitted, and the base is asserted to be NULL.
    assert (Universe::narrow_oop_base() == NULL, "sanity");
  }
}

// Two-register form: decompress the non-null narrow oop in src into dst.
void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (Universe::narrow_oop_shift() != 0) {
    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    if (Universe::narrow_oop_base() != NULL) {
      add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
    } else {
      add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
    }
  } else {
    assert (Universe::narrow_oop_base() == NULL, "sanity");
    if (dst != src) {
      mov(dst, src);
    }
  }
}

// Compress the non-null Klass pointer in src into dst. Four strategies are
// tried in order: zero base, XOR with the base, "base has zero low 32 bits and
// no shift", and finally a general subtract of the materialised base.
void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
  if (Universe::narrow_klass_base() == NULL) {
    if (Universe::narrow_klass_shift() != 0) {
      assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
      lsr(dst, src, LogKlassAlignmentInBytes);
    } else {
      if (dst != src) mov(dst, src);
    }
    return;
  }

  if (use_XOR_for_compressed_class_base) {
    if (Universe::narrow_klass_shift() != 0) {
      eor(dst, src, (uint64_t)Universe::narrow_klass_base());
      lsr(dst, dst, LogKlassAlignmentInBytes);
    } else {
      eor(dst, src, (uint64_t)Universe::narrow_klass_base());
    }
    return;
  }

  // Base only occupies bits above 31 and there is no shift: the narrow value
  // is just the low 32 bits of the pointer.
  if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
      && Universe::narrow_klass_shift() == 0) {
    movw(dst, src);
    return;
  }

#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
#endif

  // General case: the base must be loaded into a register. When dst aliases
  // src there is no free register, so rheapbase is borrowed and restored via
  // reinit_heapbase() afterwards.
  Register rbase = dst;
  if (dst == src) rbase = rheapbase;
  mov(rbase, (uint64_t)Universe::narrow_klass_base());
  sub(dst, src, rbase);
  if (Universe::narrow_klass_shift() != 0) {
    assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
    lsr(dst, dst, LogKlassAlignmentInBytes);
  }
  if (dst == src) reinit_heapbase();
}

// In-place form of encode_klass_not_null.
void MacroAssembler::encode_klass_not_null(Register r) {
  encode_klass_not_null(r, r);
}

// Decompress the non-null narrow Klass value in src into dst. Mirrors the
// four strategies of encode_klass_not_null(dst, src).
void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
  Register rbase = dst;
  assert (UseCompressedClassPointers, "should only be used for compressed headers");

  if (Universe::narrow_klass_base() == NULL) {
    if (Universe::narrow_klass_shift() != 0) {
      assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
      lsl(dst, src, LogKlassAlignmentInBytes);
    } else {
      if (dst != src) mov(dst, src);
    }
    return;
  }

  if (use_XOR_for_compressed_class_base) {
    if (Universe::narrow_klass_shift() != 0) {
      lsl(dst, src, LogKlassAlignmentInBytes);
      eor(dst, dst, (uint64_t)Universe::narrow_klass_base());
    } else {
      eor(dst, src, (uint64_t)Universe::narrow_klass_base());
    }
    return;
  }

  // Base has zero low 32 bits and there is no shift: keep the low word and
  // insert (base >> 32) with a movk at bit position 32.
  if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
      && Universe::narrow_klass_shift() == 0) {
    if (dst != src)
      movw(dst, src);
    movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32);
    return;
  }

  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  // General case: borrow rheapbase for the base when dst aliases src, and
  // restore it with reinit_heapbase() at the end.
  if (dst == src) rbase = rheapbase;
  mov(rbase, (uint64_t)Universe::narrow_klass_base());
  if (Universe::narrow_klass_shift() != 0) {
    assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
    add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
  } else {
    add(dst, rbase, src);
  }
  if (dst == src) reinit_heapbase();
}

// In-place form of decode_klass_not_null.
void MacroAssembler::decode_klass_not_null(Register r) {
  decode_klass_not_null(r, r);
}

// Emit a movz/movk pair for a narrow oop constant, recorded with an oop
// relocation at the start of the pair.
void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
#ifdef ASSERT
  {
    ThreadInVMfromUnknown tiv;
    assert (UseCompressedOops, "should only be used for compressed oops");
    assert (Universe::heap() != NULL, "java heap should be initialized");
    assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
    assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
  }
#endif
  int oop_index = oop_recorder()->find_index(obj);
  InstructionMark im(this);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  code_section()->relocate(inst_mark(), rspec);
  // 0xDEADBEEF is a placeholder immediate.
  // NOTE(review): presumably patched later through the oop relocation - confirm.
  movz(dst, 0xDEAD, 16);
  movk(dst, 0xBEEF);
}

// Emit a movz/movk pair that loads the narrow encoding of Klass k into dst,
// recorded with a metadata relocation at the start of the pair.
void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int index = oop_recorder()->find_index(k);
  assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");

  InstructionMark im(this);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  code_section()->relocate(inst_mark(), rspec);
  narrowKlass nk = Klass::encode_klass(k);
  // Upper half-word first (movz clears the rest), then the lower half-word.
  movz(dst, (nk >> 16), 16);
  movk(dst, nk & 0xffff);
}

// Emit a load of 'type' from src into dst through the current barrier set's
// assembler. With AS_RAW set (after decorator fixup) the base-class
// BarrierSetAssembler::load_at is called directly, bypassing any override.
void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
                                    Register dst, Address src,
                                    Register tmp1, Register thread_tmp) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  decorators = AccessInternal::decorator_fixup(decorators);
  bool as_raw = (decorators & AS_RAW) != 0;
  if (as_raw) {
    bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  } else {
    bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  }
}

// Store counterpart of access_load_at: emit a store of 'type' from src to dst,
// using the base-class BarrierSetAssembler::store_at when AS_RAW is set.
void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
                                     Address dst, Register src,
                                     Register tmp1, Register thread_tmp) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  decorators = AccessInternal::decorator_fixup(decorators);
  bool as_raw = (decorators & AS_RAW) != 0;
  if (as_raw) {
    bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  } else {
    bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  }
}

// Forward to the barrier set assembler's resolve() for obj.
void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
  // Use stronger ACCESS_WRITE|ACCESS_READ by default.
  if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
    decorators |= ACCESS_READ | ACCESS_WRITE;
  }
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  return bs->resolve(this, decorators, obj);
}

// Load a T_OBJECT value from src into dst with IN_HEAP added to decorators.
void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
                                   Register thread_tmp, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
}

// As load_heap_oop, additionally passing IS_NOT_NULL.
void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
                                            Register thread_tmp, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
}

// Store the T_OBJECT value in src to dst with IN_HEAP added to decorators.
void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
                                    Register thread_tmp, DecoratorSet decorators) {
  access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
}

// Used for storing NULLs.
// Passes noreg as the source and for both temporaries.
void MacroAssembler::store_heap_oop_null(Address dst) {
  access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
}

// Allocate a metadata index for obj in the recorder and return an Address
// whose target is obj and whose relocation refers to that index.
Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->allocate_metadata_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return Address((address)obj, rspec);
}

// Move an oop into a register.  immediate is true if we want
// immediate instructions, i.e. we are not going to patch this
// instruction while the code is being executed by another thread.  In
// that case we can use move immediates rather than the constant pool.
4070 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) { 4071 int oop_index; 4072 if (obj == NULL) { 4073 oop_index = oop_recorder()->allocate_oop_index(obj); 4074 } else { 4075 #ifdef ASSERT 4076 { 4077 ThreadInVMfromUnknown tiv; 4078 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop"); 4079 } 4080 #endif 4081 oop_index = oop_recorder()->find_index(obj); 4082 } 4083 RelocationHolder rspec = oop_Relocation::spec(oop_index); 4084 if (! immediate) { 4085 address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address 4086 ldr_constant(dst, Address(dummy, rspec)); 4087 } else 4088 mov(dst, Address((address)obj, rspec)); 4089 } 4090 4091 // Move a metadata address into a register. 4092 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) { 4093 int oop_index; 4094 if (obj == NULL) { 4095 oop_index = oop_recorder()->allocate_metadata_index(obj); 4096 } else { 4097 oop_index = oop_recorder()->find_index(obj); 4098 } 4099 RelocationHolder rspec = metadata_Relocation::spec(oop_index); 4100 mov(dst, Address((address)obj, rspec)); 4101 } 4102 4103 Address MacroAssembler::constant_oop_address(jobject obj) { 4104 #ifdef ASSERT 4105 { 4106 ThreadInVMfromUnknown tiv; 4107 assert(oop_recorder() != NULL, "this assembler needs an OopRecorder"); 4108 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop"); 4109 } 4110 #endif 4111 int oop_index = oop_recorder()->find_index(obj); 4112 return Address((address)obj, oop_Relocation::spec(oop_index)); 4113 } 4114 4115 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes. 
4116 void MacroAssembler::tlab_allocate(Register obj, 4117 Register var_size_in_bytes, 4118 int con_size_in_bytes, 4119 Register t1, 4120 Register t2, 4121 Label& slow_case) { 4122 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 4123 bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case); 4124 } 4125 4126 // Defines obj, preserves var_size_in_bytes 4127 void MacroAssembler::eden_allocate(Register obj, 4128 Register var_size_in_bytes, 4129 int con_size_in_bytes, 4130 Register t1, 4131 Label& slow_case) { 4132 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 4133 bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case); 4134 } 4135 4136 // Zero words; len is in bytes 4137 // Destroys all registers except addr 4138 // len must be a nonzero multiple of wordSize 4139 void MacroAssembler::zero_memory(Register addr, Register len, Register t1) { 4140 assert_different_registers(addr, len, t1, rscratch1, rscratch2); 4141 4142 #ifdef ASSERT 4143 { Label L; 4144 tst(len, BytesPerWord - 1); 4145 br(Assembler::EQ, L); 4146 stop("len is not a multiple of BytesPerWord"); 4147 bind(L); 4148 } 4149 #endif 4150 4151 #ifndef PRODUCT 4152 block_comment("zero memory"); 4153 #endif 4154 4155 Label loop; 4156 Label entry; 4157 4158 // Algorithm: 4159 // 4160 // scratch1 = cnt & 7; 4161 // cnt -= scratch1; 4162 // p += scratch1; 4163 // switch (scratch1) { 4164 // do { 4165 // cnt -= 8; 4166 // p[-8] = 0; 4167 // case 7: 4168 // p[-7] = 0; 4169 // case 6: 4170 // p[-6] = 0; 4171 // // ... 
4172 // case 1: 4173 // p[-1] = 0; 4174 // case 0: 4175 // p += 8; 4176 // } while (cnt); 4177 // } 4178 4179 const int unroll = 8; // Number of str(zr) instructions we'll unroll 4180 4181 lsr(len, len, LogBytesPerWord); 4182 andr(rscratch1, len, unroll - 1); // tmp1 = cnt % unroll 4183 sub(len, len, rscratch1); // cnt -= unroll 4184 // t1 always points to the end of the region we're about to zero 4185 add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord); 4186 adr(rscratch2, entry); 4187 sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2); 4188 br(rscratch2); 4189 bind(loop); 4190 sub(len, len, unroll); 4191 for (int i = -unroll; i < 0; i++) 4192 Assembler::str(zr, Address(t1, i * wordSize)); 4193 bind(entry); 4194 add(t1, t1, unroll * wordSize); 4195 cbnz(len, loop); 4196 } 4197 4198 void MacroAssembler::verify_tlab() { 4199 #ifdef ASSERT 4200 if (UseTLAB && VerifyOops) { 4201 Label next, ok; 4202 4203 stp(rscratch2, rscratch1, Address(pre(sp, -16))); 4204 4205 ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset()))); 4206 ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset()))); 4207 cmp(rscratch2, rscratch1); 4208 br(Assembler::HS, next); 4209 STOP("assert(top >= start)"); 4210 should_not_reach_here(); 4211 4212 bind(next); 4213 ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset()))); 4214 ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset()))); 4215 cmp(rscratch2, rscratch1); 4216 br(Assembler::HS, ok); 4217 STOP("assert(top <= end)"); 4218 should_not_reach_here(); 4219 4220 bind(ok); 4221 ldp(rscratch2, rscratch1, Address(post(sp, 16))); 4222 } 4223 #endif 4224 } 4225 4226 // Writes to stack successive pages until offset reached to check for 4227 // stack overflow + shadow pages. This clobbers tmp. 
4228 void MacroAssembler::bang_stack_size(Register size, Register tmp) { 4229 assert_different_registers(tmp, size, rscratch1); 4230 mov(tmp, sp); 4231 // Bang stack for total size given plus shadow page size. 4232 // Bang one page at a time because large size can bang beyond yellow and 4233 // red zones. 4234 Label loop; 4235 mov(rscratch1, os::vm_page_size()); 4236 bind(loop); 4237 lea(tmp, Address(tmp, -os::vm_page_size())); 4238 subsw(size, size, rscratch1); 4239 str(size, Address(tmp)); 4240 br(Assembler::GT, loop); 4241 4242 // Bang down shadow pages too. 4243 // At this point, (tmp-0) is the last address touched, so don't 4244 // touch it again. (It was touched as (tmp-pagesize) but then tmp 4245 // was post-decremented.) Skip this address by starting at i=1, and 4246 // touch a few more pages below. N.B. It is important to touch all 4247 // the way down to and including i=StackShadowPages. 4248 for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) { 4249 // this could be any sized move but this is can be a debugging crumb 4250 // so the bigger the better. 4251 lea(tmp, Address(tmp, -os::vm_page_size())); 4252 str(size, Address(tmp)); 4253 } 4254 } 4255 4256 4257 // Move the address of the polling page into dest. 4258 void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) { 4259 if (SafepointMechanism::uses_thread_local_poll()) { 4260 ldr(dest, Address(rthread, Thread::polling_page_offset())); 4261 } else { 4262 unsigned long off; 4263 adrp(dest, Address(page, rtype), off); 4264 assert(off == 0, "polling page must be page aligned"); 4265 } 4266 } 4267 4268 // Move the address of the polling page into r, then read the polling 4269 // page. 4270 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) { 4271 get_polling_page(r, page, rtype); 4272 return read_polling_page(r, rtype); 4273 } 4274 4275 // Read the polling page. 
// The address of the polling page must
// already be in r.
// Emits a 32-bit load from [r] into zr, recorded with relocation type rtype,
// and returns inst_mark() taken while the InstructionMark is active.
address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
  InstructionMark im(this);
  code_section()->relocate(inst_mark(), rtype);
  ldrw(zr, Address(r, 0));
  return inst_mark();
}

// Load the 4K-page address of dest.target() into reg1 and hand back the low
// 12 bits of the target in byte_offset. dest must be a literal address.
void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
  // NOTE(review): rtype is not referenced again in this function.
  relocInfo::relocType rtype = dest.rspec().reloc()->type();
  // Page numbers (address >> 12) of the code cache bounds and of the target.
  unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
  unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
  unsigned long dest_page = (unsigned long)dest.target() >> 12;
  long offset_low = dest_page - low_page;
  long offset_high = dest_page - high_page;

  assert(is_valid_AArch64_address(dest.target()), "bad address");
  assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");

  InstructionMark im(this);
  code_section()->relocate(inst_mark(), dest.rspec());
  // 8143067: Ensure that the adrp can reach the dest from anywhere within
  // the code cache so that if it is relocated we know it will still reach
  if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
    _adrp(reg1, dest.target());
  } else {
    // Out of range: adrp to an address that has the target's low 32 bits but
    // bits 32..47 taken from the current pc, then overwrite bits 32..47 with
    // the target's using movk.
    unsigned long target = (unsigned long)dest.target();
    unsigned long adrp_target
      = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);

    _adrp(reg1, (address)adrp_target);
    movk(reg1, target >> 32, 32);
  }
  byte_offset = (unsigned long)dest.target() & 0xfff;
}

// Load the card table's byte_map_base into reg. The current barrier set is
// cast to CardTableBarrierSet without a check.
void MacroAssembler::load_byte_map_base(Register reg) {
  jbyte *byte_map_base =
    ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();

  if (is_valid_AArch64_address((address)byte_map_base)) {
    // Strictly speaking the byte_map_base isn't an address at all,
    // and it might even be negative.
    unsigned long offset;
    adrp(reg, ExternalAddress((address)byte_map_base), offset);
    // We expect offset to be zero with most collectors.
    if (offset != 0) {
      add(reg, reg, offset);
    }
  } else {
    // Not usable with adrp: materialise the raw 64-bit value instead.
    mov(reg, (uint64_t)byte_map_base);
  }
}

// Emit a prologue that allocates a frame of framesize bytes and saves rfp/lr
// in its top two words. With PreserveFramePointer, rfp is also set to point
// at the saved pair.
void MacroAssembler::build_frame(int framesize) {
  assert(framesize > 0, "framesize must be > 0");
  if (framesize < ((1 << 9) + 2 * wordSize)) {
    // Small frame: drop sp once, then store the pair at an immediate offset.
    sub(sp, sp, framesize);
    stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
    if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
  } else {
    // Larger frame: push the pair with pre-index, then allocate the rest.
    stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
    if (PreserveFramePointer) mov(rfp, sp);
    if (framesize < ((1 << 12) + 2 * wordSize))
      sub(sp, sp, framesize - 2 * wordSize);
    else {
      // Remainder goes through rscratch1 rather than an immediate.
      mov(rscratch1, framesize - 2 * wordSize);
      sub(sp, sp, rscratch1);
    }
  }
}

// Emit the epilogue matching build_frame(framesize): reload rfp/lr and
// release the frame, using the same size thresholds.
void MacroAssembler::remove_frame(int framesize) {
  assert(framesize > 0, "framesize must be > 0");
  if (framesize < ((1 << 9) + 2 * wordSize)) {
    ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
    add(sp, sp, framesize);
  } else {
    if (framesize < ((1 << 12) + 2 * wordSize))
      add(sp, sp, framesize - 2 * wordSize);
    else {
      mov(rscratch1, framesize - 2 * wordSize);
      add(sp, sp, rscratch1);
    }
    ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
  }
}

#ifdef COMPILER2
// Pointer to a MacroAssembler load instruction taking (Rt, Address).
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

// Search for str1 in str2 and return index or -1
//
//   str2, cnt2  source string and its length in characters
//   str1, cnt1  pattern string and its length in characters
//   icnt1       pattern length when it is a constant 1..4, or -1 when the
//               length has to be taken from cnt1
//   ae          a StrIntrinsicNode encoding constant (LL, UL, LU or other)
//               selecting 1- or 2-byte characters for each string
//   result      receives the match index, or -1
//
// Uses rscratch1, rscratch2 and v0 in addition to the tmp registers, and
// modifies str1, str2, cnt1 and cnt2 on some paths.
void MacroAssembler::string_indexof(Register str2, Register str1,
                                    Register cnt2, Register cnt1,
                                    Register tmp1, Register tmp2,
                                    Register tmp3, Register tmp4,
                                    Register tmp5, Register tmp6,
                                    int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  // Per-string character width: shift (log2 of size) and size in bytes, plus
  // the matching single-character load instruction.
  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                         (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                         (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    // result_tmp = cnt2 - cnt1, the last source index at which a match can start
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    // If cnt1 < 256, compare cnt1 with cnt2/4; otherwise force flags 0b0000,
    // which makes the GE branch below taken.
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

  // The Boyer Moore algorithm is based on the description here:-
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
  // and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:-
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // This particular implementation has few java-specific optimizations.
  //
  // #define ASIZE 256
  //
  //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
  //       int i, j;
  //       unsigned c;
  //       unsigned char bc[ASIZE];
  //
  //       /* Preprocessing */
  //       for (i = 0; i < ASIZE; ++i)
  //          bc[i] = m;
  //       for (i = 0; i < m - 1; ) {
  //          c = x[i];
  //          ++i;
  //          // c < 256 for Latin1 string, so, no need for branch
  //          #ifdef PATTERN_STRING_IS_LATIN1
  //          bc[c] = m - i;
  //          #else
  //          if (c < ASIZE) bc[c] = m - i;
  //          #endif
  //       }
  //
  //       /* Searching */
  //       j = 0;
  //       while (j <= n - m) {
  //          c = y[i+j];
  //          if (x[m-1] == c)
  //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
  //          if (i < 0) return j;
  //          // c < 256 for Latin1 string, so, no need for branch
  //          #ifdef SOURCE_STRING_IS_LATIN1
  //          // LL case: (c< 256) always true. Remove branch
  //          j += bc[y[j+m-1]];
  //          #endif
  //          #ifndef PATTERN_STRING_IS_UTF
  //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
  //          if (c < ASIZE)
  //            j += bc[y[j+m-1]];
  //          else
  //            j += 1
  //          #endif
  //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
  //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
  //          if (c < ASIZE)
  //            j += bc[y[j+m-1]];
  //          else
  //            j += m
  //          #endif
  //       }
  //    }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    // NOTE(review): cnt1end is not referenced below.
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >=8, so, we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
    // UL case. We'll re-read last character in inner pre-loop code to have
    // single outer pre-loop load
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // amount of bytes stored per instruction
    // Reserve the ASIZE-byte bad-character table on the stack and fill it
    // with copies of v0 (every byte of v0 was set from cnt1 by the dup above).
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);

      sub(cnt1tmp, cnt1, 1);
      // tmp5 keeps the original str2 so BMMATCH can turn a pointer into an index
      mov(tmp5, str2);
      add(str2end, str2, result_tmp, LSL, str2_chr_shift);
      sub(ch2, cnt1, 1);
      mov(tmp3, str1);
    // Preprocessing loop: for each of the first cnt1-1 pattern characters,
    // store the remaining distance (ch2, counting down) at table[character].
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        // 2-byte pattern characters >= ASIZE have no table slot
        subs(zr, ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);

      add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
      if (str1_isL == str2_isL) {
        // load last 8 bytes (8LL/4UU symbols)
        ldr(tmp6, Address(tmp6, -wordSize));
      } else {
        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
        // convert Latin1 to UTF. We'll have to wait until load completed, but
        // it's still faster than per-character loads+checks
        lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
        ubfx(ch1, tmp6, 8, 8); // str1[N-2]
        ubfx(ch2, tmp6, 16, 8); // str1[N-3]
        andr(tmp6, tmp6, 0xFF); // str1[N-4]
        orr(ch2, ch1, ch2, LSL, 16);
        orr(tmp6, tmp6, tmp3, LSL, 48);
        orr(tmp6, tmp6, ch2, LSL, 16);
      }
    // Outer search loop: one iteration per candidate position of str2.
    BIND(BMLOOPSTR2);
      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
      if (str1_isL == str2_isL) {
        // re-init tmp3. It's for free because it's executed in parallel with
        // load above. Alternative is to initialize it before loop, but it'll
        // affect performance on in-order systems with 2 or more ld/st pipelines
        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
      }
      if (!isL) { // UU/UL case
        lsl(ch2, cnt1tmp, 1); // offset in bytes
      }
      // Last pattern character against the source character aligned with it
      cmp(tmp3, skipch);
      br(NE, BMSKIP);
      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
      mov(ch1, tmp6);
      if (isL) {
        b(BMLOOPSTR1_AFTER_LOAD);
      } else {
        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
        b(BMLOOPSTR1_CMP);
      }
    // Inner loop: compare remaining characters one at a time, walking
    // cnt1tmp down towards zero.
    BIND(BMLOOPSTR1);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
      cmp(ch1, ch2);
      br(EQ, BMLOOPSTR1);
    // Mismatch: work out how far to advance str2.
    BIND(BMSKIP);
      if (!isL) {
        // if we've met UTF symbol while searching Latin1 pattern, then we can
        // skip cnt1 symbols
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
        subs(zr, skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
      add(str2, str2, result_tmp, LSL, str2_chr_shift);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      // Ran past the last candidate: release the table and report no match.
      add(sp, sp, ASIZE);
      b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
      cmp(ch1, ch2);
      br(NE, BMSKIP);
    BIND(BMMATCH);
      // Index = (current str2 - original str2), converted from bytes to
      // characters for 2-byte sources.
      sub(result, str2, tmp5);
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE);
      b(DONE);

    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    // Pick the out-of-line linear-search stub for this encoding combination.
    RuntimeAddress stub = NULL;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
       assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
    }
    trampoline_call(stub);
    b(DONE);
  }

  // Linear-scan variants. The scanning loops address str2 (and str1) through
  // a negative byte offset that counts up; the final index is recovered at
  // MATCH from result_tmp and cnt2_neg.
  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    // NOTE(review): str2tmp is not referenced below.
    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

        cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
        br(LT, DOSHORT);
      BIND(LINEAR_MEDIUM);
        // 'first' holds the first pattern character; str1/str2 are advanced
        // so that the negative offsets index back from them.
        (this->*str1_load_1chr)(first, Address(str1));
        lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
        sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      // Scan str2 for the first pattern character
      BIND(FIRST_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmp(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      // First character matched: compare the rest of the pattern
      BIND(STR1_LOOP);
        adds(cnt1tmp, cnt1_neg, str1_chr_size);
        add(cnt2tmp, cnt2_neg, str2_chr_size);
        br(GE, MATCH);

      BIND(STR1_NEXT);
        (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        adds(cnt1tmp, cnt1tmp, str1_chr_size);
        add(cnt2tmp, cnt2tmp, str2_chr_size);
        br(LT, STR1_NEXT);
        b(MATCH);

      BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        // cnt1 < 2 -> DO1, cnt1 > 2 -> DO3, cnt1 == 2 falls through to DO2
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    // Constant 4-character pattern: compare all four characters with one load.
    if (icnt1 == 4) {
      Label CH1_LOOP;

        (this->*load_4chr)(ch1, str1);
        sub(result_tmp, cnt2, 4);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(CH1_LOOP);
        (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
      }

    // 2-character pattern: compare both characters with one load.
    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      Label CH1_LOOP;

      BIND(DO2);
        (this->*load_2chr)(ch1, str1);
        if (icnt1 == 2) {
          sub(result_tmp, cnt2, 2);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(CH1_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    // 3-character pattern: first two characters with one load, third separately.
    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
        (this->*load_2chr)(first, str1);
        (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
        if (icnt1 == 3) {
          sub(result_tmp, cnt2, 3);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(FIRST_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmpw(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        b(MATCH);
    }

    // 1-character pattern.
    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
        (this->*str1_load_1chr)(ch1, str1);
        cmp(cnt2, (u1)8);
        br(LT, DO1_SHORT);

        sub(result_tmp, cnt2, 8/str2_chr_size);
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
        mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

        // Replicate the pattern character across all lanes of ch1
        if (str2_isL) {
          orr(ch1, ch1, ch1, LSL, 8);
        }
        orr(ch1, ch1, ch1, LSL, 16);
        orr(ch1, ch1, ch1, LSL, 32);
      // Compare 8 source bytes at a time: after the eor a matching lane is
      // zero, and (x - 0x01..) & ~(x | 0x7f..) is non-zero when some lane of
      // x is zero.
      BIND(CH1_LOOP);
        ldr(ch2, Address(str2, cnt2_neg));
        eor(ch2, ch1, ch2);
        sub(tmp1, ch2, tmp3);
        orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
        bics(tmp1, tmp1, tmp2);
        br(NE, HAS_ZERO);
        adds(cnt2_neg, cnt2_neg, 8);
        br(LT, CH1_LOOP);

        // One more pass over the final 8 bytes (offset 0) when the loop
        // stopped less than 8 bytes past it.
        cmp(cnt2_neg, (u1)8);
        mov(cnt2_neg, 0);
        br(LT, CH1_LOOP);
        b(NOMATCH);

      BIND(HAS_ZERO);
        // Byte-reverse and count leading zeros to locate the lowest flagged
        // lane; >> 3 converts the bit position to a byte offset.
        rev(tmp1, tmp1);
        clz(tmp1, tmp1);
        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
        b(MATCH);

      // Source shorter than 8: plain character-by-character scan
      BIND(DO1_SHORT);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmpw(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // result = result_tmp + (negative byte offset converted to characters)
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                         Register ch, Register result,
                                         Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register
cnt1_neg = cnt1; 4808 Register ch1 = rscratch1; 4809 Register result_tmp = rscratch2; 4810 4811 cmp(cnt1, (u1)4); 4812 br(LT, DO1_SHORT); 4813 4814 orr(ch, ch, ch, LSL, 16); 4815 orr(ch, ch, ch, LSL, 32); 4816 4817 sub(cnt1, cnt1, 4); 4818 mov(result_tmp, cnt1); 4819 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 4820 sub(cnt1_neg, zr, cnt1, LSL, 1); 4821 4822 mov(tmp3, 0x0001000100010001); 4823 4824 BIND(CH1_LOOP); 4825 ldr(ch1, Address(str1, cnt1_neg)); 4826 eor(ch1, ch, ch1); 4827 sub(tmp1, ch1, tmp3); 4828 orr(tmp2, ch1, 0x7fff7fff7fff7fff); 4829 bics(tmp1, tmp1, tmp2); 4830 br(NE, HAS_ZERO); 4831 adds(cnt1_neg, cnt1_neg, 8); 4832 br(LT, CH1_LOOP); 4833 4834 cmp(cnt1_neg, (u1)8); 4835 mov(cnt1_neg, 0); 4836 br(LT, CH1_LOOP); 4837 b(NOMATCH); 4838 4839 BIND(HAS_ZERO); 4840 rev(tmp1, tmp1); 4841 clz(tmp1, tmp1); 4842 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 4843 b(MATCH); 4844 4845 BIND(DO1_SHORT); 4846 mov(result_tmp, cnt1); 4847 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 4848 sub(cnt1_neg, zr, cnt1, LSL, 1); 4849 BIND(DO1_LOOP); 4850 ldrh(ch1, Address(str1, cnt1_neg)); 4851 cmpw(ch, ch1); 4852 br(EQ, MATCH); 4853 adds(cnt1_neg, cnt1_neg, 2); 4854 br(LT, DO1_LOOP); 4855 BIND(NOMATCH); 4856 mov(result, -1); 4857 b(DONE); 4858 BIND(MATCH); 4859 add(result, result_tmp, cnt1_neg, ASR, 1); 4860 BIND(DONE); 4861 } 4862 4863 // Compare strings. 
4864 void MacroAssembler::string_compare(Register str1, Register str2, 4865 Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2, 4866 FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) { 4867 Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB, 4868 DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, 4869 SHORT_LOOP_START, TAIL_CHECK; 4870 4871 const u1 STUB_THRESHOLD = 64 + 8; 4872 bool isLL = ae == StrIntrinsicNode::LL; 4873 bool isLU = ae == StrIntrinsicNode::LU; 4874 bool isUL = ae == StrIntrinsicNode::UL; 4875 4876 bool str1_isL = isLL || isLU; 4877 bool str2_isL = isLL || isUL; 4878 4879 int str1_chr_shift = str1_isL ? 0 : 1; 4880 int str2_chr_shift = str2_isL ? 0 : 1; 4881 int str1_chr_size = str1_isL ? 1 : 2; 4882 int str2_chr_size = str2_isL ? 1 : 2; 4883 int minCharsInWord = isLL ? wordSize : wordSize/2; 4884 4885 FloatRegister vtmpZ = vtmp1, vtmp = vtmp2; 4886 chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb : 4887 (chr_insn)&MacroAssembler::ldrh; 4888 chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb : 4889 (chr_insn)&MacroAssembler::ldrh; 4890 uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw : 4891 (uxt_insn)&MacroAssembler::uxthw; 4892 4893 BLOCK_COMMENT("string_compare {"); 4894 4895 // Bizzarely, the counts are passed in bytes, regardless of whether they 4896 // are L or U strings, however the result is always in characters. 4897 if (!str1_isL) asrw(cnt1, cnt1, 1); 4898 if (!str2_isL) asrw(cnt2, cnt2, 1); 4899 4900 // Compute the minimum of the string lengths and save the difference. 
4901 subsw(result, cnt1, cnt2); 4902 cselw(cnt2, cnt1, cnt2, Assembler::LE); // min 4903 4904 // A very short string 4905 cmpw(cnt2, minCharsInWord); 4906 br(Assembler::LT, SHORT_STRING); 4907 4908 // Compare longwords 4909 // load first parts of strings and finish initialization while loading 4910 { 4911 if (str1_isL == str2_isL) { // LL or UU 4912 ldr(tmp1, Address(str1)); 4913 cmp(str1, str2); 4914 br(Assembler::EQ, DONE); 4915 ldr(tmp2, Address(str2)); 4916 cmp(cnt2, STUB_THRESHOLD); 4917 br(GE, STUB); 4918 subsw(cnt2, cnt2, minCharsInWord); 4919 br(EQ, TAIL_CHECK); 4920 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 4921 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 4922 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 4923 } else if (isLU) { 4924 ldrs(vtmp, Address(str1)); 4925 cmp(str1, str2); 4926 br(Assembler::EQ, DONE); 4927 ldr(tmp2, Address(str2)); 4928 cmp(cnt2, STUB_THRESHOLD); 4929 br(GE, STUB); 4930 subsw(cnt2, cnt2, 4); 4931 br(EQ, TAIL_CHECK); 4932 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 4933 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 4934 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 4935 zip1(vtmp, T8B, vtmp, vtmpZ); 4936 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 4937 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 4938 add(cnt1, cnt1, 4); 4939 fmovd(tmp1, vtmp); 4940 } else { // UL case 4941 ldr(tmp1, Address(str1)); 4942 cmp(str1, str2); 4943 br(Assembler::EQ, DONE); 4944 ldrs(vtmp, Address(str2)); 4945 cmp(cnt2, STUB_THRESHOLD); 4946 br(GE, STUB); 4947 subsw(cnt2, cnt2, 4); 4948 br(EQ, TAIL_CHECK); 4949 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 4950 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 4951 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 4952 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 4953 zip1(vtmp, T8B, vtmp, vtmpZ); 4954 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 4955 add(cnt1, cnt1, 8); 4956 fmovd(tmp2, vtmp); 4957 } 4958 adds(cnt2, cnt2, isUL ? 
4 : 8); 4959 br(GE, TAIL); 4960 eor(rscratch2, tmp1, tmp2); 4961 cbnz(rscratch2, DIFFERENCE); 4962 // main loop 4963 bind(NEXT_WORD); 4964 if (str1_isL == str2_isL) { 4965 ldr(tmp1, Address(str1, cnt2)); 4966 ldr(tmp2, Address(str2, cnt2)); 4967 adds(cnt2, cnt2, 8); 4968 } else if (isLU) { 4969 ldrs(vtmp, Address(str1, cnt1)); 4970 ldr(tmp2, Address(str2, cnt2)); 4971 add(cnt1, cnt1, 4); 4972 zip1(vtmp, T8B, vtmp, vtmpZ); 4973 fmovd(tmp1, vtmp); 4974 adds(cnt2, cnt2, 8); 4975 } else { // UL 4976 ldrs(vtmp, Address(str2, cnt2)); 4977 ldr(tmp1, Address(str1, cnt1)); 4978 zip1(vtmp, T8B, vtmp, vtmpZ); 4979 add(cnt1, cnt1, 8); 4980 fmovd(tmp2, vtmp); 4981 adds(cnt2, cnt2, 4); 4982 } 4983 br(GE, TAIL); 4984 4985 eor(rscratch2, tmp1, tmp2); 4986 cbz(rscratch2, NEXT_WORD); 4987 b(DIFFERENCE); 4988 bind(TAIL); 4989 eor(rscratch2, tmp1, tmp2); 4990 cbnz(rscratch2, DIFFERENCE); 4991 // Last longword. In the case where length == 4 we compare the 4992 // same longword twice, but that's still faster than another 4993 // conditional branch. 4994 if (str1_isL == str2_isL) { 4995 ldr(tmp1, Address(str1)); 4996 ldr(tmp2, Address(str2)); 4997 } else if (isLU) { 4998 ldrs(vtmp, Address(str1)); 4999 ldr(tmp2, Address(str2)); 5000 zip1(vtmp, T8B, vtmp, vtmpZ); 5001 fmovd(tmp1, vtmp); 5002 } else { // UL 5003 ldrs(vtmp, Address(str2)); 5004 ldr(tmp1, Address(str1)); 5005 zip1(vtmp, T8B, vtmp, vtmpZ); 5006 fmovd(tmp2, vtmp); 5007 } 5008 bind(TAIL_CHECK); 5009 eor(rscratch2, tmp1, tmp2); 5010 cbz(rscratch2, DONE); 5011 5012 // Find the first different characters in the longwords and 5013 // compute their difference. 5014 bind(DIFFERENCE); 5015 rev(rscratch2, rscratch2); 5016 clz(rscratch2, rscratch2); 5017 andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 5018 lsrv(tmp1, tmp1, rscratch2); 5019 (this->*ext_chr)(tmp1, tmp1); 5020 lsrv(tmp2, tmp2, rscratch2); 5021 (this->*ext_chr)(tmp2, tmp2); 5022 subw(result, tmp1, tmp2); 5023 b(DONE); 5024 } 5025 5026 bind(STUB); 5027 RuntimeAddress stub = NULL; 5028 switch(ae) { 5029 case StrIntrinsicNode::LL: 5030 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL()); 5031 break; 5032 case StrIntrinsicNode::UU: 5033 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU()); 5034 break; 5035 case StrIntrinsicNode::LU: 5036 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU()); 5037 break; 5038 case StrIntrinsicNode::UL: 5039 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL()); 5040 break; 5041 default: 5042 ShouldNotReachHere(); 5043 } 5044 assert(stub.target() != NULL, "compare_long_string stub has not been generated"); 5045 trampoline_call(stub); 5046 b(DONE); 5047 5048 bind(SHORT_STRING); 5049 // Is the minimum length zero? 5050 cbz(cnt2, DONE); 5051 // arrange code to do most branches while loading and loading next characters 5052 // while comparing previous 5053 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 5054 subs(cnt2, cnt2, 1); 5055 br(EQ, SHORT_LAST_INIT); 5056 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 5057 b(SHORT_LOOP_START); 5058 bind(SHORT_LOOP); 5059 subs(cnt2, cnt2, 1); 5060 br(EQ, SHORT_LAST); 5061 bind(SHORT_LOOP_START); 5062 (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size))); 5063 (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size))); 5064 cmp(tmp1, cnt1); 5065 br(NE, SHORT_LOOP_TAIL); 5066 subs(cnt2, cnt2, 1); 5067 br(EQ, SHORT_LAST2); 5068 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 5069 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 5070 cmp(tmp2, rscratch1); 5071 br(EQ, SHORT_LOOP); 5072 sub(result, tmp2, rscratch1); 5073 b(DONE); 5074 bind(SHORT_LOOP_TAIL); 5075 
sub(result, tmp1, cnt1); 5076 b(DONE); 5077 bind(SHORT_LAST2); 5078 cmp(tmp2, rscratch1); 5079 br(EQ, DONE); 5080 sub(result, tmp2, rscratch1); 5081 5082 b(DONE); 5083 bind(SHORT_LAST_INIT); 5084 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 5085 bind(SHORT_LAST); 5086 cmp(tmp1, cnt1); 5087 br(EQ, DONE); 5088 sub(result, tmp1, cnt1); 5089 5090 bind(DONE); 5091 5092 BLOCK_COMMENT("} string_compare"); 5093 } 5094 #endif // COMPILER2 5095 5096 // This method checks if provided byte array contains byte with highest bit set. 5097 void MacroAssembler::has_negatives(Register ary1, Register len, Register result) { 5098 // Simple and most common case of aligned small array which is not at the 5099 // end of memory page is placed here. All other cases are in stub. 5100 Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE; 5101 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 5102 assert_different_registers(ary1, len, result); 5103 5104 cmpw(len, 0); 5105 br(LE, SET_RESULT); 5106 cmpw(len, 4 * wordSize); 5107 br(GE, STUB_LONG); // size > 32 then go to stub 5108 5109 int shift = 64 - exact_log2(os::vm_page_size()); 5110 lsl(rscratch1, ary1, shift); 5111 mov(rscratch2, (size_t)(4 * wordSize) << shift); 5112 adds(rscratch2, rscratch1, rscratch2); // At end of page? 
5113 br(CS, STUB); // at the end of page then go to stub 5114 subs(len, len, wordSize); 5115 br(LT, END); 5116 5117 BIND(LOOP); 5118 ldr(rscratch1, Address(post(ary1, wordSize))); 5119 tst(rscratch1, UPPER_BIT_MASK); 5120 br(NE, SET_RESULT); 5121 subs(len, len, wordSize); 5122 br(GE, LOOP); 5123 cmpw(len, -wordSize); 5124 br(EQ, SET_RESULT); 5125 5126 BIND(END); 5127 ldr(result, Address(ary1)); 5128 sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes 5129 lslv(result, result, len); 5130 tst(result, UPPER_BIT_MASK); 5131 b(SET_RESULT); 5132 5133 BIND(STUB); 5134 RuntimeAddress has_neg = RuntimeAddress(StubRoutines::aarch64::has_negatives()); 5135 assert(has_neg.target() != NULL, "has_negatives stub has not been generated"); 5136 trampoline_call(has_neg); 5137 b(DONE); 5138 5139 BIND(STUB_LONG); 5140 RuntimeAddress has_neg_long = RuntimeAddress( 5141 StubRoutines::aarch64::has_negatives_long()); 5142 assert(has_neg_long.target() != NULL, "has_negatives stub has not been generated"); 5143 trampoline_call(has_neg_long); 5144 b(DONE); 5145 5146 BIND(SET_RESULT); 5147 cset(result, NE); // set true or false 5148 5149 BIND(DONE); 5150 } 5151 5152 void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3, 5153 Register tmp4, Register tmp5, Register result, 5154 Register cnt1, int elem_size) { 5155 Label DONE, SAME; 5156 Register tmp1 = rscratch1; 5157 Register tmp2 = rscratch2; 5158 Register cnt2 = tmp2; // cnt2 only used in array length compare 5159 int elem_per_word = wordSize/elem_size; 5160 int log_elem_size = exact_log2(elem_size); 5161 int length_offset = arrayOopDesc::length_offset_in_bytes(); 5162 int base_offset 5163 = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE); 5164 int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 
0 : 16); 5165 5166 assert(elem_size == 1 || elem_size == 2, "must be char or byte"); 5167 assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2); 5168 5169 #ifndef PRODUCT 5170 { 5171 const char kind = (elem_size == 2) ? 'U' : 'L'; 5172 char comment[64]; 5173 snprintf(comment, sizeof comment, "array_equals%c{", kind); 5174 BLOCK_COMMENT(comment); 5175 } 5176 #endif 5177 5178 // if (a1 == a2) 5179 // return true; 5180 cmpoop(a1, a2); // May have read barriers for a1 and a2. 5181 br(EQ, SAME); 5182 5183 if (UseSimpleArrayEquals) { 5184 Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL; 5185 // if (a1 == null || a2 == null) 5186 // return false; 5187 // a1 & a2 == 0 means (some-pointer is null) or 5188 // (very-rare-or-even-probably-impossible-pointer-values) 5189 // so, we can save one branch in most cases 5190 tst(a1, a2); 5191 mov(result, false); 5192 br(EQ, A_MIGHT_BE_NULL); 5193 // if (a1.length != a2.length) 5194 // return false; 5195 bind(A_IS_NOT_NULL); 5196 ldrw(cnt1, Address(a1, length_offset)); 5197 ldrw(cnt2, Address(a2, length_offset)); 5198 eorw(tmp5, cnt1, cnt2); 5199 cbnzw(tmp5, DONE); 5200 lea(a1, Address(a1, base_offset)); 5201 lea(a2, Address(a2, base_offset)); 5202 // Check for short strings, i.e. smaller than wordSize. 5203 subs(cnt1, cnt1, elem_per_word); 5204 br(Assembler::LT, SHORT); 5205 // Main 8 byte comparison loop. 5206 bind(NEXT_WORD); { 5207 ldr(tmp1, Address(post(a1, wordSize))); 5208 ldr(tmp2, Address(post(a2, wordSize))); 5209 subs(cnt1, cnt1, elem_per_word); 5210 eor(tmp5, tmp1, tmp2); 5211 cbnz(tmp5, DONE); 5212 } br(GT, NEXT_WORD); 5213 // Last longword. In the case where length == 4 we compare the 5214 // same longword twice, but that's still faster than another 5215 // conditional branch. 5216 // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when 5217 // length == 4. 
5218 if (log_elem_size > 0) 5219 lsl(cnt1, cnt1, log_elem_size); 5220 ldr(tmp3, Address(a1, cnt1)); 5221 ldr(tmp4, Address(a2, cnt1)); 5222 eor(tmp5, tmp3, tmp4); 5223 cbnz(tmp5, DONE); 5224 b(SAME); 5225 bind(A_MIGHT_BE_NULL); 5226 // in case both a1 and a2 are not-null, proceed with loads 5227 cbz(a1, DONE); 5228 cbz(a2, DONE); 5229 b(A_IS_NOT_NULL); 5230 bind(SHORT); 5231 5232 tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left. 5233 { 5234 ldrw(tmp1, Address(post(a1, 4))); 5235 ldrw(tmp2, Address(post(a2, 4))); 5236 eorw(tmp5, tmp1, tmp2); 5237 cbnzw(tmp5, DONE); 5238 } 5239 bind(TAIL03); 5240 tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left. 5241 { 5242 ldrh(tmp3, Address(post(a1, 2))); 5243 ldrh(tmp4, Address(post(a2, 2))); 5244 eorw(tmp5, tmp3, tmp4); 5245 cbnzw(tmp5, DONE); 5246 } 5247 bind(TAIL01); 5248 if (elem_size == 1) { // Only needed when comparing byte arrays. 5249 tbz(cnt1, 0, SAME); // 0-1 bytes left. 5250 { 5251 ldrb(tmp1, a1); 5252 ldrb(tmp2, a2); 5253 eorw(tmp5, tmp1, tmp2); 5254 cbnzw(tmp5, DONE); 5255 } 5256 } 5257 } else { 5258 Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT, 5259 CSET_EQ, LAST_CHECK; 5260 mov(result, false); 5261 cbz(a1, DONE); 5262 ldrw(cnt1, Address(a1, length_offset)); 5263 cbz(a2, DONE); 5264 ldrw(cnt2, Address(a2, length_offset)); 5265 // on most CPUs a2 is still "locked"(surprisingly) in ldrw and it's 5266 // faster to perform another branch before comparing a1 and a2 5267 cmp(cnt1, (u1)elem_per_word); 5268 br(LE, SHORT); // short or same 5269 ldr(tmp3, Address(pre(a1, base_offset))); 5270 subs(zr, cnt1, stubBytesThreshold); 5271 br(GE, STUB); 5272 ldr(tmp4, Address(pre(a2, base_offset))); 5273 sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size); 5274 cmp(cnt2, cnt1); 5275 br(NE, DONE); 5276 5277 // Main 16 byte comparison loop with 2 exits 5278 bind(NEXT_DWORD); { 5279 ldr(tmp1, Address(pre(a1, wordSize))); 5280 ldr(tmp2, Address(pre(a2, wordSize))); 5281 subs(cnt1, cnt1, 2 * elem_per_word); 5282 br(LE, 
TAIL); 5283 eor(tmp4, tmp3, tmp4); 5284 cbnz(tmp4, DONE); 5285 ldr(tmp3, Address(pre(a1, wordSize))); 5286 ldr(tmp4, Address(pre(a2, wordSize))); 5287 cmp(cnt1, (u1)elem_per_word); 5288 br(LE, TAIL2); 5289 cmp(tmp1, tmp2); 5290 } br(EQ, NEXT_DWORD); 5291 b(DONE); 5292 5293 bind(TAIL); 5294 eor(tmp4, tmp3, tmp4); 5295 eor(tmp2, tmp1, tmp2); 5296 lslv(tmp2, tmp2, tmp5); 5297 orr(tmp5, tmp4, tmp2); 5298 cmp(tmp5, zr); 5299 b(CSET_EQ); 5300 5301 bind(TAIL2); 5302 eor(tmp2, tmp1, tmp2); 5303 cbnz(tmp2, DONE); 5304 b(LAST_CHECK); 5305 5306 bind(STUB); 5307 ldr(tmp4, Address(pre(a2, base_offset))); 5308 cmp(cnt2, cnt1); 5309 br(NE, DONE); 5310 if (elem_size == 2) { // convert to byte counter 5311 lsl(cnt1, cnt1, 1); 5312 } 5313 eor(tmp5, tmp3, tmp4); 5314 cbnz(tmp5, DONE); 5315 RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals()); 5316 assert(stub.target() != NULL, "array_equals_long stub has not been generated"); 5317 trampoline_call(stub); 5318 b(DONE); 5319 5320 bind(EARLY_OUT); 5321 // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2) 5322 // so, if a2 == null => return false(0), else return true, so we can return a2 5323 mov(result, a2); 5324 b(DONE); 5325 bind(SHORT); 5326 cmp(cnt2, cnt1); 5327 br(NE, DONE); 5328 cbz(cnt1, SAME); 5329 sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size); 5330 ldr(tmp3, Address(a1, base_offset)); 5331 ldr(tmp4, Address(a2, base_offset)); 5332 bind(LAST_CHECK); 5333 eor(tmp4, tmp3, tmp4); 5334 lslv(tmp5, tmp4, tmp5); 5335 cmp(tmp5, zr); 5336 bind(CSET_EQ); 5337 cset(result, EQ); 5338 b(DONE); 5339 } 5340 5341 bind(SAME); 5342 mov(result, true); 5343 // That's it. 5344 bind(DONE); 5345 5346 BLOCK_COMMENT("} array_equals"); 5347 } 5348 5349 // Compare Strings 5350 5351 // For Strings we're passed the address of the first characters in a1 5352 // and a2 and the length in cnt1. 5353 // elem_size is the element size in bytes: either 1 or 2. 5354 // There are two implementations. 
For arrays >= 8 bytes, all 5355 // comparisons (including the final one, which may overlap) are 5356 // performed 8 bytes at a time. For strings < 8 bytes, we compare a 5357 // halfword, then a short, and then a byte. 5358 5359 void MacroAssembler::string_equals(Register a1, Register a2, 5360 Register result, Register cnt1, int elem_size) 5361 { 5362 Label SAME, DONE, SHORT, NEXT_WORD; 5363 Register tmp1 = rscratch1; 5364 Register tmp2 = rscratch2; 5365 Register cnt2 = tmp2; // cnt2 only used in array length compare 5366 5367 assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte"); 5368 assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2); 5369 5370 #ifndef PRODUCT 5371 { 5372 const char kind = (elem_size == 2) ? 'U' : 'L'; 5373 char comment[64]; 5374 snprintf(comment, sizeof comment, "{string_equals%c", kind); 5375 BLOCK_COMMENT(comment); 5376 } 5377 #endif 5378 5379 mov(result, false); 5380 5381 // Check for short strings, i.e. smaller than wordSize. 5382 subs(cnt1, cnt1, wordSize); 5383 br(Assembler::LT, SHORT); 5384 // Main 8 byte comparison loop. 5385 bind(NEXT_WORD); { 5386 ldr(tmp1, Address(post(a1, wordSize))); 5387 ldr(tmp2, Address(post(a2, wordSize))); 5388 subs(cnt1, cnt1, wordSize); 5389 eor(tmp1, tmp1, tmp2); 5390 cbnz(tmp1, DONE); 5391 } br(GT, NEXT_WORD); 5392 // Last longword. In the case where length == 4 we compare the 5393 // same longword twice, but that's still faster than another 5394 // conditional branch. 5395 // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when 5396 // length == 4. 5397 ldr(tmp1, Address(a1, cnt1)); 5398 ldr(tmp2, Address(a2, cnt1)); 5399 eor(tmp2, tmp1, tmp2); 5400 cbnz(tmp2, DONE); 5401 b(SAME); 5402 5403 bind(SHORT); 5404 Label TAIL03, TAIL01; 5405 5406 tbz(cnt1, 2, TAIL03); // 0-7 bytes left. 
5407 { 5408 ldrw(tmp1, Address(post(a1, 4))); 5409 ldrw(tmp2, Address(post(a2, 4))); 5410 eorw(tmp1, tmp1, tmp2); 5411 cbnzw(tmp1, DONE); 5412 } 5413 bind(TAIL03); 5414 tbz(cnt1, 1, TAIL01); // 0-3 bytes left. 5415 { 5416 ldrh(tmp1, Address(post(a1, 2))); 5417 ldrh(tmp2, Address(post(a2, 2))); 5418 eorw(tmp1, tmp1, tmp2); 5419 cbnzw(tmp1, DONE); 5420 } 5421 bind(TAIL01); 5422 if (elem_size == 1) { // Only needed when comparing 1-byte elements 5423 tbz(cnt1, 0, SAME); // 0-1 bytes left. 5424 { 5425 ldrb(tmp1, a1); 5426 ldrb(tmp2, a2); 5427 eorw(tmp1, tmp1, tmp2); 5428 cbnzw(tmp1, DONE); 5429 } 5430 } 5431 // Arrays are equal. 5432 bind(SAME); 5433 mov(result, true); 5434 5435 // That's it. 5436 bind(DONE); 5437 BLOCK_COMMENT("} string_equals"); 5438 } 5439 5440 5441 // The size of the blocks erased by the zero_blocks stub. We must 5442 // handle anything smaller than this ourselves in zero_words(). 5443 const int MacroAssembler::zero_words_block_size = 8; 5444 5445 // zero_words() is used by C2 ClearArray patterns. It is as small as 5446 // possible, handling small word counts locally and delegating 5447 // anything larger to the zero_blocks stub. It is expanded many times 5448 // in compiled code, so it is important to keep it short. 5449 5450 // ptr: Address of a buffer to be zeroed. 5451 // cnt: Count in HeapWords. 5452 // 5453 // ptr, cnt, rscratch1, and rscratch2 are clobbered. 
5454 void MacroAssembler::zero_words(Register ptr, Register cnt) 5455 { 5456 assert(is_power_of_2(zero_words_block_size), "adjust this"); 5457 assert(ptr == r10 && cnt == r11, "mismatch in register usage"); 5458 5459 BLOCK_COMMENT("zero_words {"); 5460 cmp(cnt, (u1)zero_words_block_size); 5461 Label around; 5462 br(LO, around); 5463 { 5464 RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks()); 5465 assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated"); 5466 if (StubRoutines::aarch64::complete()) { 5467 trampoline_call(zero_blocks); 5468 } else { 5469 bl(zero_blocks); 5470 } 5471 } 5472 bind(around); 5473 for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) { 5474 Label l; 5475 tbz(cnt, exact_log2(i), l); 5476 for (int j = 0; j < i; j += 2) { 5477 stp(zr, zr, post(ptr, 16)); 5478 } 5479 bind(l); 5480 } 5481 { 5482 Label l; 5483 tbz(cnt, 0, l); 5484 str(zr, Address(ptr)); 5485 bind(l); 5486 } 5487 BLOCK_COMMENT("} zero_words"); 5488 } 5489 5490 // base: Address of a buffer to be zeroed, 8 bytes aligned. 5491 // cnt: Immediate count in HeapWords. 
#define SmallArraySize (18 * BytesPerLong)
void MacroAssembler::zero_words(Register base, u_int64_t cnt)
{
  BLOCK_COMMENT("zero_words {");
  int i = cnt & 1;  // store any odd word to start
  if (i) str(zr, Address(base));

  if (cnt <= SmallArraySize / BytesPerLong) {
    // Small count: emit straight-line stp pairs, no loop.
    for (; i < (int)cnt; i += 2)
      stp(zr, zr, Address(base, i * wordSize));
  } else {
    const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
    // Words that do not fill a whole unrolled iteration are stored up front.
    int remainder = cnt % (2 * unroll);
    for (; i < remainder; i += 2)
      stp(zr, zr, Address(base, i * wordSize));

    Label loop;
    Register cnt_reg = rscratch1;
    Register loop_base = rscratch2;
    cnt = cnt - remainder;
    mov(cnt_reg, cnt);
    // adjust base and prebias by -2 * wordSize so we can pre-increment
    add(loop_base, base, (remainder - 2) * wordSize);
    bind(loop);
    sub(cnt_reg, cnt_reg, 2 * unroll);
    for (i = 1; i < unroll; i++)
      stp(zr, zr, Address(loop_base, 2 * i * wordSize));
    // The last store of the iteration also advances loop_base.
    stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
    cbnz(cnt_reg, loop);
  }
  BLOCK_COMMENT("} zero_words");
}

// Zero blocks of memory by using DC ZVA.
//
// Aligns the base address first sufficiently for DC ZVA, then uses
// DC ZVA repeatedly for every full block.  cnt is the size to be
// zeroed in HeapWords.  Returns the count of words left to be zeroed
// in cnt.
//
// NOTE: This is intended to be used in the zero_blocks() stub.  If
// you want to use it elsewhere, note that cnt must be >= 2*zva_length.
void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
  Register tmp = rscratch1;
  Register tmp2 = rscratch2;
  int zva_length = VM_Version::zva_length();
  Label initial_table_end, loop_zva;
  Label fini;

  // Base must be 16 byte aligned. If not just return and let caller handle it
  tst(base, 0x0f);
  br(Assembler::NE, fini);
  // Align base with ZVA length.
  neg(tmp, base);
  andr(tmp, tmp, zva_length - 1);

  // tmp: the number of bytes to be filled to align the base with ZVA length.
  add(base, base, tmp);
  sub(cnt, cnt, tmp, Assembler::ASR, 3);
  // Jump backwards from initial_table_end into the stp table below.  Each
  // stp stores 16 bytes, and the LSR 2 scaling of tmp corresponds to one
  // 4-byte instruction per 16 bytes to fill.
  adr(tmp2, initial_table_end);
  sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
  br(tmp2);

  for (int i = -zva_length + 16; i < 0; i += 16)
    stp(zr, zr, Address(base, i));
  bind(initial_table_end);

  // Pre-subtract one block so the loop can test the count with subs/GE.
  sub(cnt, cnt, zva_length >> 3);
  bind(loop_zva);
  dc(Assembler::ZVA, base);
  subs(cnt, cnt, zva_length >> 3);
  add(base, base, zva_length);
  br(Assembler::GE, loop_zva);
  add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
  bind(fini);
}

// base:   Address of a buffer to be filled, 8 bytes aligned.
// cnt:    Count in 8-byte unit.
// value:  Value to be filled with.
// base will point to the end of the buffer after filling.
void MacroAssembler::fill_words(Register base, Register cnt, Register value)
{
//  Algorithm:
//
//    scratch1 = cnt & 7;
//    cnt -= scratch1;
//    p += scratch1;
//    switch (scratch1) {
//      do {
//        cnt -= 8;
//          p[-8] = v;
//        case 7:
//          p[-7] = v;
//        case 6:
//          p[-6] = v;
//          // ...
//        case 1:
//          p[-1] = v;
//        case 0:
//          p += 8;
//      } while (cnt);
//    }

  assert_different_registers(base, cnt, value, rscratch1, rscratch2);

  Label fini, skip, entry, loop;
  const int unroll = 8; // Number of stp instructions we'll unroll

  cbz(cnt, fini);
  // If base is not 16-byte aligned, store a single word first.
  tbz(base, 3, skip);
  str(value, Address(post(base, 8)));
  sub(cnt, cnt, 1);
  bind(skip);

  // rscratch1 = even number of words to store before the first full
  // unrolled iteration.  Enter the stp sequence part-way through: the LSL 1
  // scaling corresponds to one 4-byte stp per two words.
  andr(rscratch1, cnt, (unroll-1) * 2);
  sub(cnt, cnt, rscratch1);
  add(base, base, rscratch1, Assembler::LSL, 3);
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
  br(rscratch2);

  bind(loop);
  add(base, base, unroll * 16);
  for (int i = -unroll; i < 0; i++)
    stp(value, value, Address(base, i * 16));
  bind(entry);
  subs(cnt, cnt, unroll * 2);
  br(Assembler::GE, loop);

  // A remaining odd word, if any.
  tbz(cnt, 0, fini);
  str(value, Address(post(base, 8)));
  bind(fini);
}

// Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
// java/lang/StringUTF16.compress.
// Narrows 16-bit chars read from src to bytes written at dst, stopping at
// the first char whose high byte is non-zero.
//
// src:    address of the source chars
// dst:    address of the destination bytes
// len:    number of chars to process
// result: receives the initial len minus the chars left when we stopped
//
// src, dst, len, rscratch1, rscratch2, the four vector temporaries and
// (when the SIMD paths are compiled in) v4 and v5 are clobbered.
void MacroAssembler::encode_iso_array(Register src, Register dst,
                      Register len, Register result,
                      FloatRegister Vtmp1, FloatRegister Vtmp2,
                      FloatRegister Vtmp3, FloatRegister Vtmp4)
{
    Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
        NEXT_32_START, NEXT_32_PRFM_START;
    Register tmp1 = rscratch1, tmp2 = rscratch2;

      mov(result, len); // Save initial len

#ifndef BUILTIN_SIM
      cmp(len, (u1)8); // handle shortest strings first
      br(LT, LOOP_1);
      cmp(len, (u1)32);
      br(LT, NEXT_8);
      // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
      // to convert chars to bytes
      if (SoftwarePrefetchHintDistance >= 0) {
        ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
        br(LE, NEXT_32_START);
        b(NEXT_32_PRFM_START);
        BIND(NEXT_32_PRFM);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_PRFM_START);
          prfm(Address(src, SoftwarePrefetchHintDistance));
          // OR the four vectors together so that a single uzp2 gathers the
          // high bytes of all 32 chars.
          orr(v4, T16B, Vtmp1, Vtmp2);
          orr(v5, T16B, Vtmp3, Vtmp4);
          uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
          uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
          stpq(Vtmp1, Vtmp3, dst);
          uzp2(v5, T16B, v4, v5); // high bytes
          umov(tmp2, v5, D, 1);
          fmovd(tmp1, v5);
          orr(tmp1, tmp1, tmp2);
          // A non-zero high byte: retry this chunk with the narrower loops
          // (src, dst and len have not been advanced yet).
          cbnz(tmp1, LOOP_8);
          sub(len, len, 32);
          add(dst, dst, 32);
          add(src, src, 64);
          subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
          br(GE, NEXT_32_PRFM);
          cmp(len, (u1)32);
          br(LT, LOOP_8);
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_START);
      } else {
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
      }
      // NOTE(review): this prfm is emitted on both branches above, so it is
      // also issued when SoftwarePrefetchHintDistance is negative - confirm
      // that this is intended.
      prfm(Address(src, SoftwarePrefetchHintDistance));
      uzp1(v4, T16B, Vtmp1, Vtmp2);
      uzp1(v5, T16B, Vtmp3, Vtmp4);
      stpq(v4, v5, dst);
      orr(Vtmp1, T16B, Vtmp1, Vtmp2);
      orr(Vtmp3, T16B, Vtmp3, Vtmp4);
      uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
      umov(tmp2, Vtmp1, D, 1);
      fmovd(tmp1, Vtmp1);
      orr(tmp1, tmp1, tmp2);
      cbnz(tmp1, LOOP_8);
      sub(len, len, 32);
      add(dst, dst, 32);
      add(src, src, 64);
      cmp(len, (u1)32);
      br(GE, NEXT_32);
      cbz(len, DONE);

    BIND(LOOP_8);
      cmp(len, (u1)8);
      br(LT, LOOP_1);
    BIND(NEXT_8);
      ld1(Vtmp1, T8H, src);
      uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
      uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
      strd(Vtmp2, dst);
      fmovd(tmp1, Vtmp3);
      // A non-zero high byte: let the per-char loop find the exact position.
      cbnz(tmp1, NEXT_1);

      sub(len, len, 8);
      add(dst, dst, 8);
      add(src, src, 16);
      cmp(len, (u1)8);
      br(GE, NEXT_8);

    BIND(LOOP_1);
#endif
    cbz(len, DONE);
    BIND(NEXT_1);
      ldrh(tmp1, Address(post(src, 2)));
      strb(tmp1, Address(post(dst, 1)));
      tst(tmp1, 0xff00);
      br(NE, SET_RESULT);
      subs(len, len, 1);
      br(GT, NEXT_1);

    BIND(SET_RESULT);
      sub(result, result, len); // Return index where we stopped
                                // Return len == 0 if we processed all
                                // characters
    BIND(DONE);
}


// Inflate byte[] array to char[].
//
// src:  address of the source bytes
// dst:  address of the destination chars
// len:  number of bytes to inflate
// tmp4: scratch register; vtmp1..vtmp3 are vector scratch registers
//
// src, dst and len are clobbered.
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
                                        FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                        Register tmp4) {
  Label big, done, after_init, to_stub;

  assert_different_registers(src, dst, len, tmp4, rscratch1);

  // vtmp1 is the all-zero vector that zip1 interleaves with the data bytes.
  fmovd(vtmp1, zr);
  // tmp4 = number of whole 8-byte groups.
  lsrw(tmp4, len, 3);
  bind(after_init);
  cbnzw(tmp4, big);
  // Short string: less than 8 bytes.
  {
    Label loop, tiny;

    cmpw(len, 4);
    br(LT, tiny);
    // Use SIMD to do 4 bytes.
    ldrs(vtmp2, post(src, 4));
    zip1(vtmp3, T8B, vtmp2, vtmp1);
    subw(len, len, 4);
    strd(vtmp3, post(dst, 8));

    cbzw(len, done);

    // Do the remaining bytes one at a time.
    bind(loop);
    ldrb(tmp4, post(src, 1));
    strh(tmp4, post(dst, 2));
    subw(len, len, 1);

    bind(tiny);
    cbnz(len, loop);

    b(done);
  }

  if (SoftwarePrefetchHintDistance >= 0) {
    bind(to_stub);
      RuntimeAddress stub =  RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
      assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
      trampoline_call(stub);
      // Re-test tmp4 after the stub returns.
      b(after_init);
  }

  // Unpack the bytes 8 at a time.
  bind(big);
  {
    Label loop, around, loop_last, loop_start;

    if (SoftwarePrefetchHintDistance >= 0) {
      const int large_loop_threshold = (64 + 16)/8;
      ldrd(vtmp2, post(src, 8));
      // len now holds only the 0-7 leftover bytes.
      andw(len, len, 7);
      cmp(tmp4, (u1)large_loop_threshold);
      br(GE, to_stub);
      b(loop_start);

      bind(loop);
      ldrd(vtmp2, post(src, 8));
      bind(loop_start);
      subs(tmp4, tmp4, 1);
      br(EQ, loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      ldrd(vtmp3, post(src, 8));
      st1(vtmp2, T8H, post(dst, 16));
      subs(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp3, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      br(NE, loop);
      b(around);
      bind(loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      st1(vtmp2, T8H, post(dst, 16));
      bind(around);
      cbz(len, done);
    } else {
      andw(len, len, 7);
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      sub(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp2, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      cbnz(tmp4, loop);
    }
  }

  // Do the tail of up to 8 bytes.
  // The load and store below cover the 8 bytes ending at the end of the
  // input, so they may overlap data that has already been inflated.
  add(src, src, len);
  ldrd(vtmp3, Address(src, -8));
  add(dst, dst, len, ext::uxtw, 1);
  zip1(vtmp3, T16B, vtmp3, vtmp1);
  strq(vtmp3, Address(dst, -16));

  bind(done);
}

// Compress char[] array to byte[].
5833 void MacroAssembler::char_array_compress(Register src, Register dst, Register len, 5834 FloatRegister tmp1Reg, FloatRegister tmp2Reg, 5835 FloatRegister tmp3Reg, FloatRegister tmp4Reg, 5836 Register result) { 5837 encode_iso_array(src, dst, len, result, 5838 tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg); 5839 cmp(len, zr); 5840 csel(result, result, zr, EQ); 5841 } 5842 5843 // get_thread() can be called anywhere inside generated code so we 5844 // need to save whatever non-callee save context might get clobbered 5845 // by the call to JavaThread::aarch64_get_thread_helper() or, indeed, 5846 // the call setup code. 5847 // 5848 // aarch64_get_thread_helper() clobbers only r0, r1, and flags. 5849 // 5850 void MacroAssembler::get_thread(Register dst) { 5851 RegSet saved_regs = RegSet::range(r0, r1) + lr - dst; 5852 push(saved_regs, sp); 5853 5854 mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper)); 5855 blrt(lr, 1, 0, 1); 5856 if (dst != c_rarg0) { 5857 mov(dst, c_rarg0); 5858 } 5859 5860 pop(saved_regs, sp); 5861 }