1 /* 2 * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 
23 * 24 */ 25 26 #include <sys/types.h> 27 28 #include "precompiled.hpp" 29 #include "jvm.h" 30 #include "asm/assembler.hpp" 31 #include "asm/assembler.inline.hpp" 32 #include "gc/shared/barrierSet.hpp" 33 #include "gc/shared/cardTable.hpp" 34 #include "gc/shared/barrierSetAssembler.hpp" 35 #include "gc/shared/cardTableBarrierSet.hpp" 36 #include "interpreter/interpreter.hpp" 37 #include "compiler/disassembler.hpp" 38 #include "memory/resourceArea.hpp" 39 #include "nativeInst_aarch64.hpp" 40 #include "oops/accessDecorators.hpp" 41 #include "oops/compressedOops.inline.hpp" 42 #include "oops/klass.inline.hpp" 43 #include "runtime/biasedLocking.hpp" 44 #include "runtime/icache.hpp" 45 #include "runtime/interfaceSupport.inline.hpp" 46 #include "runtime/jniHandles.inline.hpp" 47 #include "runtime/sharedRuntime.hpp" 48 #include "runtime/thread.hpp" 49 #ifdef COMPILER1 50 #include "c1/c1_LIRAssembler.hpp" 51 #endif 52 #ifdef COMPILER2 53 #include "oops/oop.hpp" 54 #include "opto/compile.hpp" 55 #include "opto/intrinsicnode.hpp" 56 #include "opto/node.hpp" 57 #endif 58 59 #ifdef PRODUCT 60 #define BLOCK_COMMENT(str) /* nothing */ 61 #define STOP(error) stop(error) 62 #else 63 #define BLOCK_COMMENT(str) block_comment(str) 64 #define STOP(error) block_comment(error); stop(error) 65 #endif 66 67 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 68 69 // Patch any kind of instruction; there may be several instructions. 70 // Return the total length (in bytes) of the instructions. 
71 int MacroAssembler::pd_patch_instruction_size(address branch, address target) { 72 int instructions = 1; 73 assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant"); 74 long offset = (target - branch) >> 2; 75 unsigned insn = *(unsigned*)branch; 76 if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) { 77 // Load register (literal) 78 Instruction_aarch64::spatch(branch, 23, 5, offset); 79 } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) { 80 // Unconditional branch (immediate) 81 Instruction_aarch64::spatch(branch, 25, 0, offset); 82 } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) { 83 // Conditional branch (immediate) 84 Instruction_aarch64::spatch(branch, 23, 5, offset); 85 } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) { 86 // Compare & branch (immediate) 87 Instruction_aarch64::spatch(branch, 23, 5, offset); 88 } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) { 89 // Test & branch (immediate) 90 Instruction_aarch64::spatch(branch, 18, 5, offset); 91 } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) { 92 // PC-rel. addressing 93 offset = target-branch; 94 int shift = Instruction_aarch64::extract(insn, 31, 31); 95 if (shift) { 96 u_int64_t dest = (u_int64_t)target; 97 uint64_t pc_page = (uint64_t)branch >> 12; 98 uint64_t adr_page = (uint64_t)target >> 12; 99 unsigned offset_lo = dest & 0xfff; 100 offset = adr_page - pc_page; 101 102 // We handle 4 types of PC relative addressing 103 // 1 - adrp Rx, target_page 104 // ldr/str Ry, [Rx, #offset_in_page] 105 // 2 - adrp Rx, target_page 106 // add Ry, Rx, #offset_in_page 107 // 3 - adrp Rx, target_page (page aligned reloc, offset == 0) 108 // movk Rx, #imm16<<32 109 // 4 - adrp Rx, target_page (page aligned reloc, offset == 0) 110 // In the first 3 cases we must check that Rx is the same in the adrp and the 111 // subsequent ldr/str, add or movk instruction. 
Otherwise we could accidentally end 112 // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened 113 // to be followed by a random unrelated ldr/str, add or movk instruction. 114 // 115 unsigned insn2 = ((unsigned*)branch)[1]; 116 if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 && 117 Instruction_aarch64::extract(insn, 4, 0) == 118 Instruction_aarch64::extract(insn2, 9, 5)) { 119 // Load/store register (unsigned immediate) 120 unsigned size = Instruction_aarch64::extract(insn2, 31, 30); 121 Instruction_aarch64::patch(branch + sizeof (unsigned), 122 21, 10, offset_lo >> size); 123 guarantee(((dest >> size) << size) == dest, "misaligned target"); 124 instructions = 2; 125 } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 && 126 Instruction_aarch64::extract(insn, 4, 0) == 127 Instruction_aarch64::extract(insn2, 4, 0)) { 128 // add (immediate) 129 Instruction_aarch64::patch(branch + sizeof (unsigned), 130 21, 10, offset_lo); 131 instructions = 2; 132 } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 && 133 Instruction_aarch64::extract(insn, 4, 0) == 134 Instruction_aarch64::extract(insn2, 4, 0)) { 135 // movk #imm16<<32 136 Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32); 137 long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L); 138 long pc_page = (long)branch >> 12; 139 long adr_page = (long)dest >> 12; 140 offset = adr_page - pc_page; 141 instructions = 2; 142 } 143 } 144 int offset_lo = offset & 3; 145 offset >>= 2; 146 Instruction_aarch64::spatch(branch, 23, 5, offset); 147 Instruction_aarch64::patch(branch, 30, 29, offset_lo); 148 } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) { 149 u_int64_t dest = (u_int64_t)target; 150 // Move wide constant 151 assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch"); 152 assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch"); 153 
Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff); 154 Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff); 155 Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff); 156 assert(target_addr_for_insn(branch) == target, "should be"); 157 instructions = 3; 158 } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 && 159 Instruction_aarch64::extract(insn, 4, 0) == 0b11111) { 160 // nothing to do 161 assert(target == 0, "did not expect to relocate target for polling page load"); 162 } else { 163 ShouldNotReachHere(); 164 } 165 return instructions * NativeInstruction::instruction_size; 166 } 167 168 int MacroAssembler::patch_oop(address insn_addr, address o) { 169 int instructions; 170 unsigned insn = *(unsigned*)insn_addr; 171 assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch"); 172 173 // OOPs are either narrow (32 bits) or wide (48 bits). We encode 174 // narrow OOPs by setting the upper 16 bits in the first 175 // instruction. 176 if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) { 177 // Move narrow OOP 178 narrowOop n = CompressedOops::encode((oop)o); 179 Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16); 180 Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff); 181 instructions = 2; 182 } else { 183 // Move wide OOP 184 assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch"); 185 uintptr_t dest = (uintptr_t)o; 186 Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff); 187 Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff); 188 Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff); 189 instructions = 3; 190 } 191 return instructions * NativeInstruction::instruction_size; 192 } 193 194 int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) { 195 // Metatdata pointers are either narrow (32 bits) or wide (48 bits). 
196 // We encode narrow ones by setting the upper 16 bits in the first 197 // instruction. 198 NativeInstruction *insn = nativeInstruction_at(insn_addr); 199 assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 && 200 nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch"); 201 202 Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16); 203 Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff); 204 return 2 * NativeInstruction::instruction_size; 205 } 206 207 address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) { 208 long offset = 0; 209 if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) { 210 // Load register (literal) 211 offset = Instruction_aarch64::sextract(insn, 23, 5); 212 return address(((uint64_t)insn_addr + (offset << 2))); 213 } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) { 214 // Unconditional branch (immediate) 215 offset = Instruction_aarch64::sextract(insn, 25, 0); 216 } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) { 217 // Conditional branch (immediate) 218 offset = Instruction_aarch64::sextract(insn, 23, 5); 219 } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) { 220 // Compare & branch (immediate) 221 offset = Instruction_aarch64::sextract(insn, 23, 5); 222 } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) { 223 // Test & branch (immediate) 224 offset = Instruction_aarch64::sextract(insn, 18, 5); 225 } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) { 226 // PC-rel. addressing 227 offset = Instruction_aarch64::extract(insn, 30, 29); 228 offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2; 229 int shift = Instruction_aarch64::extract(insn, 31, 31) ? 
12 : 0; 230 if (shift) { 231 offset <<= shift; 232 uint64_t target_page = ((uint64_t)insn_addr) + offset; 233 target_page &= ((uint64_t)-1) << shift; 234 // Return the target address for the following sequences 235 // 1 - adrp Rx, target_page 236 // ldr/str Ry, [Rx, #offset_in_page] 237 // 2 - adrp Rx, target_page 238 // add Ry, Rx, #offset_in_page 239 // 3 - adrp Rx, target_page (page aligned reloc, offset == 0) 240 // movk Rx, #imm12<<32 241 // 4 - adrp Rx, target_page (page aligned reloc, offset == 0) 242 // 243 // In the first two cases we check that the register is the same and 244 // return the target_page + the offset within the page. 245 // Otherwise we assume it is a page aligned relocation and return 246 // the target page only. 247 // 248 unsigned insn2 = ((unsigned*)insn_addr)[1]; 249 if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 && 250 Instruction_aarch64::extract(insn, 4, 0) == 251 Instruction_aarch64::extract(insn2, 9, 5)) { 252 // Load/store register (unsigned immediate) 253 unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10); 254 unsigned int size = Instruction_aarch64::extract(insn2, 31, 30); 255 return address(target_page + (byte_offset << size)); 256 } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 && 257 Instruction_aarch64::extract(insn, 4, 0) == 258 Instruction_aarch64::extract(insn2, 4, 0)) { 259 // add (immediate) 260 unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10); 261 return address(target_page + byte_offset); 262 } else { 263 if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 && 264 Instruction_aarch64::extract(insn, 4, 0) == 265 Instruction_aarch64::extract(insn2, 4, 0)) { 266 target_page = (target_page & 0xffffffff) | 267 ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32); 268 } 269 return (address)target_page; 270 } 271 } else { 272 ShouldNotReachHere(); 273 } 274 } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) { 
275 u_int32_t *insns = (u_int32_t *)insn_addr; 276 // Move wide constant: movz, movk, movk. See movptr(). 277 assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch"); 278 assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch"); 279 return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5)) 280 + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16) 281 + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32)); 282 } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 && 283 Instruction_aarch64::extract(insn, 4, 0) == 0b11111) { 284 return 0; 285 } else { 286 ShouldNotReachHere(); 287 } 288 return address(((uint64_t)insn_addr + (offset << 2))); 289 } 290 291 void MacroAssembler::safepoint_poll(Label& slow_path) { 292 if (SafepointMechanism::uses_thread_local_poll()) { 293 ldr(rscratch1, Address(rthread, Thread::polling_page_offset())); 294 tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path); 295 } else { 296 unsigned long offset; 297 adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset); 298 ldrw(rscratch1, Address(rscratch1, offset)); 299 assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code"); 300 cbnz(rscratch1, slow_path); 301 } 302 } 303 304 // Just like safepoint_poll, but use an acquiring load for thread- 305 // local polling. 306 // 307 // We need an acquire here to ensure that any subsequent load of the 308 // global SafepointSynchronize::_state flag is ordered after this load 309 // of the local Thread::_polling page. We don't want this poll to 310 // return false (i.e. not safepointing) and a later poll of the global 311 // SafepointSynchronize::_state spuriously to return true. 312 // 313 // This is to avoid a race when we're in a native->Java transition 314 // racing the code which wakes up from a safepoint. 
315 // 316 void MacroAssembler::safepoint_poll_acquire(Label& slow_path) { 317 if (SafepointMechanism::uses_thread_local_poll()) { 318 lea(rscratch1, Address(rthread, Thread::polling_page_offset())); 319 ldar(rscratch1, rscratch1); 320 tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path); 321 } else { 322 safepoint_poll(slow_path); 323 } 324 } 325 326 void MacroAssembler::reset_last_Java_frame(bool clear_fp) { 327 // we must set sp to zero to clear frame 328 str(zr, Address(rthread, JavaThread::last_Java_sp_offset())); 329 330 // must clear fp, so that compiled frames are not confused; it is 331 // possible that we need it only for debugging 332 if (clear_fp) { 333 str(zr, Address(rthread, JavaThread::last_Java_fp_offset())); 334 } 335 336 // Always clear the pc because it could have been set by make_walkable() 337 str(zr, Address(rthread, JavaThread::last_Java_pc_offset())); 338 } 339 340 // Calls to C land 341 // 342 // When entering C land, the rfp, & resp of the last Java frame have to be recorded 343 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp 344 // has to be reset to 0. This is required to allow proper stack traversal. 
345 void MacroAssembler::set_last_Java_frame(Register last_java_sp, 346 Register last_java_fp, 347 Register last_java_pc, 348 Register scratch) { 349 350 if (last_java_pc->is_valid()) { 351 str(last_java_pc, Address(rthread, 352 JavaThread::frame_anchor_offset() 353 + JavaFrameAnchor::last_Java_pc_offset())); 354 } 355 356 // determine last_java_sp register 357 if (last_java_sp == sp) { 358 mov(scratch, sp); 359 last_java_sp = scratch; 360 } else if (!last_java_sp->is_valid()) { 361 last_java_sp = esp; 362 } 363 364 str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset())); 365 366 // last_java_fp is optional 367 if (last_java_fp->is_valid()) { 368 str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset())); 369 } 370 } 371 372 void MacroAssembler::set_last_Java_frame(Register last_java_sp, 373 Register last_java_fp, 374 address last_java_pc, 375 Register scratch) { 376 if (last_java_pc != NULL) { 377 adr(scratch, last_java_pc); 378 } else { 379 // FIXME: This is almost never correct. We should delete all 380 // cases of set_last_Java_frame with last_java_pc=NULL and use the 381 // correct return address instead. 
382 adr(scratch, pc()); 383 } 384 385 str(scratch, Address(rthread, 386 JavaThread::frame_anchor_offset() 387 + JavaFrameAnchor::last_Java_pc_offset())); 388 389 set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch); 390 } 391 392 void MacroAssembler::set_last_Java_frame(Register last_java_sp, 393 Register last_java_fp, 394 Label &L, 395 Register scratch) { 396 if (L.is_bound()) { 397 set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch); 398 } else { 399 InstructionMark im(this); 400 L.add_patch_at(code(), locator()); 401 set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch); 402 } 403 } 404 405 void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) { 406 assert(ReservedCodeCacheSize < 4*G, "branch out of range"); 407 assert(CodeCache::find_blob(entry.target()) != NULL, 408 "destination of far call not found in code cache"); 409 if (far_branches()) { 410 unsigned long offset; 411 // We can use ADRP here because we know that the total size of 412 // the code cache cannot exceed 2Gb. 413 adrp(tmp, entry, offset); 414 add(tmp, tmp, offset); 415 if (cbuf) cbuf->set_insts_mark(); 416 blr(tmp); 417 } else { 418 if (cbuf) cbuf->set_insts_mark(); 419 bl(entry); 420 } 421 } 422 423 void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) { 424 assert(ReservedCodeCacheSize < 4*G, "branch out of range"); 425 assert(CodeCache::find_blob(entry.target()) != NULL, 426 "destination of far call not found in code cache"); 427 if (far_branches()) { 428 unsigned long offset; 429 // We can use ADRP here because we know that the total size of 430 // the code cache cannot exceed 2Gb. 
431 adrp(tmp, entry, offset); 432 add(tmp, tmp, offset); 433 if (cbuf) cbuf->set_insts_mark(); 434 br(tmp); 435 } else { 436 if (cbuf) cbuf->set_insts_mark(); 437 b(entry); 438 } 439 } 440 441 void MacroAssembler::reserved_stack_check() { 442 // testing if reserved zone needs to be enabled 443 Label no_reserved_zone_enabling; 444 445 ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset())); 446 cmp(sp, rscratch1); 447 br(Assembler::LO, no_reserved_zone_enabling); 448 449 enter(); // LR and FP are live. 450 lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone)); 451 mov(c_rarg0, rthread); 452 blr(rscratch1); 453 leave(); 454 455 // We have already removed our own frame. 456 // throw_delayed_StackOverflowError will think that it's been 457 // called by our caller. 458 lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry())); 459 br(rscratch1); 460 should_not_reach_here(); 461 462 bind(no_reserved_zone_enabling); 463 } 464 465 int MacroAssembler::biased_locking_enter(Register lock_reg, 466 Register obj_reg, 467 Register swap_reg, 468 Register tmp_reg, 469 bool swap_reg_contains_mark, 470 Label& done, 471 Label* slow_case, 472 BiasedLockingCounters* counters) { 473 assert(UseBiasedLocking, "why call this otherwise?"); 474 assert_different_registers(lock_reg, obj_reg, swap_reg); 475 476 if (PrintBiasedLockingStatistics && counters == NULL) 477 counters = BiasedLocking::counters(); 478 479 assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg); 480 assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout"); 481 Address mark_addr (obj_reg, oopDesc::mark_offset_in_bytes()); 482 Address klass_addr (obj_reg, oopDesc::klass_offset_in_bytes()); 483 Address saved_mark_addr(lock_reg, 0); 484 485 // Biased locking 486 // See whether the lock is currently biased toward our 
thread and 487 // whether the epoch is still valid 488 // Note that the runtime guarantees sufficient alignment of JavaThread 489 // pointers to allow age to be placed into low bits 490 // First check to see whether biasing is even enabled for this object 491 Label cas_label; 492 int null_check_offset = -1; 493 if (!swap_reg_contains_mark) { 494 null_check_offset = offset(); 495 ldr(swap_reg, mark_addr); 496 } 497 andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place); 498 cmp(tmp_reg, (u1)markOopDesc::biased_lock_pattern); 499 br(Assembler::NE, cas_label); 500 // The bias pattern is present in the object's header. Need to check 501 // whether the bias owner and the epoch are both still current. 502 load_prototype_header(tmp_reg, obj_reg); 503 orr(tmp_reg, tmp_reg, rthread); 504 eor(tmp_reg, swap_reg, tmp_reg); 505 andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place)); 506 if (counters != NULL) { 507 Label around; 508 cbnz(tmp_reg, around); 509 atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2); 510 b(done); 511 bind(around); 512 } else { 513 cbz(tmp_reg, done); 514 } 515 516 Label try_revoke_bias; 517 Label try_rebias; 518 519 // At this point we know that the header has the bias pattern and 520 // that we are not the bias owner in the current epoch. We need to 521 // figure out more details about the state of the header in order to 522 // know what operations can be legally performed on the object's 523 // header. 524 525 // If the low three bits in the xor result aren't clear, that means 526 // the prototype header is no longer biased and we have to revoke 527 // the bias on this object. 528 andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place); 529 cbnz(rscratch1, try_revoke_bias); 530 531 // Biasing is still enabled for this data type. 
See whether the 532 // epoch of the current bias is still valid, meaning that the epoch 533 // bits of the mark word are equal to the epoch bits of the 534 // prototype header. (Note that the prototype header's epoch bits 535 // only change at a safepoint.) If not, attempt to rebias the object 536 // toward the current thread. Note that we must be absolutely sure 537 // that the current epoch is invalid in order to do this because 538 // otherwise the manipulations it performs on the mark word are 539 // illegal. 540 andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place); 541 cbnz(rscratch1, try_rebias); 542 543 // The epoch of the current bias is still valid but we know nothing 544 // about the owner; it might be set or it might be clear. Try to 545 // acquire the bias of the object using an atomic operation. If this 546 // fails we will go in to the runtime to revoke the object's bias. 547 // Note that we first construct the presumed unbiased header so we 548 // don't accidentally blow away another thread's valid bias. 549 { 550 Label here; 551 mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place); 552 andr(swap_reg, swap_reg, rscratch1); 553 orr(tmp_reg, swap_reg, rthread); 554 cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case); 555 // If the biasing toward our thread failed, this means that 556 // another thread succeeded in biasing it toward itself and we 557 // need to revoke that bias. The revocation will occur in the 558 // interpreter runtime in the slow case. 559 bind(here); 560 if (counters != NULL) { 561 atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()), 562 tmp_reg, rscratch1, rscratch2); 563 } 564 } 565 b(done); 566 567 bind(try_rebias); 568 // At this point we know the epoch has expired, meaning that the 569 // current "bias owner", if any, is actually invalid. 
Under these 570 // circumstances _only_, we are allowed to use the current header's 571 // value as the comparison value when doing the cas to acquire the 572 // bias in the current epoch. In other words, we allow transfer of 573 // the bias from one thread to another directly in this situation. 574 // 575 // FIXME: due to a lack of registers we currently blow away the age 576 // bits in this situation. Should attempt to preserve them. 577 { 578 Label here; 579 load_prototype_header(tmp_reg, obj_reg); 580 orr(tmp_reg, rthread, tmp_reg); 581 cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case); 582 // If the biasing toward our thread failed, then another thread 583 // succeeded in biasing it toward itself and we need to revoke that 584 // bias. The revocation will occur in the runtime in the slow case. 585 bind(here); 586 if (counters != NULL) { 587 atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()), 588 tmp_reg, rscratch1, rscratch2); 589 } 590 } 591 b(done); 592 593 bind(try_revoke_bias); 594 // The prototype mark in the klass doesn't have the bias bit set any 595 // more, indicating that objects of this data type are not supposed 596 // to be biased any more. We are going to try to reset the mark of 597 // this object to the prototype value and fall through to the 598 // CAS-based locking scheme. Note that if our CAS fails, it means 599 // that another thread raced us for the privilege of revoking the 600 // bias of this particular object, so it's okay to continue in the 601 // normal locking code. 602 // 603 // FIXME: due to a lack of registers we currently blow away the age 604 // bits in this situation. Should attempt to preserve them. 
605 { 606 Label here, nope; 607 load_prototype_header(tmp_reg, obj_reg); 608 cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope); 609 bind(here); 610 611 // Fall through to the normal CAS-based lock, because no matter what 612 // the result of the above CAS, some thread must have succeeded in 613 // removing the bias bit from the object's header. 614 if (counters != NULL) { 615 atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg, 616 rscratch1, rscratch2); 617 } 618 bind(nope); 619 } 620 621 bind(cas_label); 622 623 return null_check_offset; 624 } 625 626 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) { 627 assert(UseBiasedLocking, "why call this otherwise?"); 628 629 // Check for biased locking unlock case, which is a no-op 630 // Note: we do not have to check the thread ID for two reasons. 631 // First, the interpreter checks for IllegalMonitorStateException at 632 // a higher level. Second, if the bias was revoked while we held the 633 // lock, the object could not be rebiased toward another thread, so 634 // the bias bit would be clear. 
635 ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 636 andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 637 cmp(temp_reg, (u1)markOopDesc::biased_lock_pattern); 638 br(Assembler::EQ, done); 639 } 640 641 static void pass_arg0(MacroAssembler* masm, Register arg) { 642 if (c_rarg0 != arg ) { 643 masm->mov(c_rarg0, arg); 644 } 645 } 646 647 static void pass_arg1(MacroAssembler* masm, Register arg) { 648 if (c_rarg1 != arg ) { 649 masm->mov(c_rarg1, arg); 650 } 651 } 652 653 static void pass_arg2(MacroAssembler* masm, Register arg) { 654 if (c_rarg2 != arg ) { 655 masm->mov(c_rarg2, arg); 656 } 657 } 658 659 static void pass_arg3(MacroAssembler* masm, Register arg) { 660 if (c_rarg3 != arg ) { 661 masm->mov(c_rarg3, arg); 662 } 663 } 664 665 void MacroAssembler::call_VM_base(Register oop_result, 666 Register java_thread, 667 Register last_java_sp, 668 address entry_point, 669 int number_of_arguments, 670 bool check_exceptions) { 671 // determine java_thread register 672 if (!java_thread->is_valid()) { 673 java_thread = rthread; 674 } 675 676 // determine last_java_sp register 677 if (!last_java_sp->is_valid()) { 678 last_java_sp = esp; 679 } 680 681 // debugging support 682 assert(number_of_arguments >= 0 , "cannot have negative number of arguments"); 683 assert(java_thread == rthread, "unexpected register"); 684 #ifdef ASSERT 685 // TraceBytecodes does not use r12 but saves it over the call, so don't verify 686 // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?"); 687 #endif // ASSERT 688 689 assert(java_thread != oop_result , "cannot use the same register for java_thread & oop_result"); 690 assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp"); 691 692 // push java thread (becomes first argument of C function) 693 694 mov(c_rarg0, java_thread); 695 696 // set last Java frame before call 697 assert(last_java_sp 
!= rfp, "can't use rfp"); 698 699 Label l; 700 set_last_Java_frame(last_java_sp, rfp, l, rscratch1); 701 702 // do the call, remove parameters 703 MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l); 704 705 // reset last Java frame 706 // Only interpreter should have to clear fp 707 reset_last_Java_frame(true); 708 709 // C++ interp handles this in the interpreter 710 check_and_handle_popframe(java_thread); 711 check_and_handle_earlyret(java_thread); 712 713 if (check_exceptions) { 714 // check for pending exceptions (java_thread is set upon return) 715 ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset()))); 716 Label ok; 717 cbz(rscratch1, ok); 718 lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry())); 719 br(rscratch1); 720 bind(ok); 721 } 722 723 // get oop result if there is one and reset the value in the thread 724 if (oop_result->is_valid()) { 725 get_vm_result(oop_result, java_thread); 726 } 727 } 728 729 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) { 730 call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions); 731 } 732 733 // Maybe emit a call via a trampoline. If the code cache is small 734 // trampolines won't be emitted. 735 736 address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) { 737 assert(JavaThread::current()->is_Compiler_thread(), "just checking"); 738 assert(entry.rspec().type() == relocInfo::runtime_call_type 739 || entry.rspec().type() == relocInfo::opt_virtual_call_type 740 || entry.rspec().type() == relocInfo::static_call_type 741 || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type"); 742 743 // We need a trampoline if branches are far. 
744 if (far_branches()) { 745 bool in_scratch_emit_size = false; 746 #ifdef COMPILER2 747 // We don't want to emit a trampoline if C2 is generating dummy 748 // code during its branch shortening phase. 749 CompileTask* task = ciEnv::current()->task(); 750 in_scratch_emit_size = 751 (task != NULL && is_c2_compile(task->comp_level()) && 752 Compile::current()->in_scratch_emit_size()); 753 #endif 754 if (!in_scratch_emit_size) { 755 address stub = emit_trampoline_stub(offset(), entry.target()); 756 if (stub == NULL) { 757 return NULL; // CodeCache is full 758 } 759 } 760 } 761 762 if (cbuf) cbuf->set_insts_mark(); 763 relocate(entry.rspec()); 764 if (!far_branches()) { 765 bl(entry.target()); 766 } else { 767 bl(pc()); 768 } 769 // just need to return a non-null address 770 return pc(); 771 } 772 773 774 // Emit a trampoline stub for a call to a target which is too far away. 775 // 776 // code sequences: 777 // 778 // call-site: 779 // branch-and-link to <destination> or <trampoline stub> 780 // 781 // Related trampoline stub for this call site in the stub section: 782 // load the call target from the constant pool 783 // branch (LR still points to the call site above) 784 785 address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset, 786 address dest) { 787 // Max stub size: alignment nop, TrampolineStub. 788 address stub = start_a_stub(NativeInstruction::instruction_size 789 + NativeCallTrampolineStub::instruction_size); 790 if (stub == NULL) { 791 return NULL; // CodeBuffer::expand failed 792 } 793 794 // Create a trampoline stub relocation which relates this trampoline stub 795 // with the call instruction at insts_call_instruction_offset in the 796 // instructions code-section. 
align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call
  // - call
  Label target;
  ldr(rscratch1, target);   // pc-relative load of the 64-bit destination emitted below
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);  // the destination address, stored inline as data

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}

// Normalise a C-style boolean held in x to exactly 0 or 1 (in place).
void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
  tst(x, 0xff);
  cset(x, Assembler::NE);
}

// Emit an inline-cache call: load the non-oop sentinel word into rscratch2,
// then call entry through trampoline_call with a virtual-call relocation
// recorded at the current pc. Returns whatever trampoline_call returns
// (NULL on failure to emit a trampoline).
address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions
//
// Each overload moves its register arguments into the C argument registers
// (highest-numbered first, asserting that no still-needed source register is
// overwritten) and then delegates to call_VM_helper.

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  // Arguments are placed from the highest slot down; each assert guards a
  // source register that is still to be read against being overwritten.
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

// call_VM variants taking an explicit last_java_sp; these go straight to
// call_VM_base with rthread as the thread register.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


// Load the thread's vm_result field into oop_result, clear the field,
// and (when VerifyOops is on) verify the loaded value.
void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

// Load the thread's vm_result_2 field into metadata_result and clear the field.
void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

// Pad with nops until the current code offset is a multiple of modulus.
void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


// If *delayed_value_addr is already non-zero, return (value + offset) as a
// constant. Otherwise emit code that loads the value from that address into
// tmp at run time (adding offset if non-zero) and return tmp.
RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}


// Forward a notification to Assembler::notify. Both branches currently do
// the same thing; the bytecode_start branch only differs by the
// commented-out frame bookkeeping kept below.
void MacroAssembler:: notify(int type) {
  if (type == bytecode_start) {
    // set_last_Java_frame(esp, rfp, (address)NULL);
    Assembler:: notify(type);
    // reset_last_Java_frame(true);
  }
  else
    Assembler:: notify(type);
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
// Register usage: scan_temp is clobbered; recv_klass is modified when
// return_method is true; method_result is always overwritten (it is used as
// the scratch for each itable interface entry that is examined).
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  // scan_temp = recv_klass + vtable_length * 8 + vtable_base
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  // The loop body is emitted twice: the first (peeled) copy tests the first
  // entry and branches forward on a hit; the second copy is the actual
  // run-time loop, which branches back to 'search' on a miss and falls
  // through to found_method on a hit.
  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  if (return_method) {
    // scan_temp = 32-bit offset stored in the matching itableOffsetEntry;
    // the method is then loaded from (adjusted) recv_klass + that offset.
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}

// virtual method calling
//
// Load into method_result the vtable entry of recv_klass selected by
// vtable_index (a register or a constant). rscratch1 may be used to form
// the address in the constant-index case.
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result,
        form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
  }
}

// Full subtype check: branches to L_success if sub_klass is a subtype of
// super_klass, otherwise falls through. Runs the fast path and then the
// slow path (with no second temp register).
void MacroAssembler::check_klass_subtype(Register sub_klass,
                           Register super_klass,
                           Register temp_reg,
                           Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


// Fast part of the subtype check. At most one of the three labels may be
// NULL; a NULL label means "fall through" for that outcome. A constant
// super_check_offset of -1 means the offset must be loaded from super_klass
// into temp_reg. Uses rscratch1.
void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                        RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  // Any NULL label is redirected to the local fall-through label.
  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
  // (Emits nothing when the target is the fall-through label itself.)
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);  // load displayed supertype
  cmp(super_klass, rscratch1);       // ... and compare it with super_klass

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    // Offset only known at run time: on a miss, compare the offset with
    // sc_offset to decide between fast failure and the slow path.
    br(Assembler::EQ, *L_success);
    subs(zr, super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
1172 if (L_failure == &L_fallthrough) { 1173 br(Assembler::EQ, *L_success); 1174 } else { 1175 br(Assembler::NE, *L_failure); 1176 final_jmp(*L_success); 1177 } 1178 } 1179 1180 bind(L_fallthrough); 1181 1182 #undef final_jmp 1183 } 1184 1185 // These two are taken from x86, but they look generally useful 1186 1187 // scans count pointer sized words at [addr] for occurence of value, 1188 // generic 1189 void MacroAssembler::repne_scan(Register addr, Register value, Register count, 1190 Register scratch) { 1191 Label Lloop, Lexit; 1192 cbz(count, Lexit); 1193 bind(Lloop); 1194 ldr(scratch, post(addr, wordSize)); 1195 cmp(value, scratch); 1196 br(EQ, Lexit); 1197 sub(count, count, 1); 1198 cbnz(count, Lloop); 1199 bind(Lexit); 1200 } 1201 1202 // scans count 4 byte words at [addr] for occurence of value, 1203 // generic 1204 void MacroAssembler::repne_scanw(Register addr, Register value, Register count, 1205 Register scratch) { 1206 Label Lloop, Lexit; 1207 cbz(count, Lexit); 1208 bind(Lloop); 1209 ldrw(scratch, post(addr, wordSize)); 1210 cmpw(value, scratch); 1211 br(EQ, Lexit); 1212 sub(count, count, 1); 1213 cbnz(count, Lloop); 1214 bind(Lexit); 1215 } 1216 1217 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 1218 Register super_klass, 1219 Register temp_reg, 1220 Register temp2_reg, 1221 Label* L_success, 1222 Label* L_failure, 1223 bool set_cond_codes) { 1224 assert_different_registers(sub_klass, super_klass, temp_reg); 1225 if (temp2_reg != noreg) 1226 assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1); 1227 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg) 1228 1229 Label L_fallthrough; 1230 int label_nulls = 0; 1231 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 1232 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 1233 assert(label_nulls <= 1, "at most one NULL in the batch"); 1234 1235 // a couple of useful fields in sub_klass: 1236 int 
ss_offset = in_bytes(Klass::secondary_supers_offset()); 1237 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1238 Address secondary_supers_addr(sub_klass, ss_offset); 1239 Address super_cache_addr( sub_klass, sc_offset); 1240 1241 BLOCK_COMMENT("check_klass_subtype_slow_path"); 1242 1243 // Do a linear scan of the secondary super-klass chain. 1244 // This code is rarely used, so simplicity is a virtue here. 1245 // The repne_scan instruction uses fixed registers, which we must spill. 1246 // Don't worry too much about pre-existing connections with the input regs. 1247 1248 assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super) 1249 assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter) 1250 1251 RegSet pushed_registers; 1252 if (!IS_A_TEMP(r2)) pushed_registers += r2; 1253 if (!IS_A_TEMP(r5)) pushed_registers += r5; 1254 1255 if (super_klass != r0 || UseCompressedOops) { 1256 if (!IS_A_TEMP(r0)) pushed_registers += r0; 1257 } 1258 1259 push(pushed_registers, sp); 1260 1261 // Get super_klass value into r0 (even if it was in r5 or r2). 1262 if (super_klass != r0) { 1263 mov(r0, super_klass); 1264 } 1265 1266 #ifndef PRODUCT 1267 mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr); 1268 Address pst_counter_addr(rscratch2); 1269 ldr(rscratch1, pst_counter_addr); 1270 add(rscratch1, rscratch1, 1); 1271 str(rscratch1, pst_counter_addr); 1272 #endif //PRODUCT 1273 1274 // We will consult the secondary-super array. 1275 ldr(r5, secondary_supers_addr); 1276 // Load the array length. 1277 ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes())); 1278 // Skip to start of data. 1279 add(r5, r5, Array<Klass*>::base_offset_in_bytes()); 1280 1281 cmp(sp, zr); // Clear Z flag; SP is never zero 1282 // Scan R2 words at [R5] for an occurrence of R0. 1283 // Set NZ/Z based on last compare. 1284 repne_scan(r5, r0, r2, rscratch1); 1285 1286 // Unspill the temp. 
registers: 1287 pop(pushed_registers, sp); 1288 1289 br(Assembler::NE, *L_failure); 1290 1291 // Success. Cache the super we found and proceed in triumph. 1292 str(super_klass, super_cache_addr); 1293 1294 if (L_success != &L_fallthrough) { 1295 b(*L_success); 1296 } 1297 1298 #undef IS_A_TEMP 1299 1300 bind(L_fallthrough); 1301 } 1302 1303 1304 void MacroAssembler::verify_oop(Register reg, const char* s) { 1305 if (!VerifyOops) return; 1306 1307 // Pass register number to verify_oop_subroutine 1308 const char* b = NULL; 1309 { 1310 ResourceMark rm; 1311 stringStream ss; 1312 ss.print("verify_oop: %s: %s", reg->name(), s); 1313 b = code_string(ss.as_string()); 1314 } 1315 BLOCK_COMMENT("verify_oop {"); 1316 1317 stp(r0, rscratch1, Address(pre(sp, -2 * wordSize))); 1318 stp(rscratch2, lr, Address(pre(sp, -2 * wordSize))); 1319 1320 mov(r0, reg); 1321 mov(rscratch1, (address)b); 1322 1323 // call indirectly to solve generation ordering problem 1324 lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address())); 1325 ldr(rscratch2, Address(rscratch2)); 1326 blr(rscratch2); 1327 1328 ldp(rscratch2, lr, Address(post(sp, 2 * wordSize))); 1329 ldp(r0, rscratch1, Address(post(sp, 2 * wordSize))); 1330 1331 BLOCK_COMMENT("} verify_oop"); 1332 } 1333 1334 void MacroAssembler::verify_oop_addr(Address addr, const char* s) { 1335 if (!VerifyOops) return; 1336 1337 const char* b = NULL; 1338 { 1339 ResourceMark rm; 1340 stringStream ss; 1341 ss.print("verify_oop_addr: %s", s); 1342 b = code_string(ss.as_string()); 1343 } 1344 BLOCK_COMMENT("verify_oop_addr {"); 1345 1346 stp(r0, rscratch1, Address(pre(sp, -2 * wordSize))); 1347 stp(rscratch2, lr, Address(pre(sp, -2 * wordSize))); 1348 1349 // addr may contain sp so we will have to adjust it based on the 1350 // pushes that we just did. 
1351 if (addr.uses(sp)) { 1352 lea(r0, addr); 1353 ldr(r0, Address(r0, 4 * wordSize)); 1354 } else { 1355 ldr(r0, addr); 1356 } 1357 mov(rscratch1, (address)b); 1358 1359 // call indirectly to solve generation ordering problem 1360 lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address())); 1361 ldr(rscratch2, Address(rscratch2)); 1362 blr(rscratch2); 1363 1364 ldp(rscratch2, lr, Address(post(sp, 2 * wordSize))); 1365 ldp(r0, rscratch1, Address(post(sp, 2 * wordSize))); 1366 1367 BLOCK_COMMENT("} verify_oop_addr"); 1368 } 1369 1370 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot, 1371 int extra_slot_offset) { 1372 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 1373 int stackElementSize = Interpreter::stackElementSize; 1374 int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0); 1375 #ifdef ASSERT 1376 int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1); 1377 assert(offset1 - offset == stackElementSize, "correct arithmetic"); 1378 #endif 1379 if (arg_slot.is_constant()) { 1380 return Address(esp, arg_slot.as_constant() * stackElementSize 1381 + offset); 1382 } else { 1383 add(rscratch1, esp, arg_slot.as_register(), 1384 ext::uxtx, exact_log2(stackElementSize)); 1385 return Address(rscratch1, offset); 1386 } 1387 } 1388 1389 void MacroAssembler::call_VM_leaf_base(address entry_point, 1390 int number_of_arguments, 1391 Label *retaddr) { 1392 call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr); 1393 } 1394 1395 void MacroAssembler::call_VM_leaf_base1(address entry_point, 1396 int number_of_gp_arguments, 1397 int number_of_fp_arguments, 1398 ret_type type, 1399 Label *retaddr) { 1400 Label E, L; 1401 1402 stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize))); 1403 1404 // We add 1 to number_of_arguments because the thread in arg0 is 1405 // not counted 1406 mov(rscratch1, entry_point); 1407 blrt(rscratch1, number_of_gp_arguments + 1, 
number_of_fp_arguments, type); 1408 if (retaddr) 1409 bind(*retaddr); 1410 1411 ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize))); 1412 maybe_isb(); 1413 } 1414 1415 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) { 1416 call_VM_leaf_base(entry_point, number_of_arguments); 1417 } 1418 1419 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) { 1420 pass_arg0(this, arg_0); 1421 call_VM_leaf_base(entry_point, 1); 1422 } 1423 1424 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) { 1425 pass_arg0(this, arg_0); 1426 pass_arg1(this, arg_1); 1427 call_VM_leaf_base(entry_point, 2); 1428 } 1429 1430 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, 1431 Register arg_1, Register arg_2) { 1432 pass_arg0(this, arg_0); 1433 pass_arg1(this, arg_1); 1434 pass_arg2(this, arg_2); 1435 call_VM_leaf_base(entry_point, 3); 1436 } 1437 1438 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) { 1439 pass_arg0(this, arg_0); 1440 MacroAssembler::call_VM_leaf_base(entry_point, 1); 1441 } 1442 1443 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) { 1444 1445 assert(arg_0 != c_rarg1, "smashed arg"); 1446 pass_arg1(this, arg_1); 1447 pass_arg0(this, arg_0); 1448 MacroAssembler::call_VM_leaf_base(entry_point, 2); 1449 } 1450 1451 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) { 1452 assert(arg_0 != c_rarg2, "smashed arg"); 1453 assert(arg_1 != c_rarg2, "smashed arg"); 1454 pass_arg2(this, arg_2); 1455 assert(arg_0 != c_rarg1, "smashed arg"); 1456 pass_arg1(this, arg_1); 1457 pass_arg0(this, arg_0); 1458 MacroAssembler::call_VM_leaf_base(entry_point, 3); 1459 } 1460 1461 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) { 1462 assert(arg_0 != c_rarg3, "smashed arg"); 1463 
assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 4);
}

// Emit an explicit null check of reg if a later access at reg + offset
// cannot be relied on to fault (as decided by needs_explicit_null_check).
void MacroAssembler::null_check(Register reg, int offset) {
  if (needs_explicit_null_check(offset)) {
    // provoke OS NULL exception if reg = NULL by
    // accessing M[reg] w/o changing any registers
    // NOTE: this is plenty to provoke a segv
    ldr(zr, Address(reg));
  } else {
    // nothing to do, (later) access of M[reg + offset]
    // will provoke OS NULL exception if reg = NULL
  }
}

// MacroAssembler protected routines needed to implement
// public methods

// Load the target of dest into r with movptr, recording dest's relocation
// at the current pc first.
void MacroAssembler::mov(Register r, Address dest) {
  code_section()->relocate(pc(), dest.rspec());
  u_int64_t imm64 = (u_int64_t)dest.target();
  movptr(r, imm64);
}

// Move a constant pointer into r.  In AArch64 mode the virtual
// address space is 48 bits in size, so we only need three
// instructions to create a patchable instruction sequence that can
// reach anywhere.
//
// Always emits exactly movz + movk + movk (16 bits each, low to high);
// asserts that imm64 fits in 48 bits.
void MacroAssembler::movptr(Register r, uintptr_t imm64) {
#ifndef PRODUCT
  {
    // Annotate the generated code with the constant being loaded.
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
    block_comment(buffer);
  }
#endif
  assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
  movz(r, imm64 & 0xffff);
  imm64 >>= 16;
  movk(r, imm64 & 0xffff, 16);
  imm64 >>= 16;
  movk(r, imm64 & 0xffff, 32);
}

// Macro to mov replicated immediate to vector register.
//  Vd will get the following values for different arrangements in T
//   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
//   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
//   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
//   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
//   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
//   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
//   T1D/T2D: invalid
//
// The constant is built one non-zero byte at a time: either MOVI followed by
// ORRs on imm32, or MVNI followed by BICs on ~imm32, whichever needs fewer
// instructions.
void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
  assert(T != T1D && T != T2D, "invalid arrangement");
  if (T == T8B || T == T16B) {
    assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
    movi(Vd, T, imm32 & 0xff, 0);
    return;
  }
  u_int32_t nimm32 = ~imm32;
  if (T == T4H || T == T8H) {
    assert((imm32 & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
    imm32 &= 0xffff;
    nimm32 &= 0xffff;
  }
  // Count the non-zero bytes of the value and of its complement.
  u_int32_t x = imm32;
  int movi_cnt = 0;
  int movn_cnt = 0;
  while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
  x = nimm32;
  while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
  if (movn_cnt < movi_cnt) imm32 = nimm32;
  // Skip leading zero bytes, then emit the first non-zero byte.
  unsigned lsl = 0;
  while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
  if (movn_cnt < movi_cnt)
    mvni(Vd, T, imm32 & 0xff, lsl);
  else
    movi(Vd, T, imm32 & 0xff, lsl);
  imm32 >>= 8; lsl += 8;
  // Merge in each remaining non-zero byte.
  while (imm32) {
    while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
    if (movn_cnt < movi_cnt)
      bici(Vd, T, imm32 & 0xff, lsl);
    else
      orri(Vd, T, imm32 & 0xff, lsl);
    lsl += 8; imm32 >>= 8;
  }
}

// Load the 64-bit constant imm64 into dst. Uses a single ORR when the value
// is a valid logical immediate; otherwise picks a MOVZ- or MOVN-based
// sequence (plus MOVKs) according to how many of the four 16-bit halfwords
// are 0x0000 or 0xffff. The sequence length depends on the value.
void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
{
#ifndef PRODUCT
  {
    // Annotate the generated code with the constant being loaded.
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
    block_comment(buffer);
  }
#endif
  if (operand_valid_for_logical_immediate(false, imm64)) {
    orr(dst, zr, imm64);
  } else {
    // we can use a combination of MOVZ or MOVN with
    // MOVK to build up the constant
    u_int64_t imm_h[4];
    int zero_count = 0;
    int neg_count = 0;
    int i;
    // Split into halfwords and classify each one.
    for (i = 0; i < 4; i++) {
      imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
      if (imm_h[i] == 0) {
        zero_count++;
      } else if (imm_h[i] == 0xffffL) {
        neg_count++;
      }
    }
    if (zero_count == 4) {
      // one MOVZ will do
      movz(dst, 0);
    } else if (neg_count == 4) {
      // one MOVN will do
      movn(dst, 0);
    } else if (zero_count == 3) {
      // one (shifted) MOVZ of the single non-zero halfword will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0L) {
          movz(dst, (u_int32_t)imm_h[i], (i << 4));
          break;
        }
      }
    } else if (neg_count == 3) {
      // one MOVN will do
      for (int i = 0; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
          break;
        }
      }
    } else if (zero_count == 2) {
      // one MOVZ and one MOVK will do
      for (i = 0; i < 3; i++) {
        if (imm_h[i] != 0L) {
          movz(dst, (u_int32_t)imm_h[i], (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0L) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else if (neg_count == 2) {
      // one MOVN and one MOVK will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else if (zero_count == 1) {
      // one MOVZ and two MOVKs will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0L) {
          movz(dst, (u_int32_t)imm_h[i], (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0x0L) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else if (neg_count == 1) {
      // one MOVN and two MOVKs will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else {
      // use a MOVZ and 3 MOVKs (makes it easier to debug)
      movz(dst, (u_int32_t)imm_h[0], 0);
      for (i = 1; i < 4; i++) {
        movk(dst, (u_int32_t)imm_h[i], (i << 4));
      }
    }
  }
}

// Load the 32-bit constant imm32 into dst using 32-bit (w) instructions:
// a single ORR for a valid logical immediate, a single MOVZ/MOVN when one
// halfword is 0x0000/0xffff, otherwise MOVZ + MOVK.
void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
{
#ifndef PRODUCT
    {
      // Annotate the generated code with the constant being loaded.
      char buffer[64];
      snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
      block_comment(buffer);
    }
#endif
  if (operand_valid_for_logical_immediate(true, imm32)) {
    orrw(dst, zr, imm32);
  } else {
    // we can use MOVZ, MOVN or two calls to MOVK to build up the
    // constant
    u_int32_t imm_h[2];
    imm_h[0] = imm32 & 0xffff;
    imm_h[1] = ((imm32 >> 16) & 0xffff);
    if (imm_h[0] == 0) {
      movzw(dst, imm_h[1], 16);
    } else if (imm_h[0] == 0xffff) {
      movnw(dst, imm_h[1] ^ 0xffff, 16);
    } else if (imm_h[1] == 0) {
      movzw(dst, imm_h[0], 0);
    } else if (imm_h[1] == 0xffff) {
      movnw(dst, imm_h[0] ^ 0xffff, 0);
    } else {
      // use a MOVZ and MOVK (makes it easier to debug)
      movzw(dst, imm_h[0], 0);
      movkw(dst, imm_h[1], 16);
    }
  }
}

// Form an address from base + offset in Rd.  Rd may or may
// not actually be used: you must use the Address that is returned.
// It is up to you to ensure that the shift provided matches the size
// of your data.
Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
  if (Address::offset_ok_for_immed(byte_offset, shift))
    // It fits; no need for any heroics
    return Address(base, byte_offset);

  // Don't do anything clever with negative or misaligned offsets
  unsigned mask = (1 << shift) - 1;
  if (byte_offset < 0 || byte_offset & mask) {
    mov(Rd, byte_offset);
    add(Rd, base, Rd);
    return Address(Rd);
  }

  // See if we can do this with two 12-bit offsets
  {
    // Split the scaled offset into a high part (bits 12..23), added to the
    // base with one ADD, and a low remainder used as the load/store immediate.
    unsigned long word_offset = byte_offset >> shift;
    unsigned long masked_offset = word_offset & 0xfff000;
    if (Address::offset_ok_for_immed(word_offset - masked_offset)
        && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
      add(Rd, base, masked_offset << shift);
      word_offset -= masked_offset;
      return Address(Rd, word_offset << shift);
    }
  }

  // Do it the hard way
  mov(Rd, byte_offset);
  add(Rd, base, Rd);
  return Address(Rd);
}

// Atomically increment the 32-bit counter at [counter_addr]. Uses a single
// LDADD when UseLSE is set, otherwise a load-exclusive/store-exclusive retry
// loop. tmp is clobbered; tmp2 is clobbered on the non-LSE path.
void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
  if (UseLSE) {
    mov(tmp, 1);
    ldadd(Assembler::word, tmp, zr, counter_addr);
    return;
  }
  Label retry_load;
  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
    prfm(Address(counter_addr), PSTL1STRM);
  bind(retry_load);
  // flush and load exclusive from the memory location
  ldxrw(tmp, counter_addr);
  addw(tmp, tmp, 1);
  // if we store+flush with no intervening write tmp2 will be zero
  stxrw(tmp2, tmp, counter_addr);
  cbnzw(tmp2, retry_load);
}


int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
                                    bool want_remainder, Register scratch)
{
  // Full implementation of Java idiv and irem.  The function
  // returns the (pc) offset of the div instruction - may be needed
  // for implicit exceptions.
  //
  // constraint : ra/rb =/= scratch
  //         normal case
  //
  // input : ra: dividend
  //         rb: divisor
  //
  // result: either
  //         quotient  (= ra idiv rb)
  //         remainder (= ra irem rb)

  assert(ra != scratch && rb != scratch, "reg cannot be scratch");

  int idivl_offset = offset();
  if (! want_remainder) {
    sdivw(result, ra, rb);
  } else {
    // remainder = ra - (ra / rb) * rb
    sdivw(scratch, ra, rb);
    Assembler::msubw(result, scratch, rb, ra);
  }

  return idivl_offset;
}

int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
                                    bool want_remainder, Register scratch)
{
  // Full implementation of Java ldiv and lrem.  The function
  // returns the (pc) offset of the div instruction - may be needed
  // for implicit exceptions.
  //
  // constraint : ra/rb =/= scratch
  //         normal case
  //
  // input : ra: dividend
  //         rb: divisor
  //
  // result: either
  //         quotient  (= ra idiv rb)
  //         remainder (= ra irem rb)

  assert(ra != scratch && rb != scratch, "reg cannot be scratch");

  int idivq_offset = offset();
  if (! want_remainder) {
    sdiv(result, ra, rb);
  } else {
    // remainder = ra - (ra / rb) * rb
    sdiv(scratch, ra, rb);
    Assembler::msub(result, scratch, rb, ra);
  }

  return idivq_offset;
}

// Emit a memory barrier for order_constraint. If the instruction emitted
// immediately before this point is itself a barrier (as tracked by the code
// buffer's last_insn), no new instruction is emitted; the existing barrier
// is widened in place instead.
void MacroAssembler::membar(Membar_mask_bits order_constraint) {
  address prev = pc() - NativeMembar::instruction_size;
  address last = code()->last_insn();
  if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
    NativeMembar *bar = NativeMembar_at(prev);
    // We are merging two memory barrier instructions.  On AArch64 we
    // can do this simply by ORing them together.
    bar->set_kind(bar->get_kind() | order_constraint);
    BLOCK_COMMENT("merged membar");
  } else {
    code()->set_last_insn(pc());
    dmb(Assembler::barrier(order_constraint));
  }
}

// Try to fuse this load/store with the previously emitted one.
// Returns true if the merge was done (nothing more needs to be emitted for
// this access). Returns false if the caller must emit the access itself; in
// that case, for a suitably aligned base_plus_offset address, the current
// pc is recorded as last_insn so that a following access may merge with it.
bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
  if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
    merge_ldst(rt, adr, size_in_bytes, is_store);
    code()->clear_last_insn();
    return true;
  } else {
    assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported.");
    const unsigned mask = size_in_bytes - 1;
    if (adr.getMode() == Address::base_plus_offset &&
        (adr.offset() & mask) == 0) { // only supports base_plus_offset.
      code()->set_last_insn(pc());
    }
    return false;
  }
}

void MacroAssembler::ldr(Register Rx, const Address &adr) {
  // We always try to merge two adjacent loads into one ldp.
  if (!try_merge_ldst(Rx, adr, 8, false)) {
    Assembler::ldr(Rx, adr);
  }
}

void MacroAssembler::ldrw(Register Rw, const Address &adr) {
  // We always try to merge two adjacent loads into one ldp.
  if (!try_merge_ldst(Rw, adr, 4, false)) {
    Assembler::ldrw(Rw, adr);
  }
}

void MacroAssembler::str(Register Rx, const Address &adr) {
  // We always try to merge two adjacent stores into one stp.
  if (!try_merge_ldst(Rx, adr, 8, true)) {
    Assembler::str(Rx, adr);
  }
}

void MacroAssembler::strw(Register Rw, const Address &adr) {
  // We always try to merge two adjacent stores into one stp.
1876 if (!try_merge_ldst(Rw, adr, 4, true)) { 1877 Assembler::strw(Rw, adr); 1878 } 1879 } 1880 1881 // MacroAssembler routines found actually to be needed 1882 1883 void MacroAssembler::push(Register src) 1884 { 1885 str(src, Address(pre(esp, -1 * wordSize))); 1886 } 1887 1888 void MacroAssembler::pop(Register dst) 1889 { 1890 ldr(dst, Address(post(esp, 1 * wordSize))); 1891 } 1892 1893 // Note: load_unsigned_short used to be called load_unsigned_word. 1894 int MacroAssembler::load_unsigned_short(Register dst, Address src) { 1895 int off = offset(); 1896 ldrh(dst, src); 1897 return off; 1898 } 1899 1900 int MacroAssembler::load_unsigned_byte(Register dst, Address src) { 1901 int off = offset(); 1902 ldrb(dst, src); 1903 return off; 1904 } 1905 1906 int MacroAssembler::load_signed_short(Register dst, Address src) { 1907 int off = offset(); 1908 ldrsh(dst, src); 1909 return off; 1910 } 1911 1912 int MacroAssembler::load_signed_byte(Register dst, Address src) { 1913 int off = offset(); 1914 ldrsb(dst, src); 1915 return off; 1916 } 1917 1918 int MacroAssembler::load_signed_short32(Register dst, Address src) { 1919 int off = offset(); 1920 ldrshw(dst, src); 1921 return off; 1922 } 1923 1924 int MacroAssembler::load_signed_byte32(Register dst, Address src) { 1925 int off = offset(); 1926 ldrsbw(dst, src); 1927 return off; 1928 } 1929 1930 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) { 1931 switch (size_in_bytes) { 1932 case 8: ldr(dst, src); break; 1933 case 4: ldrw(dst, src); break; 1934 case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break; 1935 case 1: is_signed ? 
load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break; 1936 default: ShouldNotReachHere(); 1937 } 1938 } 1939 1940 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) { 1941 switch (size_in_bytes) { 1942 case 8: str(src, dst); break; 1943 case 4: strw(src, dst); break; 1944 case 2: strh(src, dst); break; 1945 case 1: strb(src, dst); break; 1946 default: ShouldNotReachHere(); 1947 } 1948 } 1949 1950 void MacroAssembler::decrementw(Register reg, int value) 1951 { 1952 if (value < 0) { incrementw(reg, -value); return; } 1953 if (value == 0) { return; } 1954 if (value < (1 << 12)) { subw(reg, reg, value); return; } 1955 /* else */ { 1956 guarantee(reg != rscratch2, "invalid dst for register decrement"); 1957 movw(rscratch2, (unsigned)value); 1958 subw(reg, reg, rscratch2); 1959 } 1960 } 1961 1962 void MacroAssembler::decrement(Register reg, int value) 1963 { 1964 if (value < 0) { increment(reg, -value); return; } 1965 if (value == 0) { return; } 1966 if (value < (1 << 12)) { sub(reg, reg, value); return; } 1967 /* else */ { 1968 assert(reg != rscratch2, "invalid dst for register decrement"); 1969 mov(rscratch2, (unsigned long)value); 1970 sub(reg, reg, rscratch2); 1971 } 1972 } 1973 1974 void MacroAssembler::decrementw(Address dst, int value) 1975 { 1976 assert(!dst.uses(rscratch1), "invalid dst for address decrement"); 1977 if (dst.getMode() == Address::literal) { 1978 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 1979 lea(rscratch2, dst); 1980 dst = Address(rscratch2); 1981 } 1982 ldrw(rscratch1, dst); 1983 decrementw(rscratch1, value); 1984 strw(rscratch1, dst); 1985 } 1986 1987 void MacroAssembler::decrement(Address dst, int value) 1988 { 1989 assert(!dst.uses(rscratch1), "invalid address for decrement"); 1990 if (dst.getMode() == Address::literal) { 1991 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 1992 lea(rscratch2, dst); 1993 dst = 
Address(rscratch2); 1994 } 1995 ldr(rscratch1, dst); 1996 decrement(rscratch1, value); 1997 str(rscratch1, dst); 1998 } 1999 2000 void MacroAssembler::incrementw(Register reg, int value) 2001 { 2002 if (value < 0) { decrementw(reg, -value); return; } 2003 if (value == 0) { return; } 2004 if (value < (1 << 12)) { addw(reg, reg, value); return; } 2005 /* else */ { 2006 assert(reg != rscratch2, "invalid dst for register increment"); 2007 movw(rscratch2, (unsigned)value); 2008 addw(reg, reg, rscratch2); 2009 } 2010 } 2011 2012 void MacroAssembler::increment(Register reg, int value) 2013 { 2014 if (value < 0) { decrement(reg, -value); return; } 2015 if (value == 0) { return; } 2016 if (value < (1 << 12)) { add(reg, reg, value); return; } 2017 /* else */ { 2018 assert(reg != rscratch2, "invalid dst for register increment"); 2019 movw(rscratch2, (unsigned)value); 2020 add(reg, reg, rscratch2); 2021 } 2022 } 2023 2024 void MacroAssembler::incrementw(Address dst, int value) 2025 { 2026 assert(!dst.uses(rscratch1), "invalid dst for address increment"); 2027 if (dst.getMode() == Address::literal) { 2028 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 2029 lea(rscratch2, dst); 2030 dst = Address(rscratch2); 2031 } 2032 ldrw(rscratch1, dst); 2033 incrementw(rscratch1, value); 2034 strw(rscratch1, dst); 2035 } 2036 2037 void MacroAssembler::increment(Address dst, int value) 2038 { 2039 assert(!dst.uses(rscratch1), "invalid dst for address increment"); 2040 if (dst.getMode() == Address::literal) { 2041 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 2042 lea(rscratch2, dst); 2043 dst = Address(rscratch2); 2044 } 2045 ldr(rscratch1, dst); 2046 increment(rscratch1, value); 2047 str(rscratch1, dst); 2048 } 2049 2050 2051 void MacroAssembler::pusha() { 2052 push(0x7fffffff, sp); 2053 } 2054 2055 void MacroAssembler::popa() { 2056 pop(0x7fffffff, sp); 2057 } 2058 2059 // Push lots of registers in the bit set supplied. 
Don't push sp. 2060 // Return the number of words pushed 2061 int MacroAssembler::push(unsigned int bitset, Register stack) { 2062 int words_pushed = 0; 2063 2064 // Scan bitset to accumulate register pairs 2065 unsigned char regs[32]; 2066 int count = 0; 2067 for (int reg = 0; reg <= 30; reg++) { 2068 if (1 & bitset) 2069 regs[count++] = reg; 2070 bitset >>= 1; 2071 } 2072 regs[count++] = zr->encoding_nocheck(); 2073 count &= ~1; // Only push an even nuber of regs 2074 2075 if (count) { 2076 stp(as_Register(regs[0]), as_Register(regs[1]), 2077 Address(pre(stack, -count * wordSize))); 2078 words_pushed += 2; 2079 } 2080 for (int i = 2; i < count; i += 2) { 2081 stp(as_Register(regs[i]), as_Register(regs[i+1]), 2082 Address(stack, i * wordSize)); 2083 words_pushed += 2; 2084 } 2085 2086 assert(words_pushed == count, "oops, pushed != count"); 2087 2088 return count; 2089 } 2090 2091 int MacroAssembler::pop(unsigned int bitset, Register stack) { 2092 int words_pushed = 0; 2093 2094 // Scan bitset to accumulate register pairs 2095 unsigned char regs[32]; 2096 int count = 0; 2097 for (int reg = 0; reg <= 30; reg++) { 2098 if (1 & bitset) 2099 regs[count++] = reg; 2100 bitset >>= 1; 2101 } 2102 regs[count++] = zr->encoding_nocheck(); 2103 count &= ~1; 2104 2105 for (int i = 2; i < count; i += 2) { 2106 ldp(as_Register(regs[i]), as_Register(regs[i+1]), 2107 Address(stack, i * wordSize)); 2108 words_pushed += 2; 2109 } 2110 if (count) { 2111 ldp(as_Register(regs[0]), as_Register(regs[1]), 2112 Address(post(stack, count * wordSize))); 2113 words_pushed += 2; 2114 } 2115 2116 assert(words_pushed == count, "oops, pushed != count"); 2117 2118 return count; 2119 } 2120 #ifdef ASSERT 2121 void MacroAssembler::verify_heapbase(const char* msg) { 2122 #if 0 2123 assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed"); 2124 assert (Universe::heap() != NULL, "java heap should be initialized"); 2125 if (CheckCompressedOops) { 2126 Label ok; 2127 push(1 << 
rscratch1->encoding(), sp); // cmpptr trashes rscratch1 2128 cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr())); 2129 br(Assembler::EQ, ok); 2130 stop(msg); 2131 bind(ok); 2132 pop(1 << rscratch1->encoding(), sp); 2133 } 2134 #endif 2135 } 2136 #endif 2137 2138 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) { 2139 Label done, not_weak; 2140 cbz(value, done); // Use NULL as-is. 2141 2142 STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u); 2143 tbz(r0, 0, not_weak); // Test for jweak tag. 2144 2145 // Resolve jweak. 2146 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value, 2147 Address(value, -JNIHandles::weak_tag_value), tmp, thread); 2148 verify_oop(value); 2149 b(done); 2150 2151 bind(not_weak); 2152 // Resolve (untagged) jobject. 2153 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread); 2154 verify_oop(value); 2155 bind(done); 2156 } 2157 2158 void MacroAssembler::stop(const char* msg) { 2159 address ip = pc(); 2160 pusha(); 2161 mov(c_rarg0, (address)msg); 2162 mov(c_rarg1, (address)ip); 2163 mov(c_rarg2, sp); 2164 mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 2165 // call(c_rarg3); 2166 blrt(c_rarg3, 3, 0, 1); 2167 hlt(0); 2168 } 2169 2170 void MacroAssembler::unimplemented(const char* what) { 2171 const char* buf = NULL; 2172 { 2173 ResourceMark rm; 2174 stringStream ss; 2175 ss.print("unimplemented: %s", what); 2176 buf = code_string(ss.as_string()); 2177 } 2178 stop(buf); 2179 } 2180 2181 // If a constant does not fit in an immediate field, generate some 2182 // number of MOV instructions and then perform the operation. 
2183 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm, 2184 add_sub_imm_insn insn1, 2185 add_sub_reg_insn insn2) { 2186 assert(Rd != zr, "Rd = zr and not setting flags?"); 2187 if (operand_valid_for_add_sub_immediate((int)imm)) { 2188 (this->*insn1)(Rd, Rn, imm); 2189 } else { 2190 if (uabs(imm) < (1 << 24)) { 2191 (this->*insn1)(Rd, Rn, imm & -(1 << 12)); 2192 (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1)); 2193 } else { 2194 assert_different_registers(Rd, Rn); 2195 mov(Rd, (uint64_t)imm); 2196 (this->*insn2)(Rd, Rn, Rd, LSL, 0); 2197 } 2198 } 2199 } 2200 2201 // Seperate vsn which sets the flags. Optimisations are more restricted 2202 // because we must set the flags correctly. 2203 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm, 2204 add_sub_imm_insn insn1, 2205 add_sub_reg_insn insn2) { 2206 if (operand_valid_for_add_sub_immediate((int)imm)) { 2207 (this->*insn1)(Rd, Rn, imm); 2208 } else { 2209 assert_different_registers(Rd, Rn); 2210 assert(Rd != zr, "overflow in immediate operand"); 2211 mov(Rd, (uint64_t)imm); 2212 (this->*insn2)(Rd, Rn, Rd, LSL, 0); 2213 } 2214 } 2215 2216 2217 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) { 2218 if (increment.is_register()) { 2219 add(Rd, Rn, increment.as_register()); 2220 } else { 2221 add(Rd, Rn, increment.as_constant()); 2222 } 2223 } 2224 2225 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) { 2226 if (increment.is_register()) { 2227 addw(Rd, Rn, increment.as_register()); 2228 } else { 2229 addw(Rd, Rn, increment.as_constant()); 2230 } 2231 } 2232 2233 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) { 2234 if (decrement.is_register()) { 2235 sub(Rd, Rn, decrement.as_register()); 2236 } else { 2237 sub(Rd, Rn, decrement.as_constant()); 2238 } 2239 } 2240 2241 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) { 2242 if 
(decrement.is_register()) { 2243 subw(Rd, Rn, decrement.as_register()); 2244 } else { 2245 subw(Rd, Rn, decrement.as_constant()); 2246 } 2247 } 2248 2249 void MacroAssembler::reinit_heapbase() 2250 { 2251 if (UseCompressedOops) { 2252 if (Universe::is_fully_initialized()) { 2253 mov(rheapbase, Universe::narrow_ptrs_base()); 2254 } else { 2255 lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr())); 2256 ldr(rheapbase, Address(rheapbase)); 2257 } 2258 } 2259 } 2260 2261 // this simulates the behaviour of the x86 cmpxchg instruction using a 2262 // load linked/store conditional pair. we use the acquire/release 2263 // versions of these instructions so that we flush pending writes as 2264 // per Java semantics. 2265 2266 // n.b the x86 version assumes the old value to be compared against is 2267 // in rax and updates rax with the value located in memory if the 2268 // cmpxchg fails. we supply a register for the old value explicitly 2269 2270 // the aarch64 load linked/store conditional instructions do not 2271 // accept an offset. so, unlike x86, we must provide a plain register 2272 // to identify the memory word to be compared/exchanged rather than a 2273 // register+offset Address. 
2274 2275 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp, 2276 Label &succeed, Label *fail) { 2277 // oldv holds comparison value 2278 // newv holds value to write in exchange 2279 // addr identifies memory word to compare against/update 2280 if (UseLSE) { 2281 mov(tmp, oldv); 2282 casal(Assembler::xword, oldv, newv, addr); 2283 cmp(tmp, oldv); 2284 br(Assembler::EQ, succeed); 2285 membar(AnyAny); 2286 } else { 2287 Label retry_load, nope; 2288 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) 2289 prfm(Address(addr), PSTL1STRM); 2290 bind(retry_load); 2291 // flush and load exclusive from the memory location 2292 // and fail if it is not what we expect 2293 ldaxr(tmp, addr); 2294 cmp(tmp, oldv); 2295 br(Assembler::NE, nope); 2296 // if we store+flush with no intervening write tmp wil be zero 2297 stlxr(tmp, newv, addr); 2298 cbzw(tmp, succeed); 2299 // retry so we only ever return after a load fails to compare 2300 // ensures we don't return a stale value after a failed write. 
2301 b(retry_load); 2302 // if the memory word differs we return it in oldv and signal a fail 2303 bind(nope); 2304 membar(AnyAny); 2305 mov(oldv, tmp); 2306 } 2307 if (fail) 2308 b(*fail); 2309 } 2310 2311 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp, 2312 Label &succeed, Label *fail) { 2313 assert(oopDesc::mark_offset_in_bytes() == 0, "assumption"); 2314 cmpxchgptr(oldv, newv, obj, tmp, succeed, fail); 2315 } 2316 2317 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp, 2318 Label &succeed, Label *fail) { 2319 // oldv holds comparison value 2320 // newv holds value to write in exchange 2321 // addr identifies memory word to compare against/update 2322 // tmp returns 0/1 for success/failure 2323 if (UseLSE) { 2324 mov(tmp, oldv); 2325 casal(Assembler::word, oldv, newv, addr); 2326 cmp(tmp, oldv); 2327 br(Assembler::EQ, succeed); 2328 membar(AnyAny); 2329 } else { 2330 Label retry_load, nope; 2331 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) 2332 prfm(Address(addr), PSTL1STRM); 2333 bind(retry_load); 2334 // flush and load exclusive from the memory location 2335 // and fail if it is not what we expect 2336 ldaxrw(tmp, addr); 2337 cmp(tmp, oldv); 2338 br(Assembler::NE, nope); 2339 // if we store+flush with no intervening write tmp wil be zero 2340 stlxrw(tmp, newv, addr); 2341 cbzw(tmp, succeed); 2342 // retry so we only ever return after a load fails to compare 2343 // ensures we don't return a stale value after a failed write. 2344 b(retry_load); 2345 // if the memory word differs we return it in oldv and signal a fail 2346 bind(nope); 2347 membar(AnyAny); 2348 mov(oldv, tmp); 2349 } 2350 if (fail) 2351 b(*fail); 2352 } 2353 2354 // A generic CAS; success or failure is in the EQ flag. A weak CAS 2355 // doesn't retry and may fail spuriously. If the oldval is wanted, 2356 // Pass a register for the result, otherwise pass noreg. 
2357 2358 // Clobbers rscratch1 2359 void MacroAssembler::cmpxchg(Register addr, Register expected, 2360 Register new_val, 2361 enum operand_size size, 2362 bool acquire, bool release, 2363 bool weak, 2364 Register result) { 2365 if (result == noreg) result = rscratch1; 2366 BLOCK_COMMENT("cmpxchg {"); 2367 if (UseLSE) { 2368 mov(result, expected); 2369 lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true); 2370 compare_eq(result, expected, size); 2371 } else { 2372 Label retry_load, done; 2373 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) 2374 prfm(Address(addr), PSTL1STRM); 2375 bind(retry_load); 2376 load_exclusive(result, addr, size, acquire); 2377 compare_eq(result, expected, size); 2378 br(Assembler::NE, done); 2379 store_exclusive(rscratch1, new_val, addr, size, release); 2380 if (weak) { 2381 cmpw(rscratch1, 0u); // If the store fails, return NE to our caller. 2382 } else { 2383 cbnzw(rscratch1, retry_load); 2384 } 2385 bind(done); 2386 } 2387 BLOCK_COMMENT("} cmpxchg"); 2388 } 2389 2390 // A generic comparison. Only compares for equality, clobbers rscratch1. 2391 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) { 2392 if (size == xword) { 2393 cmp(rm, rn); 2394 } else if (size == word) { 2395 cmpw(rm, rn); 2396 } else if (size == halfword) { 2397 eorw(rscratch1, rm, rn); 2398 ands(zr, rscratch1, 0xffff); 2399 } else if (size == byte) { 2400 eorw(rscratch1, rm, rn); 2401 ands(zr, rscratch1, 0xff); 2402 } else { 2403 ShouldNotReachHere(); 2404 } 2405 } 2406 2407 2408 static bool different(Register a, RegisterOrConstant b, Register c) { 2409 if (b.is_constant()) 2410 return a != c; 2411 else 2412 return a != b.as_register() && a != c && b.as_register() != c; 2413 } 2414 2415 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz) \ 2416 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \ 2417 if (UseLSE) { \ 2418 prev = prev->is_valid() ? 
prev : zr; \ 2419 if (incr.is_register()) { \ 2420 AOP(sz, incr.as_register(), prev, addr); \ 2421 } else { \ 2422 mov(rscratch2, incr.as_constant()); \ 2423 AOP(sz, rscratch2, prev, addr); \ 2424 } \ 2425 return; \ 2426 } \ 2427 Register result = rscratch2; \ 2428 if (prev->is_valid()) \ 2429 result = different(prev, incr, addr) ? prev : rscratch2; \ 2430 \ 2431 Label retry_load; \ 2432 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) \ 2433 prfm(Address(addr), PSTL1STRM); \ 2434 bind(retry_load); \ 2435 LDXR(result, addr); \ 2436 OP(rscratch1, result, incr); \ 2437 STXR(rscratch2, rscratch1, addr); \ 2438 cbnzw(rscratch2, retry_load); \ 2439 if (prev->is_valid() && prev != result) { \ 2440 IOP(prev, rscratch1, incr); \ 2441 } \ 2442 } 2443 2444 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword) 2445 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word) 2446 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword) 2447 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word) 2448 2449 #undef ATOMIC_OP 2450 2451 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz) \ 2452 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \ 2453 if (UseLSE) { \ 2454 prev = prev->is_valid() ? prev : zr; \ 2455 AOP(sz, newv, prev, addr); \ 2456 return; \ 2457 } \ 2458 Register result = rscratch2; \ 2459 if (prev->is_valid()) \ 2460 result = different(prev, newv, addr) ? 
prev : rscratch2; \ 2461 \ 2462 Label retry_load; \ 2463 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) \ 2464 prfm(Address(addr), PSTL1STRM); \ 2465 bind(retry_load); \ 2466 LDXR(result, addr); \ 2467 STXR(rscratch1, newv, addr); \ 2468 cbnzw(rscratch1, retry_load); \ 2469 if (prev->is_valid() && prev != result) \ 2470 mov(prev, result); \ 2471 } 2472 2473 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword) 2474 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word) 2475 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword) 2476 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word) 2477 2478 #undef ATOMIC_XCHG 2479 2480 #ifndef PRODUCT 2481 extern "C" void findpc(intptr_t x); 2482 #endif 2483 2484 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) 2485 { 2486 // In order to get locks to work, we need to fake a in_VM state 2487 if (ShowMessageBoxOnError ) { 2488 JavaThread* thread = JavaThread::current(); 2489 JavaThreadState saved_state = thread->thread_state(); 2490 thread->set_thread_state(_thread_in_vm); 2491 #ifndef PRODUCT 2492 if (CountBytecodes || TraceBytecodes || StopInterpreterAt) { 2493 ttyLocker ttyl; 2494 BytecodeCounter::print(); 2495 } 2496 #endif 2497 if (os::message_box(msg, "Execution stopped, print registers?")) { 2498 ttyLocker ttyl; 2499 tty->print_cr(" pc = 0x%016lx", pc); 2500 #ifndef PRODUCT 2501 tty->cr(); 2502 findpc(pc); 2503 tty->cr(); 2504 #endif 2505 tty->print_cr(" r0 = 0x%016lx", regs[0]); 2506 tty->print_cr(" r1 = 0x%016lx", regs[1]); 2507 tty->print_cr(" r2 = 0x%016lx", regs[2]); 2508 tty->print_cr(" r3 = 0x%016lx", regs[3]); 2509 tty->print_cr(" r4 = 0x%016lx", regs[4]); 2510 tty->print_cr(" r5 = 0x%016lx", regs[5]); 2511 tty->print_cr(" r6 = 0x%016lx", regs[6]); 2512 tty->print_cr(" r7 = 0x%016lx", regs[7]); 2513 tty->print_cr(" r8 = 0x%016lx", regs[8]); 2514 tty->print_cr(" r9 = 0x%016lx", regs[9]); 2515 tty->print_cr("r10 = 0x%016lx", regs[10]); 2516 tty->print_cr("r11 = 0x%016lx", 
regs[11]); 2517 tty->print_cr("r12 = 0x%016lx", regs[12]); 2518 tty->print_cr("r13 = 0x%016lx", regs[13]); 2519 tty->print_cr("r14 = 0x%016lx", regs[14]); 2520 tty->print_cr("r15 = 0x%016lx", regs[15]); 2521 tty->print_cr("r16 = 0x%016lx", regs[16]); 2522 tty->print_cr("r17 = 0x%016lx", regs[17]); 2523 tty->print_cr("r18 = 0x%016lx", regs[18]); 2524 tty->print_cr("r19 = 0x%016lx", regs[19]); 2525 tty->print_cr("r20 = 0x%016lx", regs[20]); 2526 tty->print_cr("r21 = 0x%016lx", regs[21]); 2527 tty->print_cr("r22 = 0x%016lx", regs[22]); 2528 tty->print_cr("r23 = 0x%016lx", regs[23]); 2529 tty->print_cr("r24 = 0x%016lx", regs[24]); 2530 tty->print_cr("r25 = 0x%016lx", regs[25]); 2531 tty->print_cr("r26 = 0x%016lx", regs[26]); 2532 tty->print_cr("r27 = 0x%016lx", regs[27]); 2533 tty->print_cr("r28 = 0x%016lx", regs[28]); 2534 tty->print_cr("r30 = 0x%016lx", regs[30]); 2535 tty->print_cr("r31 = 0x%016lx", regs[31]); 2536 BREAKPOINT; 2537 } 2538 ThreadStateTransition::transition(thread, _thread_in_vm, saved_state); 2539 } else { 2540 ttyLocker ttyl; 2541 ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", 2542 msg); 2543 assert(false, "DEBUG MESSAGE: %s", msg); 2544 } 2545 } 2546 2547 #ifdef BUILTIN_SIM 2548 // routine to generate an x86 prolog for a stub function which 2549 // bootstraps into the generated ARM code which directly follows the 2550 // stub 2551 // 2552 // the argument encodes the number of general and fp registers 2553 // passed by the caller and the callng convention (currently just 2554 // the number of general registers and assumes C argument passing) 2555 2556 extern "C" { 2557 int aarch64_stub_prolog_size(); 2558 void aarch64_stub_prolog(); 2559 void aarch64_prolog(); 2560 } 2561 2562 void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type, 2563 address *prolog_ptr) 2564 { 2565 int calltype = (((ret_type & 0x3) << 8) | 2566 ((fp_arg_count & 0xf) << 4) | 2567 (gp_arg_count & 0xf)); 2568 2569 // the 
addresses for the x86 to ARM entry code we need to use 2570 address start = pc(); 2571 // printf("start = %lx\n", start); 2572 int byteCount = aarch64_stub_prolog_size(); 2573 // printf("byteCount = %x\n", byteCount); 2574 int instructionCount = (byteCount + 3)/ 4; 2575 // printf("instructionCount = %x\n", instructionCount); 2576 for (int i = 0; i < instructionCount; i++) { 2577 nop(); 2578 } 2579 2580 memcpy(start, (void*)aarch64_stub_prolog, byteCount); 2581 2582 // write the address of the setup routine and the call format at the 2583 // end of into the copied code 2584 u_int64_t *patch_end = (u_int64_t *)(start + byteCount); 2585 if (prolog_ptr) 2586 patch_end[-2] = (u_int64_t)prolog_ptr; 2587 patch_end[-1] = calltype; 2588 } 2589 #endif 2590 2591 void MacroAssembler::push_call_clobbered_registers() { 2592 int step = 4 * wordSize; 2593 push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp); 2594 sub(sp, sp, step); 2595 mov(rscratch1, -step); 2596 // Push v0-v7, v16-v31. 2597 for (int i = 31; i>= 4; i -= 4) { 2598 if (i <= v7->encoding() || i >= v16->encoding()) 2599 st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1), 2600 as_FloatRegister(i), T1D, Address(post(sp, rscratch1))); 2601 } 2602 st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2), 2603 as_FloatRegister(3), T1D, Address(sp)); 2604 } 2605 2606 void MacroAssembler::pop_call_clobbered_registers() { 2607 for (int i = 0; i < 32; i += 4) { 2608 if (i <= v7->encoding() || i >= v16->encoding()) 2609 ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), 2610 as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize))); 2611 } 2612 2613 pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp); 2614 } 2615 2616 void MacroAssembler::push_CPU_state(bool save_vectors) { 2617 int step = (save_vectors ? 
8 : 4) * wordSize; 2618 push(0x3fffffff, sp); // integer registers except lr & sp 2619 mov(rscratch1, -step); 2620 sub(sp, sp, step); 2621 for (int i = 28; i >= 4; i -= 4) { 2622 st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), 2623 as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1))); 2624 } 2625 st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp); 2626 } 2627 2628 void MacroAssembler::pop_CPU_state(bool restore_vectors) { 2629 int step = (restore_vectors ? 8 : 4) * wordSize; 2630 for (int i = 0; i <= 28; i += 4) 2631 ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), 2632 as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step))); 2633 pop(0x3fffffff, sp); // integer registers except lr & sp 2634 } 2635 2636 /** 2637 * Helpers for multiply_to_len(). 2638 */ 2639 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo, 2640 Register src1, Register src2) { 2641 adds(dest_lo, dest_lo, src1); 2642 adc(dest_hi, dest_hi, zr); 2643 adds(dest_lo, dest_lo, src2); 2644 adc(final_dest_hi, dest_hi, zr); 2645 } 2646 2647 // Generate an address from (r + r1 extend offset). "size" is the 2648 // size of the operand. The result may be in rscratch2. 2649 Address MacroAssembler::offsetted_address(Register r, Register r1, 2650 Address::extend ext, int offset, int size) { 2651 if (offset || (ext.shift() % size != 0)) { 2652 lea(rscratch2, Address(r, r1, ext)); 2653 return Address(rscratch2, offset); 2654 } else { 2655 return Address(r, r1, ext); 2656 } 2657 } 2658 2659 Address MacroAssembler::spill_address(int size, int offset, Register tmp) 2660 { 2661 assert(offset >= 0, "spill to negative address?"); 2662 // Offset reachable ? 
2663 // Not aligned - 9 bits signed offset 2664 // Aligned - 12 bits unsigned offset shifted 2665 Register base = sp; 2666 if ((offset & (size-1)) && offset >= (1<<8)) { 2667 add(tmp, base, offset & ((1<<12)-1)); 2668 base = tmp; 2669 offset &= -1<<12; 2670 } 2671 2672 if (offset >= (1<<12) * size) { 2673 add(tmp, base, offset & (((1<<12)-1)<<12)); 2674 base = tmp; 2675 offset &= ~(((1<<12)-1)<<12); 2676 } 2677 2678 return Address(base, offset); 2679 } 2680 2681 // Checks whether offset is aligned. 2682 // Returns true if it is, else false. 2683 bool MacroAssembler::merge_alignment_check(Register base, 2684 size_t size, 2685 long cur_offset, 2686 long prev_offset) const { 2687 if (AvoidUnalignedAccesses) { 2688 if (base == sp) { 2689 // Checks whether low offset if aligned to pair of registers. 2690 long pair_mask = size * 2 - 1; 2691 long offset = prev_offset > cur_offset ? cur_offset : prev_offset; 2692 return (offset & pair_mask) == 0; 2693 } else { // If base is not sp, we can't guarantee the access is aligned. 2694 return false; 2695 } 2696 } else { 2697 long mask = size - 1; 2698 // Load/store pair instruction only supports element size aligned offset. 2699 return (cur_offset & mask) == 0 && (prev_offset & mask) == 0; 2700 } 2701 } 2702 2703 // Checks whether current and previous loads/stores can be merged. 2704 // Returns true if it can be merged, else false. 
2705 bool MacroAssembler::ldst_can_merge(Register rt, 2706 const Address &adr, 2707 size_t cur_size_in_bytes, 2708 bool is_store) const { 2709 address prev = pc() - NativeInstruction::instruction_size; 2710 address last = code()->last_insn(); 2711 2712 if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) { 2713 return false; 2714 } 2715 2716 if (adr.getMode() != Address::base_plus_offset || prev != last) { 2717 return false; 2718 } 2719 2720 NativeLdSt* prev_ldst = NativeLdSt_at(prev); 2721 size_t prev_size_in_bytes = prev_ldst->size_in_bytes(); 2722 2723 assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging."); 2724 assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging."); 2725 2726 if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) { 2727 return false; 2728 } 2729 2730 long max_offset = 63 * prev_size_in_bytes; 2731 long min_offset = -64 * prev_size_in_bytes; 2732 2733 assert(prev_ldst->is_not_pre_post_index(), "pre-index or post-index is not supported to be merged."); 2734 2735 // Only same base can be merged. 2736 if (adr.base() != prev_ldst->base()) { 2737 return false; 2738 } 2739 2740 long cur_offset = adr.offset(); 2741 long prev_offset = prev_ldst->offset(); 2742 size_t diff = abs(cur_offset - prev_offset); 2743 if (diff != prev_size_in_bytes) { 2744 return false; 2745 } 2746 2747 // Following cases can not be merged: 2748 // ldr x2, [x2, #8] 2749 // ldr x3, [x2, #16] 2750 // or: 2751 // ldr x2, [x3, #8] 2752 // ldr x2, [x3, #16] 2753 // If t1 and t2 is the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL. 2754 if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) { 2755 return false; 2756 } 2757 2758 long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset; 2759 // Offset range must be in ldp/stp instruction's range. 
2760 if (low_offset > max_offset || low_offset < min_offset) { 2761 return false; 2762 } 2763 2764 if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) { 2765 return true; 2766 } 2767 2768 return false; 2769 } 2770 2771 // Merge current load/store with previous load/store into ldp/stp. 2772 void MacroAssembler::merge_ldst(Register rt, 2773 const Address &adr, 2774 size_t cur_size_in_bytes, 2775 bool is_store) { 2776 2777 assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store) == true, "cur and prev must be able to be merged."); 2778 2779 Register rt_low, rt_high; 2780 address prev = pc() - NativeInstruction::instruction_size; 2781 NativeLdSt* prev_ldst = NativeLdSt_at(prev); 2782 2783 long offset; 2784 2785 if (adr.offset() < prev_ldst->offset()) { 2786 offset = adr.offset(); 2787 rt_low = rt; 2788 rt_high = prev_ldst->target(); 2789 } else { 2790 offset = prev_ldst->offset(); 2791 rt_low = prev_ldst->target(); 2792 rt_high = rt; 2793 } 2794 2795 Address adr_p = Address(prev_ldst->base(), offset); 2796 // Overwrite previous generated binary. 2797 code_section()->set_end(prev); 2798 2799 const int sz = prev_ldst->size_in_bytes(); 2800 assert(sz == 8 || sz == 4, "only supports 64/32bit merging."); 2801 if (!is_store) { 2802 BLOCK_COMMENT("merged ldr pair"); 2803 if (sz == 8) { 2804 ldp(rt_low, rt_high, adr_p); 2805 } else { 2806 ldpw(rt_low, rt_high, adr_p); 2807 } 2808 } else { 2809 BLOCK_COMMENT("merged str pair"); 2810 if (sz == 8) { 2811 stp(rt_low, rt_high, adr_p); 2812 } else { 2813 stpw(rt_low, rt_high, adr_p); 2814 } 2815 } 2816 } 2817 2818 /** 2819 * Multiply 64 bit by 64 bit first loop. 
*/
// Emits the first loop of the multiplyToLen intrinsic (see the pseudo code
// inside the function). Two 32-bit ints are handled per iteration as one
// 64-bit value; the loads and stores are rotated by 32 bits to swap the two
// int halves. rscratch1 is used as a temporary. idx, kdx, xstart, carry,
// product, x_xstart and y_idx are all written by the emitted code.
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
                                           Register y, Register y_idx, Register z,
                                           Register carry, Register product,
                                           Register idx, Register kdx) {
  //
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;
  //

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  // xstart -= 1; if that goes negative only a single int of x is left.
  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_one_x);

  lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
  ldr(x_xstart, Address(rscratch1));
  ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian

  bind(L_first_loop);
  // idx is decremented twice per iteration: exit when the first decrement
  // goes negative, take the single-int path when the second one does.
  subsw(idx, idx, 1);
  br(Assembler::MI, L_first_loop_exit);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_one_y);
  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(y_idx, Address(rscratch1));
  ror(y_idx, y_idx, 32); // convert big-endian to little-endian
  bind(L_multiply);

  // AArch64 has a multiply-accumulate instruction that we can't use
  // here because it has no way to process carries, so we have to use
  // separate add and adc instructions.  Bah.
  umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
  mul(product, x_xstart, y_idx);
  adds(product, product, carry);
  adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product

  subw(kdx, kdx, 2);
  ror(product, product, 32); // back to big-endian
  str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));

  b(L_first_loop);

  // Only one int of y remains: load it as a zero-extended 32-bit value.
  bind(L_one_y);
  ldrw(y_idx, Address(y,  0));
  b(L_multiply);

  // Only one int of x remains: load it as a zero-extended 32-bit value.
  bind(L_one_x);
  ldrw(x_xstart, Address(x,  0));
  b(L_first_loop);

  bind(L_first_loop_exit);
}

/**
 * Multiply 128 bit by 128. Unrolled inner loop.
 *
 * Emits the "third loop" of the multiplyToLen intrinsic (pseudo code below).
 * The main loop handles four ints (two 64-bit values) of y and z per
 * iteration; the code after it handles a remainder of two ints and then of
 * one int. rscratch1 and rscratch2 are used as temporaries.
 */
void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
                                             Register carry, Register carry2,
                                             Register idx, Register jdx,
                                             Register yz_idx1, Register yz_idx2,
                                             Register tmp, Register tmp3, Register tmp4,
                                             Register tmp6, Register product_hi) {

  //   jlong carry, x[], y[], z[];
  //   int kdx = ystart+1;
  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
  //     jlong carry2  = (jlong)(tmp3 >>> 64);
  //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
  //     carry  = (jlong)(tmp4 >>> 64);
  //     z[kdx+idx+1] = (jlong)tmp3;
  //     z[kdx+idx] = (jlong)tmp4;
  //   }
  //   idx += 2;
  //   if (idx > 0) {
  //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
  //     z[kdx+idx] = (jlong)yz_idx1;
  //     carry  = (jlong)(yz_idx1 >>> 64);
  //   }
  //

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;

  // jdx = idx / 4: number of iterations of the four-int main loop.
  lsrw(jdx, idx, 2);

  bind(L_third_loop);

  subsw(jdx, jdx, 1);
  br(Assembler::MI, L_third_loop_exit);
  subw(idx, idx, 4);

  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));

  ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));

  // tmp6 keeps the address of z[idx] for the store at the end of the iteration.
  lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
  ror(yz_idx2, yz_idx2, 32);

  ldp(rscratch2, rscratch1, Address(tmp6, 0));

  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
  umulh(tmp4, product_hi, yz_idx1);

  ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
  ror(rscratch2, rscratch2, 32);

  mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
  umulh(carry2, product_hi, yz_idx2);

  // propagate sum of both multiplications into carry:tmp4:tmp3
  adds(tmp3, tmp3, carry);
  adc(tmp4, tmp4, zr);
  adds(tmp3, tmp3, rscratch1);
  adcs(tmp4, tmp4, tmp);
  adc(carry, carry2, zr);
  adds(tmp4, tmp4, rscratch2);
  adc(carry, carry, zr);

  ror(tmp3, tmp3, 32); // convert little-endian to big-endian
  ror(tmp4, tmp4, 32);
  stp(tmp4, tmp3, Address(tmp6, 0));

  b(L_third_loop);
  bind (L_third_loop_exit);

  // Remainder: idx & 3 ints are still to be processed.
  andw (idx, idx, 0x3);
  cbz(idx, L_post_third_loop_done);

  Label L_check_1;
  // At least two ints left? Then process one 64-bit value.
  subsw(idx, idx, 2);
  br(Assembler::MI, L_check_1);

  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(yz_idx1, Address(rscratch1, 0));
  ror(yz_idx1, yz_idx1, 32);
  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
  umulh(tmp4, product_hi, yz_idx1);
  lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
  ldr(yz_idx2, Address(rscratch1, 0));
  ror(yz_idx2, yz_idx2, 32);

  add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);

  ror(tmp3, tmp3, 32);
  str(tmp3, Address(rscratch1, 0));

  bind (L_check_1);

  // One int left? Then process it with 32-bit loads and a 32-bit store.
  andw (idx, idx, 0x1);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_post_third_loop_done);
  ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
  umulh(carry2, tmp4, product_hi);
  ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  add2_with_carry(carry2, tmp3, tmp4, carry);

  strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
  // carry = bits 32..95 of carry2:tmp3
  extr(carry, carry2, tmp3, 32);

  bind(L_post_third_loop_done);
}

/**
 * Code for BigInteger::multiplyToLen() intrinsic.
 *
 * r0: x
 * r1: xlen
 * r2: y
 * r3: ylen
 * r4:  z
 * r5: zlen
 * r10: tmp1
 * r11: tmp2
 * r12: tmp3
 * r13: tmp4
 * r14: tmp5
 * r15: tmp6
 * r16: tmp7 (presumably the product_hi parameter - TODO confirm at call site)
 *
 * xlen and zlen are reused as scratch registers (product, x_xstart) once
 * their values have been copied. Four words are reserved on the stack around
 * each call of the third loop to save z, ylen, x and xstart.
 */
void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6, Register product_hi) {

  assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product  = xlen;
  const Register x_xstart = zlen;  // reuse register

  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;
  //

  movw(idx, ylen);      // idx = ylen;
  movw(kdx, zlen);      // kdx = xlen+ylen;
  mov(carry, zr);       // carry = 0;

  Label L_done;

  movw(xstart, xlen);
  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_done);

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);

  // Store the carry left over from the first loop: as two ints if at least
  // two slots remain below kdx, as one int if exactly one remains.
  Label L_second_loop;
  cbzw(kdx, L_second_loop);

  Label L_carry;
  subw(kdx, kdx, 1);
  cbzw(kdx, L_carry);

  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
  lsr(carry, carry, 32);
  subw(kdx, kdx, 1);

  bind(L_carry);
  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));

  // Second and third (nested) loops.
  //
  // for (int i = xstart-1; i >= 0; i--) { // Second loop
  //   carry = 0;
  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                    (z[k] & LONG_MASK) + carry;
  //     z[k] = (int)product;
  //     carry = product >>> 32;
  //   }
  //   z[i] = (int)carry;
  // }
  //
  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi

  const Register jdx = tmp1;

  bind(L_second_loop);
  mov(carry, zr);                // carry = 0;
  movw(jdx, ylen);               // j = ystart+1

  subsw(xstart, xstart, 1);      // i = xstart-1;
  br(Assembler::MI, L_done);

  // Reserve four stack words and save z in the lowest one.
  str(z, Address(pre(sp, -4 * wordSize)));

  Label L_last_x;
  lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
  subsw(xstart, xstart, 1);       // i = xstart-1;
  br(Assembler::MI, L_last_x);

  lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
  ldr(product_hi, Address(rscratch1));
  ror(product_hi, product_hi, 32);  // convert big-endian to little-endian

  Label L_third_loop_prologue;
  bind(L_third_loop_prologue);

  // Save ylen, x and xstart in the remaining reserved words; x and ylen are
  // passed to the third loop as scratch registers (carry2 and jdx).
  str(ylen, Address(sp, wordSize));
  stp(x, xstart, Address(sp, 2 * wordSize));
  multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
                          tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
  // Restore the saved values and release the four stack words.
  ldp(z, ylen, Address(post(sp, 2 * wordSize)));
  ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen

  addw(tmp3, xlen, 1);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  subsw(tmp3, tmp3, 1);
  br(Assembler::MI, L_done);

  lsr(carry, carry, 32);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  b(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  ldrw(product_hi, Address(x, 0));
  b(L_third_loop_prologue);

  bind(L_done);
}

// Code for BigInteger::mulAdd intrinsic
// out     = r0
// in      = r1
// offset  = r2  (already out.length-offset)
// len     = r3
// k       = r4
//
// pseudo code from java implementation:
// carry = 0;
// offset = out.length-offset - 1;
// for (int j=len-1; j >= 0; j--) {
//     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
//     out[offset--] = (int)product;
//     carry = product >>> 32;
// }
// return (int)carry;
//
// The carry is left in the 'out' register (zero when len is zero).
// in, offset and len are modified; rscratch1 and rscratch2 are used as
// temporaries.
void MacroAssembler::mul_add(Register out, Register in, Register offset,
      Register len, Register k) {
    Label LOOP, END;
    // pre-loop
    cmp(len, zr); // cmp, not cbz/cbnz: to use condition twice => less branches
    csel(out, zr, out, Assembler::EQ);
    br(Assembler::EQ, END);
    add(in, in, len, LSL, 2); // in[j+1] address
    add(offset, out, offset, LSL, 2); // out[offset + 1] address
    mov(out, zr); // used to keep carry now
    BIND(LOOP);
    // Both pointers are pre-decremented by one int before each access.
    ldrw(rscratch1, Address(pre(in, -4)));
    madd(rscratch1, rscratch1, k, out);
    ldrw(rscratch2, Address(pre(offset, -4)));
    add(rscratch1, rscratch1, rscratch2);
    strw(rscratch1, Address(offset));
    lsr(out, rscratch1, 32);
    subs(len, len, 1);
    br(Assembler::NE, LOOP);
    BIND(END);
}

/**
 * Emits code to update CRC-32 with a byte value according to constants in table
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]val       Register containing the byte to fold into the CRC.
 * @param [in]table     Register containing the table of crc constants.
3174 * 3175 * uint32_t crc; 3176 * val = crc_table[(val ^ crc) & 0xFF]; 3177 * crc = val ^ (crc >> 8); 3178 * 3179 */ 3180 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 3181 eor(val, val, crc); 3182 andr(val, val, 0xff); 3183 ldrw(val, Address(table, val, Address::lsl(2))); 3184 eor(crc, val, crc, Assembler::LSR, 8); 3185 } 3186 3187 /** 3188 * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3 3189 * 3190 * @param [in,out]crc Register containing the crc. 3191 * @param [in]v Register containing the 32-bit to fold into the CRC. 3192 * @param [in]table0 Register containing table 0 of crc constants. 3193 * @param [in]table1 Register containing table 1 of crc constants. 3194 * @param [in]table2 Register containing table 2 of crc constants. 3195 * @param [in]table3 Register containing table 3 of crc constants. 3196 * 3197 * uint32_t crc; 3198 * v = crc ^ v 3199 * crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24] 3200 * 3201 */ 3202 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp, 3203 Register table0, Register table1, Register table2, Register table3, 3204 bool upper) { 3205 eor(v, crc, v, upper ? LSR:LSL, upper ? 
32:0); 3206 uxtb(tmp, v); 3207 ldrw(crc, Address(table3, tmp, Address::lsl(2))); 3208 ubfx(tmp, v, 8, 8); 3209 ldrw(tmp, Address(table2, tmp, Address::lsl(2))); 3210 eor(crc, crc, tmp); 3211 ubfx(tmp, v, 16, 8); 3212 ldrw(tmp, Address(table1, tmp, Address::lsl(2))); 3213 eor(crc, crc, tmp); 3214 ubfx(tmp, v, 24, 8); 3215 ldrw(tmp, Address(table0, tmp, Address::lsl(2))); 3216 eor(crc, crc, tmp); 3217 } 3218 3219 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf, 3220 Register len, Register tmp0, Register tmp1, Register tmp2, 3221 Register tmp3) { 3222 Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit; 3223 assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3); 3224 3225 mvnw(crc, crc); 3226 3227 subs(len, len, 128); 3228 br(Assembler::GE, CRC_by64_pre); 3229 BIND(CRC_less64); 3230 adds(len, len, 128-32); 3231 br(Assembler::GE, CRC_by32_loop); 3232 BIND(CRC_less32); 3233 adds(len, len, 32-4); 3234 br(Assembler::GE, CRC_by4_loop); 3235 adds(len, len, 4); 3236 br(Assembler::GT, CRC_by1_loop); 3237 b(L_exit); 3238 3239 BIND(CRC_by32_loop); 3240 ldp(tmp0, tmp1, Address(post(buf, 16))); 3241 subs(len, len, 32); 3242 crc32x(crc, crc, tmp0); 3243 ldr(tmp2, Address(post(buf, 8))); 3244 crc32x(crc, crc, tmp1); 3245 ldr(tmp3, Address(post(buf, 8))); 3246 crc32x(crc, crc, tmp2); 3247 crc32x(crc, crc, tmp3); 3248 br(Assembler::GE, CRC_by32_loop); 3249 cmn(len, 32); 3250 br(Assembler::NE, CRC_less32); 3251 b(L_exit); 3252 3253 BIND(CRC_by4_loop); 3254 ldrw(tmp0, Address(post(buf, 4))); 3255 subs(len, len, 4); 3256 crc32w(crc, crc, tmp0); 3257 br(Assembler::GE, CRC_by4_loop); 3258 adds(len, len, 4); 3259 br(Assembler::LE, L_exit); 3260 BIND(CRC_by1_loop); 3261 ldrb(tmp0, Address(post(buf, 1))); 3262 subs(len, len, 1); 3263 crc32b(crc, crc, tmp0); 3264 br(Assembler::GT, CRC_by1_loop); 3265 b(L_exit); 3266 3267 BIND(CRC_by64_pre); 3268 sub(buf, buf, 8); 3269 ldp(tmp0, tmp1, Address(buf, 
8)); 3270 crc32x(crc, crc, tmp0); 3271 ldr(tmp2, Address(buf, 24)); 3272 crc32x(crc, crc, tmp1); 3273 ldr(tmp3, Address(buf, 32)); 3274 crc32x(crc, crc, tmp2); 3275 ldr(tmp0, Address(buf, 40)); 3276 crc32x(crc, crc, tmp3); 3277 ldr(tmp1, Address(buf, 48)); 3278 crc32x(crc, crc, tmp0); 3279 ldr(tmp2, Address(buf, 56)); 3280 crc32x(crc, crc, tmp1); 3281 ldr(tmp3, Address(pre(buf, 64))); 3282 3283 b(CRC_by64_loop); 3284 3285 align(CodeEntryAlignment); 3286 BIND(CRC_by64_loop); 3287 subs(len, len, 64); 3288 crc32x(crc, crc, tmp2); 3289 ldr(tmp0, Address(buf, 8)); 3290 crc32x(crc, crc, tmp3); 3291 ldr(tmp1, Address(buf, 16)); 3292 crc32x(crc, crc, tmp0); 3293 ldr(tmp2, Address(buf, 24)); 3294 crc32x(crc, crc, tmp1); 3295 ldr(tmp3, Address(buf, 32)); 3296 crc32x(crc, crc, tmp2); 3297 ldr(tmp0, Address(buf, 40)); 3298 crc32x(crc, crc, tmp3); 3299 ldr(tmp1, Address(buf, 48)); 3300 crc32x(crc, crc, tmp0); 3301 ldr(tmp2, Address(buf, 56)); 3302 crc32x(crc, crc, tmp1); 3303 ldr(tmp3, Address(pre(buf, 64))); 3304 br(Assembler::GE, CRC_by64_loop); 3305 3306 // post-loop 3307 crc32x(crc, crc, tmp2); 3308 crc32x(crc, crc, tmp3); 3309 3310 sub(len, len, 64); 3311 add(buf, buf, 8); 3312 cmn(len, 128); 3313 br(Assembler::NE, CRC_less64); 3314 BIND(L_exit); 3315 mvnw(crc, crc); 3316 } 3317 3318 /** 3319 * @param crc register containing existing CRC (32-bit) 3320 * @param buf register pointing to input byte buffer (byte*) 3321 * @param len register containing number of bytes 3322 * @param table register that will contain address of CRC table 3323 * @param tmp scratch register 3324 */ 3325 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, 3326 Register table0, Register table1, Register table2, Register table3, 3327 Register tmp, Register tmp2, Register tmp3) { 3328 Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit; 3329 unsigned long offset; 3330 3331 if (UseCRC32) { 3332 kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, 
table3); 3333 return; 3334 } 3335 3336 mvnw(crc, crc); 3337 3338 adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset); 3339 if (offset) add(table0, table0, offset); 3340 add(table1, table0, 1*256*sizeof(juint)); 3341 add(table2, table0, 2*256*sizeof(juint)); 3342 add(table3, table0, 3*256*sizeof(juint)); 3343 3344 if (UseNeon) { 3345 cmp(len, (u1)64); 3346 br(Assembler::LT, L_by16); 3347 eor(v16, T16B, v16, v16); 3348 3349 Label L_fold; 3350 3351 add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants 3352 3353 ld1(v0, v1, T2D, post(buf, 32)); 3354 ld1r(v4, T2D, post(tmp, 8)); 3355 ld1r(v5, T2D, post(tmp, 8)); 3356 ld1r(v6, T2D, post(tmp, 8)); 3357 ld1r(v7, T2D, post(tmp, 8)); 3358 mov(v16, T4S, 0, crc); 3359 3360 eor(v0, T16B, v0, v16); 3361 sub(len, len, 64); 3362 3363 BIND(L_fold); 3364 pmull(v22, T8H, v0, v5, T8B); 3365 pmull(v20, T8H, v0, v7, T8B); 3366 pmull(v23, T8H, v0, v4, T8B); 3367 pmull(v21, T8H, v0, v6, T8B); 3368 3369 pmull2(v18, T8H, v0, v5, T16B); 3370 pmull2(v16, T8H, v0, v7, T16B); 3371 pmull2(v19, T8H, v0, v4, T16B); 3372 pmull2(v17, T8H, v0, v6, T16B); 3373 3374 uzp1(v24, T8H, v20, v22); 3375 uzp2(v25, T8H, v20, v22); 3376 eor(v20, T16B, v24, v25); 3377 3378 uzp1(v26, T8H, v16, v18); 3379 uzp2(v27, T8H, v16, v18); 3380 eor(v16, T16B, v26, v27); 3381 3382 ushll2(v22, T4S, v20, T8H, 8); 3383 ushll(v20, T4S, v20, T4H, 8); 3384 3385 ushll2(v18, T4S, v16, T8H, 8); 3386 ushll(v16, T4S, v16, T4H, 8); 3387 3388 eor(v22, T16B, v23, v22); 3389 eor(v18, T16B, v19, v18); 3390 eor(v20, T16B, v21, v20); 3391 eor(v16, T16B, v17, v16); 3392 3393 uzp1(v17, T2D, v16, v20); 3394 uzp2(v21, T2D, v16, v20); 3395 eor(v17, T16B, v17, v21); 3396 3397 ushll2(v20, T2D, v17, T4S, 16); 3398 ushll(v16, T2D, v17, T2S, 16); 3399 3400 eor(v20, T16B, v20, v22); 3401 eor(v16, T16B, v16, v18); 3402 3403 uzp1(v17, T2D, v20, v16); 3404 uzp2(v21, T2D, v20, v16); 3405 eor(v28, T16B, v17, v21); 3406 3407 pmull(v22, T8H, v1, v5, T8B); 3408 pmull(v20, T8H, 
v1, v7, T8B); 3409 pmull(v23, T8H, v1, v4, T8B); 3410 pmull(v21, T8H, v1, v6, T8B); 3411 3412 pmull2(v18, T8H, v1, v5, T16B); 3413 pmull2(v16, T8H, v1, v7, T16B); 3414 pmull2(v19, T8H, v1, v4, T16B); 3415 pmull2(v17, T8H, v1, v6, T16B); 3416 3417 ld1(v0, v1, T2D, post(buf, 32)); 3418 3419 uzp1(v24, T8H, v20, v22); 3420 uzp2(v25, T8H, v20, v22); 3421 eor(v20, T16B, v24, v25); 3422 3423 uzp1(v26, T8H, v16, v18); 3424 uzp2(v27, T8H, v16, v18); 3425 eor(v16, T16B, v26, v27); 3426 3427 ushll2(v22, T4S, v20, T8H, 8); 3428 ushll(v20, T4S, v20, T4H, 8); 3429 3430 ushll2(v18, T4S, v16, T8H, 8); 3431 ushll(v16, T4S, v16, T4H, 8); 3432 3433 eor(v22, T16B, v23, v22); 3434 eor(v18, T16B, v19, v18); 3435 eor(v20, T16B, v21, v20); 3436 eor(v16, T16B, v17, v16); 3437 3438 uzp1(v17, T2D, v16, v20); 3439 uzp2(v21, T2D, v16, v20); 3440 eor(v16, T16B, v17, v21); 3441 3442 ushll2(v20, T2D, v16, T4S, 16); 3443 ushll(v16, T2D, v16, T2S, 16); 3444 3445 eor(v20, T16B, v22, v20); 3446 eor(v16, T16B, v16, v18); 3447 3448 uzp1(v17, T2D, v20, v16); 3449 uzp2(v21, T2D, v20, v16); 3450 eor(v20, T16B, v17, v21); 3451 3452 shl(v16, T2D, v28, 1); 3453 shl(v17, T2D, v20, 1); 3454 3455 eor(v0, T16B, v0, v16); 3456 eor(v1, T16B, v1, v17); 3457 3458 subs(len, len, 32); 3459 br(Assembler::GE, L_fold); 3460 3461 mov(crc, 0); 3462 mov(tmp, v0, T1D, 0); 3463 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3464 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3465 mov(tmp, v0, T1D, 1); 3466 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3467 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3468 mov(tmp, v1, T1D, 0); 3469 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3470 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3471 mov(tmp, v1, T1D, 1); 3472 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3473 update_word_crc32(crc, tmp, 
tmp2, table0, table1, table2, table3, true); 3474 3475 add(len, len, 32); 3476 } 3477 3478 BIND(L_by16); 3479 subs(len, len, 16); 3480 br(Assembler::GE, L_by16_loop); 3481 adds(len, len, 16-4); 3482 br(Assembler::GE, L_by4_loop); 3483 adds(len, len, 4); 3484 br(Assembler::GT, L_by1_loop); 3485 b(L_exit); 3486 3487 BIND(L_by4_loop); 3488 ldrw(tmp, Address(post(buf, 4))); 3489 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3); 3490 subs(len, len, 4); 3491 br(Assembler::GE, L_by4_loop); 3492 adds(len, len, 4); 3493 br(Assembler::LE, L_exit); 3494 BIND(L_by1_loop); 3495 subs(len, len, 1); 3496 ldrb(tmp, Address(post(buf, 1))); 3497 update_byte_crc32(crc, tmp, table0); 3498 br(Assembler::GT, L_by1_loop); 3499 b(L_exit); 3500 3501 align(CodeEntryAlignment); 3502 BIND(L_by16_loop); 3503 subs(len, len, 16); 3504 ldp(tmp, tmp3, Address(post(buf, 16))); 3505 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3506 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3507 update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false); 3508 update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true); 3509 br(Assembler::GE, L_by16_loop); 3510 adds(len, len, 16-4); 3511 br(Assembler::GE, L_by4_loop); 3512 adds(len, len, 4); 3513 br(Assembler::GT, L_by1_loop); 3514 BIND(L_exit); 3515 mvnw(crc, crc); 3516 } 3517 3518 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf, 3519 Register len, Register tmp0, Register tmp1, Register tmp2, 3520 Register tmp3) { 3521 Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit; 3522 assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3); 3523 3524 subs(len, len, 128); 3525 br(Assembler::GE, CRC_by64_pre); 3526 BIND(CRC_less64); 3527 adds(len, len, 128-32); 3528 br(Assembler::GE, CRC_by32_loop); 3529 BIND(CRC_less32); 3530 adds(len, len, 32-4); 3531 br(Assembler::GE, 
CRC_by4_loop); 3532 adds(len, len, 4); 3533 br(Assembler::GT, CRC_by1_loop); 3534 b(L_exit); 3535 3536 BIND(CRC_by32_loop); 3537 ldp(tmp0, tmp1, Address(post(buf, 16))); 3538 subs(len, len, 32); 3539 crc32cx(crc, crc, tmp0); 3540 ldr(tmp2, Address(post(buf, 8))); 3541 crc32cx(crc, crc, tmp1); 3542 ldr(tmp3, Address(post(buf, 8))); 3543 crc32cx(crc, crc, tmp2); 3544 crc32cx(crc, crc, tmp3); 3545 br(Assembler::GE, CRC_by32_loop); 3546 cmn(len, 32); 3547 br(Assembler::NE, CRC_less32); 3548 b(L_exit); 3549 3550 BIND(CRC_by4_loop); 3551 ldrw(tmp0, Address(post(buf, 4))); 3552 subs(len, len, 4); 3553 crc32cw(crc, crc, tmp0); 3554 br(Assembler::GE, CRC_by4_loop); 3555 adds(len, len, 4); 3556 br(Assembler::LE, L_exit); 3557 BIND(CRC_by1_loop); 3558 ldrb(tmp0, Address(post(buf, 1))); 3559 subs(len, len, 1); 3560 crc32cb(crc, crc, tmp0); 3561 br(Assembler::GT, CRC_by1_loop); 3562 b(L_exit); 3563 3564 BIND(CRC_by64_pre); 3565 sub(buf, buf, 8); 3566 ldp(tmp0, tmp1, Address(buf, 8)); 3567 crc32cx(crc, crc, tmp0); 3568 ldr(tmp2, Address(buf, 24)); 3569 crc32cx(crc, crc, tmp1); 3570 ldr(tmp3, Address(buf, 32)); 3571 crc32cx(crc, crc, tmp2); 3572 ldr(tmp0, Address(buf, 40)); 3573 crc32cx(crc, crc, tmp3); 3574 ldr(tmp1, Address(buf, 48)); 3575 crc32cx(crc, crc, tmp0); 3576 ldr(tmp2, Address(buf, 56)); 3577 crc32cx(crc, crc, tmp1); 3578 ldr(tmp3, Address(pre(buf, 64))); 3579 3580 b(CRC_by64_loop); 3581 3582 align(CodeEntryAlignment); 3583 BIND(CRC_by64_loop); 3584 subs(len, len, 64); 3585 crc32cx(crc, crc, tmp2); 3586 ldr(tmp0, Address(buf, 8)); 3587 crc32cx(crc, crc, tmp3); 3588 ldr(tmp1, Address(buf, 16)); 3589 crc32cx(crc, crc, tmp0); 3590 ldr(tmp2, Address(buf, 24)); 3591 crc32cx(crc, crc, tmp1); 3592 ldr(tmp3, Address(buf, 32)); 3593 crc32cx(crc, crc, tmp2); 3594 ldr(tmp0, Address(buf, 40)); 3595 crc32cx(crc, crc, tmp3); 3596 ldr(tmp1, Address(buf, 48)); 3597 crc32cx(crc, crc, tmp0); 3598 ldr(tmp2, Address(buf, 56)); 3599 crc32cx(crc, crc, tmp1); 3600 ldr(tmp3, Address(pre(buf, 
64))); 3601 br(Assembler::GE, CRC_by64_loop); 3602 3603 // post-loop 3604 crc32cx(crc, crc, tmp2); 3605 crc32cx(crc, crc, tmp3); 3606 3607 sub(len, len, 64); 3608 add(buf, buf, 8); 3609 cmn(len, 128); 3610 br(Assembler::NE, CRC_less64); 3611 BIND(L_exit); 3612 } 3613 3614 /** 3615 * @param crc register containing existing CRC (32-bit) 3616 * @param buf register pointing to input byte buffer (byte*) 3617 * @param len register containing number of bytes 3618 * @param table register that will contain address of CRC table 3619 * @param tmp scratch register 3620 */ 3621 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len, 3622 Register table0, Register table1, Register table2, Register table3, 3623 Register tmp, Register tmp2, Register tmp3) { 3624 kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3); 3625 } 3626 3627 3628 SkipIfEqual::SkipIfEqual( 3629 MacroAssembler* masm, const bool* flag_addr, bool value) { 3630 _masm = masm; 3631 unsigned long offset; 3632 _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset); 3633 _masm->ldrb(rscratch1, Address(rscratch1, offset)); 3634 _masm->cbzw(rscratch1, _label); 3635 } 3636 3637 SkipIfEqual::~SkipIfEqual() { 3638 _masm->bind(_label); 3639 } 3640 3641 void MacroAssembler::addptr(const Address &dst, int32_t src) { 3642 Address adr; 3643 switch(dst.getMode()) { 3644 case Address::base_plus_offset: 3645 // This is the expected mode, although we allow all the other 3646 // forms below. 
3647 adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord); 3648 break; 3649 default: 3650 lea(rscratch2, dst); 3651 adr = Address(rscratch2); 3652 break; 3653 } 3654 ldr(rscratch1, adr); 3655 add(rscratch1, rscratch1, src); 3656 str(rscratch1, adr); 3657 } 3658 3659 void MacroAssembler::cmpptr(Register src1, Address src2) { 3660 unsigned long offset; 3661 adrp(rscratch1, src2, offset); 3662 ldr(rscratch1, Address(rscratch1, offset)); 3663 cmp(src1, rscratch1); 3664 } 3665 3666 void MacroAssembler::cmpoop(Register obj1, Register obj2) { 3667 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3668 bs->obj_equals(this, obj1, obj2); 3669 } 3670 3671 void MacroAssembler::load_klass(Register dst, Register src) { 3672 if (UseCompressedClassPointers) { 3673 ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes())); 3674 decode_klass_not_null(dst); 3675 } else { 3676 ldr(dst, Address(src, oopDesc::klass_offset_in_bytes())); 3677 } 3678 } 3679 3680 // ((OopHandle)result).resolve(); 3681 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) { 3682 // OopHandle::resolve is an indirection. 
3683 access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg); 3684 } 3685 3686 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) { 3687 const int mirror_offset = in_bytes(Klass::java_mirror_offset()); 3688 ldr(dst, Address(rmethod, Method::const_offset())); 3689 ldr(dst, Address(dst, ConstMethod::constants_offset())); 3690 ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes())); 3691 ldr(dst, Address(dst, mirror_offset)); 3692 resolve_oop_handle(dst, tmp); 3693 } 3694 3695 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) { 3696 if (UseCompressedClassPointers) { 3697 ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes())); 3698 if (Universe::narrow_klass_base() == NULL) { 3699 cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift()); 3700 return; 3701 } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0 3702 && Universe::narrow_klass_shift() == 0) { 3703 // Only the bottom 32 bits matter 3704 cmpw(trial_klass, tmp); 3705 return; 3706 } 3707 decode_klass_not_null(tmp); 3708 } else { 3709 ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes())); 3710 } 3711 cmp(trial_klass, tmp); 3712 } 3713 3714 void MacroAssembler::load_prototype_header(Register dst, Register src) { 3715 load_klass(dst, src); 3716 ldr(dst, Address(dst, Klass::prototype_header_offset())); 3717 } 3718 3719 void MacroAssembler::store_klass(Register dst, Register src) { 3720 // FIXME: Should this be a store release? concurrent gcs assumes 3721 // klass length is valid if klass field is not null. 
3722 if (UseCompressedClassPointers) { 3723 encode_klass_not_null(src); 3724 strw(src, Address(dst, oopDesc::klass_offset_in_bytes())); 3725 } else { 3726 str(src, Address(dst, oopDesc::klass_offset_in_bytes())); 3727 } 3728 } 3729 3730 void MacroAssembler::store_klass_gap(Register dst, Register src) { 3731 if (UseCompressedClassPointers) { 3732 // Store to klass gap in destination 3733 strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes())); 3734 } 3735 } 3736 3737 // Algorithm must match CompressedOops::encode. 3738 void MacroAssembler::encode_heap_oop(Register d, Register s) { 3739 #ifdef ASSERT 3740 verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?"); 3741 #endif 3742 verify_oop(s, "broken oop in encode_heap_oop"); 3743 if (Universe::narrow_oop_base() == NULL) { 3744 if (Universe::narrow_oop_shift() != 0) { 3745 assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3746 lsr(d, s, LogMinObjAlignmentInBytes); 3747 } else { 3748 mov(d, s); 3749 } 3750 } else { 3751 subs(d, s, rheapbase); 3752 csel(d, d, zr, Assembler::HS); 3753 lsr(d, d, LogMinObjAlignmentInBytes); 3754 3755 /* Old algorithm: is this any worse? 
3756 Label nonnull; 3757 cbnz(r, nonnull); 3758 sub(r, r, rheapbase); 3759 bind(nonnull); 3760 lsr(r, r, LogMinObjAlignmentInBytes); 3761 */ 3762 } 3763 } 3764 3765 void MacroAssembler::encode_heap_oop_not_null(Register r) { 3766 #ifdef ASSERT 3767 verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?"); 3768 if (CheckCompressedOops) { 3769 Label ok; 3770 cbnz(r, ok); 3771 stop("null oop passed to encode_heap_oop_not_null"); 3772 bind(ok); 3773 } 3774 #endif 3775 verify_oop(r, "broken oop in encode_heap_oop_not_null"); 3776 if (Universe::narrow_oop_base() != NULL) { 3777 sub(r, r, rheapbase); 3778 } 3779 if (Universe::narrow_oop_shift() != 0) { 3780 assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3781 lsr(r, r, LogMinObjAlignmentInBytes); 3782 } 3783 } 3784 3785 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) { 3786 #ifdef ASSERT 3787 verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?"); 3788 if (CheckCompressedOops) { 3789 Label ok; 3790 cbnz(src, ok); 3791 stop("null oop passed to encode_heap_oop_not_null2"); 3792 bind(ok); 3793 } 3794 #endif 3795 verify_oop(src, "broken oop in encode_heap_oop_not_null2"); 3796 3797 Register data = src; 3798 if (Universe::narrow_oop_base() != NULL) { 3799 sub(dst, src, rheapbase); 3800 data = dst; 3801 } 3802 if (Universe::narrow_oop_shift() != 0) { 3803 assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3804 lsr(dst, data, LogMinObjAlignmentInBytes); 3805 data = dst; 3806 } 3807 if (data == src) 3808 mov(dst, src); 3809 } 3810 3811 void MacroAssembler::decode_heap_oop(Register d, Register s) { 3812 #ifdef ASSERT 3813 verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?"); 3814 #endif 3815 if (Universe::narrow_oop_base() == NULL) { 3816 if (Universe::narrow_oop_shift() != 0 || d != s) { 3817 lsl(d, s, Universe::narrow_oop_shift()); 3818 } 3819 } else { 
3820 Label done; 3821 if (d != s) 3822 mov(d, s); 3823 cbz(s, done); 3824 add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes); 3825 bind(done); 3826 } 3827 verify_oop(d, "broken oop in decode_heap_oop"); 3828 } 3829 3830 void MacroAssembler::decode_heap_oop_not_null(Register r) { 3831 assert (UseCompressedOops, "should only be used for compressed headers"); 3832 assert (Universe::heap() != NULL, "java heap should be initialized"); 3833 // Cannot assert, unverified entry point counts instructions (see .ad file) 3834 // vtableStubs also counts instructions in pd_code_size_limit. 3835 // Also do not verify_oop as this is called by verify_oop. 3836 if (Universe::narrow_oop_shift() != 0) { 3837 assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3838 if (Universe::narrow_oop_base() != NULL) { 3839 add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes); 3840 } else { 3841 add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes); 3842 } 3843 } else { 3844 assert (Universe::narrow_oop_base() == NULL, "sanity"); 3845 } 3846 } 3847 3848 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) { 3849 assert (UseCompressedOops, "should only be used for compressed headers"); 3850 assert (Universe::heap() != NULL, "java heap should be initialized"); 3851 // Cannot assert, unverified entry point counts instructions (see .ad file) 3852 // vtableStubs also counts instructions in pd_code_size_limit. 3853 // Also do not verify_oop as this is called by verify_oop. 
3854 if (Universe::narrow_oop_shift() != 0) { 3855 assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3856 if (Universe::narrow_oop_base() != NULL) { 3857 add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes); 3858 } else { 3859 add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes); 3860 } 3861 } else { 3862 assert (Universe::narrow_oop_base() == NULL, "sanity"); 3863 if (dst != src) { 3864 mov(dst, src); 3865 } 3866 } 3867 } 3868 3869 void MacroAssembler::encode_klass_not_null(Register dst, Register src) { 3870 if (Universe::narrow_klass_base() == NULL) { 3871 if (Universe::narrow_klass_shift() != 0) { 3872 assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); 3873 lsr(dst, src, LogKlassAlignmentInBytes); 3874 } else { 3875 if (dst != src) mov(dst, src); 3876 } 3877 return; 3878 } 3879 3880 if (use_XOR_for_compressed_class_base) { 3881 if (Universe::narrow_klass_shift() != 0) { 3882 eor(dst, src, (uint64_t)Universe::narrow_klass_base()); 3883 lsr(dst, dst, LogKlassAlignmentInBytes); 3884 } else { 3885 eor(dst, src, (uint64_t)Universe::narrow_klass_base()); 3886 } 3887 return; 3888 } 3889 3890 if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0 3891 && Universe::narrow_klass_shift() == 0) { 3892 movw(dst, src); 3893 return; 3894 } 3895 3896 #ifdef ASSERT 3897 verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?"); 3898 #endif 3899 3900 Register rbase = dst; 3901 if (dst == src) rbase = rheapbase; 3902 mov(rbase, (uint64_t)Universe::narrow_klass_base()); 3903 sub(dst, src, rbase); 3904 if (Universe::narrow_klass_shift() != 0) { 3905 assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); 3906 lsr(dst, dst, LogKlassAlignmentInBytes); 3907 } 3908 if (dst == src) reinit_heapbase(); 3909 } 3910 3911 void MacroAssembler::encode_klass_not_null(Register r) { 3912 encode_klass_not_null(r, r); 3913 } 3914 3915 void 
MacroAssembler::decode_klass_not_null(Register dst, Register src) { 3916 Register rbase = dst; 3917 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 3918 3919 if (Universe::narrow_klass_base() == NULL) { 3920 if (Universe::narrow_klass_shift() != 0) { 3921 assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); 3922 lsl(dst, src, LogKlassAlignmentInBytes); 3923 } else { 3924 if (dst != src) mov(dst, src); 3925 } 3926 return; 3927 } 3928 3929 if (use_XOR_for_compressed_class_base) { 3930 if (Universe::narrow_klass_shift() != 0) { 3931 lsl(dst, src, LogKlassAlignmentInBytes); 3932 eor(dst, dst, (uint64_t)Universe::narrow_klass_base()); 3933 } else { 3934 eor(dst, src, (uint64_t)Universe::narrow_klass_base()); 3935 } 3936 return; 3937 } 3938 3939 if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0 3940 && Universe::narrow_klass_shift() == 0) { 3941 if (dst != src) 3942 movw(dst, src); 3943 movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32); 3944 return; 3945 } 3946 3947 // Cannot assert, unverified entry point counts instructions (see .ad file) 3948 // vtableStubs also counts instructions in pd_code_size_limit. 3949 // Also do not verify_oop as this is called by verify_oop. 
3950 if (dst == src) rbase = rheapbase; 3951 mov(rbase, (uint64_t)Universe::narrow_klass_base()); 3952 if (Universe::narrow_klass_shift() != 0) { 3953 assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); 3954 add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes); 3955 } else { 3956 add(dst, rbase, src); 3957 } 3958 if (dst == src) reinit_heapbase(); 3959 } 3960 3961 void MacroAssembler::decode_klass_not_null(Register r) { 3962 decode_klass_not_null(r, r); 3963 } 3964 3965 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) { 3966 #ifdef ASSERT 3967 { 3968 ThreadInVMfromUnknown tiv; 3969 assert (UseCompressedOops, "should only be used for compressed oops"); 3970 assert (Universe::heap() != NULL, "java heap should be initialized"); 3971 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 3972 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop"); 3973 } 3974 #endif 3975 int oop_index = oop_recorder()->find_index(obj); 3976 InstructionMark im(this); 3977 RelocationHolder rspec = oop_Relocation::spec(oop_index); 3978 code_section()->relocate(inst_mark(), rspec); 3979 movz(dst, 0xDEAD, 16); 3980 movk(dst, 0xBEEF); 3981 } 3982 3983 void MacroAssembler::set_narrow_klass(Register dst, Klass* k) { 3984 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 3985 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 3986 int index = oop_recorder()->find_index(k); 3987 assert(! 
Universe::heap()->is_in_reserved(k), "should not be an oop"); 3988 3989 InstructionMark im(this); 3990 RelocationHolder rspec = metadata_Relocation::spec(index); 3991 code_section()->relocate(inst_mark(), rspec); 3992 narrowKlass nk = Klass::encode_klass(k); 3993 movz(dst, (nk >> 16), 16); 3994 movk(dst, nk & 0xffff); 3995 } 3996 3997 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, 3998 Register dst, Address src, 3999 Register tmp1, Register thread_tmp) { 4000 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 4001 decorators = AccessInternal::decorator_fixup(decorators); 4002 bool as_raw = (decorators & AS_RAW) != 0; 4003 if (as_raw) { 4004 bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp); 4005 } else { 4006 bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp); 4007 } 4008 } 4009 4010 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, 4011 Address dst, Register src, 4012 Register tmp1, Register thread_tmp) { 4013 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 4014 decorators = AccessInternal::decorator_fixup(decorators); 4015 bool as_raw = (decorators & AS_RAW) != 0; 4016 if (as_raw) { 4017 bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp); 4018 } else { 4019 bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp); 4020 } 4021 } 4022 4023 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) { 4024 // Use stronger ACCESS_WRITE|ACCESS_READ by default. 
4025 if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) { 4026 decorators |= ACCESS_READ | ACCESS_WRITE; 4027 } 4028 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 4029 return bs->resolve(this, decorators, obj); 4030 } 4031 4032 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1, 4033 Register thread_tmp, DecoratorSet decorators) { 4034 access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp); 4035 } 4036 4037 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1, 4038 Register thread_tmp, DecoratorSet decorators) { 4039 access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp); 4040 } 4041 4042 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1, 4043 Register thread_tmp, DecoratorSet decorators) { 4044 access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp); 4045 } 4046 4047 // Used for storing NULLs. 4048 void MacroAssembler::store_heap_oop_null(Address dst) { 4049 access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg); 4050 } 4051 4052 Address MacroAssembler::allocate_metadata_address(Metadata* obj) { 4053 assert(oop_recorder() != NULL, "this assembler needs a Recorder"); 4054 int index = oop_recorder()->allocate_metadata_index(obj); 4055 RelocationHolder rspec = metadata_Relocation::spec(index); 4056 return Address((address)obj, rspec); 4057 } 4058 4059 // Move an oop into a register. immediate is true if we want 4060 // immediate instrcutions, i.e. we are not going to patch this 4061 // instruction while the code is being executed by another thread. In 4062 // that case we can use move immediates rather than the constant pool. 
4063 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) { 4064 int oop_index; 4065 if (obj == NULL) { 4066 oop_index = oop_recorder()->allocate_oop_index(obj); 4067 } else { 4068 #ifdef ASSERT 4069 { 4070 ThreadInVMfromUnknown tiv; 4071 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop"); 4072 } 4073 #endif 4074 oop_index = oop_recorder()->find_index(obj); 4075 } 4076 RelocationHolder rspec = oop_Relocation::spec(oop_index); 4077 if (! immediate) { 4078 address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address 4079 ldr_constant(dst, Address(dummy, rspec)); 4080 } else 4081 mov(dst, Address((address)obj, rspec)); 4082 } 4083 4084 // Move a metadata address into a register. 4085 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) { 4086 int oop_index; 4087 if (obj == NULL) { 4088 oop_index = oop_recorder()->allocate_metadata_index(obj); 4089 } else { 4090 oop_index = oop_recorder()->find_index(obj); 4091 } 4092 RelocationHolder rspec = metadata_Relocation::spec(oop_index); 4093 mov(dst, Address((address)obj, rspec)); 4094 } 4095 4096 Address MacroAssembler::constant_oop_address(jobject obj) { 4097 #ifdef ASSERT 4098 { 4099 ThreadInVMfromUnknown tiv; 4100 assert(oop_recorder() != NULL, "this assembler needs an OopRecorder"); 4101 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop"); 4102 } 4103 #endif 4104 int oop_index = oop_recorder()->find_index(obj); 4105 return Address((address)obj, oop_Relocation::spec(oop_index)); 4106 } 4107 4108 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes. 
4109 void MacroAssembler::tlab_allocate(Register obj, 4110 Register var_size_in_bytes, 4111 int con_size_in_bytes, 4112 Register t1, 4113 Register t2, 4114 Label& slow_case) { 4115 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 4116 bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case); 4117 } 4118 4119 // Defines obj, preserves var_size_in_bytes 4120 void MacroAssembler::eden_allocate(Register obj, 4121 Register var_size_in_bytes, 4122 int con_size_in_bytes, 4123 Register t1, 4124 Label& slow_case) { 4125 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 4126 bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case); 4127 } 4128 4129 // Zero words; len is in bytes 4130 // Destroys all registers except addr 4131 // len must be a nonzero multiple of wordSize 4132 void MacroAssembler::zero_memory(Register addr, Register len, Register t1) { 4133 assert_different_registers(addr, len, t1, rscratch1, rscratch2); 4134 4135 #ifdef ASSERT 4136 { Label L; 4137 tst(len, BytesPerWord - 1); 4138 br(Assembler::EQ, L); 4139 stop("len is not a multiple of BytesPerWord"); 4140 bind(L); 4141 } 4142 #endif 4143 4144 #ifndef PRODUCT 4145 block_comment("zero memory"); 4146 #endif 4147 4148 Label loop; 4149 Label entry; 4150 4151 // Algorithm: 4152 // 4153 // scratch1 = cnt & 7; 4154 // cnt -= scratch1; 4155 // p += scratch1; 4156 // switch (scratch1) { 4157 // do { 4158 // cnt -= 8; 4159 // p[-8] = 0; 4160 // case 7: 4161 // p[-7] = 0; 4162 // case 6: 4163 // p[-6] = 0; 4164 // // ... 
4165 // case 1: 4166 // p[-1] = 0; 4167 // case 0: 4168 // p += 8; 4169 // } while (cnt); 4170 // } 4171 4172 const int unroll = 8; // Number of str(zr) instructions we'll unroll 4173 4174 lsr(len, len, LogBytesPerWord); 4175 andr(rscratch1, len, unroll - 1); // tmp1 = cnt % unroll 4176 sub(len, len, rscratch1); // cnt -= unroll 4177 // t1 always points to the end of the region we're about to zero 4178 add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord); 4179 adr(rscratch2, entry); 4180 sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2); 4181 br(rscratch2); 4182 bind(loop); 4183 sub(len, len, unroll); 4184 for (int i = -unroll; i < 0; i++) 4185 Assembler::str(zr, Address(t1, i * wordSize)); 4186 bind(entry); 4187 add(t1, t1, unroll * wordSize); 4188 cbnz(len, loop); 4189 } 4190 4191 void MacroAssembler::verify_tlab() { 4192 #ifdef ASSERT 4193 if (UseTLAB && VerifyOops) { 4194 Label next, ok; 4195 4196 stp(rscratch2, rscratch1, Address(pre(sp, -16))); 4197 4198 ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset()))); 4199 ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset()))); 4200 cmp(rscratch2, rscratch1); 4201 br(Assembler::HS, next); 4202 STOP("assert(top >= start)"); 4203 should_not_reach_here(); 4204 4205 bind(next); 4206 ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset()))); 4207 ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset()))); 4208 cmp(rscratch2, rscratch1); 4209 br(Assembler::HS, ok); 4210 STOP("assert(top <= end)"); 4211 should_not_reach_here(); 4212 4213 bind(ok); 4214 ldp(rscratch2, rscratch1, Address(post(sp, 16))); 4215 } 4216 #endif 4217 } 4218 4219 // Writes to stack successive pages until offset reached to check for 4220 // stack overflow + shadow pages. This clobbers tmp. 
4221 void MacroAssembler::bang_stack_size(Register size, Register tmp) { 4222 assert_different_registers(tmp, size, rscratch1); 4223 mov(tmp, sp); 4224 // Bang stack for total size given plus shadow page size. 4225 // Bang one page at a time because large size can bang beyond yellow and 4226 // red zones. 4227 Label loop; 4228 mov(rscratch1, os::vm_page_size()); 4229 bind(loop); 4230 lea(tmp, Address(tmp, -os::vm_page_size())); 4231 subsw(size, size, rscratch1); 4232 str(size, Address(tmp)); 4233 br(Assembler::GT, loop); 4234 4235 // Bang down shadow pages too. 4236 // At this point, (tmp-0) is the last address touched, so don't 4237 // touch it again. (It was touched as (tmp-pagesize) but then tmp 4238 // was post-decremented.) Skip this address by starting at i=1, and 4239 // touch a few more pages below. N.B. It is important to touch all 4240 // the way down to and including i=StackShadowPages. 4241 for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) { 4242 // this could be any sized move but this is can be a debugging crumb 4243 // so the bigger the better. 4244 lea(tmp, Address(tmp, -os::vm_page_size())); 4245 str(size, Address(tmp)); 4246 } 4247 } 4248 4249 4250 // Move the address of the polling page into dest. 4251 void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) { 4252 if (SafepointMechanism::uses_thread_local_poll()) { 4253 ldr(dest, Address(rthread, Thread::polling_page_offset())); 4254 } else { 4255 unsigned long off; 4256 adrp(dest, Address(page, rtype), off); 4257 assert(off == 0, "polling page must be page aligned"); 4258 } 4259 } 4260 4261 // Move the address of the polling page into r, then read the polling 4262 // page. 4263 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) { 4264 get_polling_page(r, page, rtype); 4265 return read_polling_page(r, rtype); 4266 } 4267 4268 // Read the polling page. 
// The address of the polling page must
// already be in r.
//
// Emits a 32-bit load from [r] into zr with a relocation of type rtype
// recorded at that instruction, and returns the marked instruction address.
address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
  InstructionMark im(this);
  code_section()->relocate(inst_mark(), rtype);
  ldrw(zr, Address(r, 0));
  return inst_mark();
}

// Load the 4K-page address of the literal address dest into reg1 and return
// the low 12 bits of the target in byte_offset, so that
// reg1 + byte_offset is the full target address.
//
// A plain adrp is used only when the target page lies within 2^20 pages of
// both ends of the code cache.  Otherwise adrp is aimed at an address made of
// the target's low 32 bits and bits 32..47 of the current pc, and a movk then
// overwrites bits 32..47 of reg1 with the target's.
void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
  // NOTE(review): rtype is computed but not used in the visible code.
  relocInfo::relocType rtype = dest.rspec().reloc()->type();
  // Page numbers (address >> 12) of the code cache bounds and of the target.
  unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
  unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
  unsigned long dest_page = (unsigned long)dest.target() >> 12;
  long offset_low = dest_page - low_page;
  long offset_high = dest_page - high_page;

  assert(is_valid_AArch64_address(dest.target()), "bad address");
  assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");

  InstructionMark im(this);
  code_section()->relocate(inst_mark(), dest.rspec());
  // 8143067: Ensure that the adrp can reach the dest from anywhere within
  // the code cache so that if it is relocated we know it will still reach
  if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
    _adrp(reg1, dest.target());
  } else {
    unsigned long target = (unsigned long)dest.target();
    unsigned long adrp_target
      = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);

    _adrp(reg1, (address)adrp_target);
    movk(reg1, target >> 32, 32);
  }
  byte_offset = (unsigned long)dest.target() & 0xfff;
}

// Load the card table's byte_map_base into reg.  The barrier set is cast to
// CardTableBarrierSet without a check, so this presumably must only be used
// with a card-table based collector -- confirm against callers.
void MacroAssembler::load_byte_map_base(Register reg) {
  jbyte *byte_map_base =
    ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();

  if (is_valid_AArch64_address((address)byte_map_base)) {
    // Strictly speaking the byte_map_base isn't an address at all,
    // and it might even be negative.
    unsigned long offset;
    adrp(reg, ExternalAddress((address)byte_map_base), offset);
    // We expect offset to be zero with most collectors.
    if (offset != 0) {
      add(reg, reg, offset);
    }
  } else {
    // Not representable as an address: materialize the raw 64-bit value.
    mov(reg, (uint64_t)byte_map_base);
  }
}

// Emit a frame prologue for a frame of framesize bytes (rfp/lr are stored in
// the top two words of the frame).
//   - small frames: drop sp once and store rfp/lr at an immediate offset;
//   - larger frames: push rfp/lr with pre-indexing, then drop sp by the
//     remainder, going through rscratch1 when the remainder is too large.
// The (1 << 9) and (1 << 12) thresholds presumably reflect the stp offset
// and add/sub immediate ranges -- confirm against the ISA reference.
void MacroAssembler::build_frame(int framesize) {
  assert(framesize > 0, "framesize must be > 0");
  if (framesize < ((1 << 9) + 2 * wordSize)) {
    sub(sp, sp, framesize);
    stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
    if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
  } else {
    stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
    if (PreserveFramePointer) mov(rfp, sp);
    if (framesize < ((1 << 12) + 2 * wordSize))
      sub(sp, sp, framesize - 2 * wordSize);
    else {
      mov(rscratch1, framesize - 2 * wordSize);
      sub(sp, sp, rscratch1);
    }
  }
}

// Emit the epilogue matching build_frame(framesize): reload rfp/lr and
// release the frame, using the same size thresholds as build_frame.
void MacroAssembler::remove_frame(int framesize) {
  assert(framesize > 0, "framesize must be > 0");
  if (framesize < ((1 << 9) + 2 * wordSize)) {
    ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
    add(sp, sp, framesize);
  } else {
    if (framesize < ((1 << 12) + 2 * wordSize))
      add(sp, sp, framesize - 2 * wordSize);
    else {
      mov(rscratch1, framesize - 2 * wordSize);
      add(sp, sp, rscratch1);
    }
    ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
  }
}

#ifdef COMPILER2
// Pointer to a MacroAssembler load instruction taking (Rt, address); used
// below to select byte/halfword/word loads by string encoding.
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

// Search for str1 in str2 and return index or -1
//
// Emits code that searches the source string (str2, length cnt2) for the
// pattern string (str1, length cnt1) and leaves the index of the first
// occurrence, or -1, in result.
//
//   icnt1  -1 selects the general code driven by the run-time value in cnt1;
//          1, 2, 3 and 4 select the specialised short-pattern loops below
//          (presumably the pattern length when it is a compile-time
//          constant -- confirm against the callers).
//   ae     a StrIntrinsicNode encoding; it selects byte or halfword
//          character loads for each of the two strings.
//
// str1, str2, cnt1 and cnt2 are overwritten on several paths (cnt1_neg and
// cnt2_neg alias cnt1 and cnt2).  tmp1..tmp6, rscratch1, rscratch2 and, on
// the general path, v0 are used as temporaries.  The Boyer-Moore path
// reserves a 256-byte table on the stack and releases it before leaving.
void MacroAssembler::string_indexof(Register str2, Register str1,
                                    Register cnt2, Register cnt1,
                                    Register tmp1, Register tmp2,
                                    Register tmp3, Register tmp4,
                                    Register tmp5, Register tmp6,
                                    int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  // Per-string character width: shift is log2(bytes per char), size is
  // bytes per char.
  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    // result_tmp = cnt2 - cnt1: the last source index at which a match can
    // start.
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

// The Boyer Moore algorithm is based on the description here:-
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with 2 shift rules. The 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:-
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has few java-specific optimizations.
//
// #define ASIZE 256
//
//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
//       int i, j;
//       unsigned c;
//       unsigned char bc[ASIZE];
//
//       /* Preprocessing */
//       for (i = 0; i < ASIZE; ++i)
//          bc[i] = m;
//       for (i = 0; i < m - 1; ) {
//          c = x[i];
//          ++i;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef PATTERN_STRING_IS_LATIN1
//          bc[c] = m - i;
//          #else
//          if (c < ASIZE) bc[c] = m - i;
//          #endif
//       }
//
//       /* Searching */
//       j = 0;
//       while (j <= n - m) {
//          c = y[i+j];
//          if (x[m-1] == c)
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//          if (i < 0) return j;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c< 256) always true. Remove branch
//          j += bc[y[j+m-1]];
//          #endif
//          #ifndef PATTERN_STRING_IS_UTF
//          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += 1
//          #endif
//          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
//          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += m
//          #endif
//       }
//    }
//
// NOTE(review): the "#ifndef PATTERN_STRING_IS_UTF" guard above is labelled
// as the UU case; it reads as if "#ifdef" was meant -- confirm.

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >=8, so, we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
    // UL case. We'll re-read last character in inner pre-loop code to have
    // single outer pre-loop load
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // amount of bytes stored per instruction
    // Reserve the ASIZE-byte skip table on the stack and fill every entry
    // with cnt1 (v0 was set to cnt1 replicated in each byte by the dup above).
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);

      sub(cnt1tmp, cnt1, 1);
      mov(tmp5, str2);                 // remember the start of the source
      add(str2end, str2, result_tmp, LSL, str2_chr_shift);
      sub(ch2, cnt1, 1);
      mov(tmp3, str1);
    // Record a skip distance for each of the first cnt1 - 1 pattern
    // characters (ch2 counts down from cnt1 - 1 to 1).
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        // Characters >= ASIZE have no table slot.
        subs(zr, ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);

      add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
      if (str1_isL == str2_isL) {
        // load last 8 bytes (8LL/4UU symbols)
        ldr(tmp6, Address(tmp6, -wordSize));
      } else {
        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
        // convert Latin1 to UTF. We'll have to wait until load completed, but
        // it's still faster than per-character loads+checks
        lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
        ubfx(ch1, tmp6, 8, 8); // str1[N-2]
        ubfx(ch2, tmp6, 16, 8); // str1[N-3]
        andr(tmp6, tmp6, 0xFF); // str1[N-4]
        orr(ch2, ch1, ch2, LSL, 16);
        orr(tmp6, tmp6, tmp3, LSL, 48);
        orr(tmp6, tmp6, ch2, LSL, 16);
      }
    // Outer loop: one iteration per candidate position in the source.
    BIND(BMLOOPSTR2);
      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
      if (str1_isL == str2_isL) {
        // re-init tmp3. It's for free because it's executed in parallel with
        // load above. Alternative is to initialize it before loop, but it'll
        // affect performance on in-order systems with 2 or more ld/st pipelines
        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
      }
      if (!isL) { // UU/UL case
        lsl(ch2, cnt1tmp, 1); // offset in bytes
      }
      // tmp3 holds the last pattern character; skipch the source character
      // aligned with it.
      cmp(tmp3, skipch);
      br(NE, BMSKIP);
      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
      mov(ch1, tmp6);
      if (isL) {
        b(BMLOOPSTR1_AFTER_LOAD);
      } else {
        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
        b(BMLOOPSTR1_CMP);
      }
    // Inner loop: compare the remaining characters one at a time, walking
    // cnt1tmp down towards the start of the pattern.
    BIND(BMLOOPSTR1);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
      cmp(ch1, ch2);
      br(EQ, BMLOOPSTR1);
    // Mismatch: work out how far to advance the source pointer.
    BIND(BMSKIP);
      if (!isL) {
        // if we've met UTF symbol while searching Latin1 pattern, then we can
        // skip cnt1 symbols
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
        subs(zr, skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
      add(str2, str2, result_tmp, LSL, str2_chr_shift);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      add(sp, sp, ASIZE);               // release the skip table
      b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
      cmp(ch1, ch2);
      br(NE, BMSKIP);
    BIND(BMMATCH);
      // Index = (current position - start of source), converted from bytes
      // to characters when the source uses 2-byte characters.
      sub(result, str2, tmp5);
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE);               // release the skip table
      b(DONE);

    // Patterns not handled by the code above: lengths below 16 go to the
    // inline linear search, the rest to an out-of-line stub chosen by
    // encoding.
    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = NULL;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
    }
    trampoline_call(stub);
    b(DONE);
  }

  // Linear search.  The loops below address both strings from their ends
  // using negative offsets (cnt1_neg / cnt2_neg) that count up towards zero.
  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

        cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
        br(LT, DOSHORT);
      BIND(LINEAR_MEDIUM);
        (this->*str1_load_1chr)(first, Address(str1));
        lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
        sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      // Scan the source for the first pattern character.
      BIND(FIRST_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmp(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      // First character matched: compare the rest of the pattern.
      BIND(STR1_LOOP);
        adds(cnt1tmp, cnt1_neg, str1_chr_size);
        add(cnt2tmp, cnt2_neg, str2_chr_size);
        br(GE, MATCH);

      BIND(STR1_NEXT);
        (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        adds(cnt1tmp, cnt1tmp, str1_chr_size);
        add(cnt2tmp, cnt2tmp, str2_chr_size);
        br(LT, STR1_NEXT);
        b(MATCH);

      // Short patterns: dispatch on cnt1 to the 1-, 2- or 3-character code
      // (cnt1 == 2 falls through into DO2 below).
      BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    // Constant pattern length 4: compare all four characters with one load.
    if (icnt1 == 4) {
      Label CH1_LOOP;

        (this->*load_4chr)(ch1, str1);
        sub(result_tmp, cnt2, 4);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(CH1_LOOP);
        (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
      }

    // Pattern length 2: compare both characters with one load.
    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      Label CH1_LOOP;

      BIND(DO2);
        (this->*load_2chr)(ch1, str1);
        if (icnt1 == 2) {
          sub(result_tmp, cnt2, 2);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(CH1_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    // Pattern length 3: match the first two characters with one load, then
    // check the third separately.
    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
        (this->*load_2chr)(first, str1);
        (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
        if (icnt1 == 3) {
          sub(result_tmp, cnt2, 3);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(FIRST_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmpw(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        b(MATCH);
    }

    // Pattern length 1: search 8 source bytes at a time.  The pattern
    // character is replicated across a 64-bit register and XORed with the
    // source word; the sub/orr/bics sequence then flags any element that
    // became zero, i.e. any matching character.
    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
        (this->*str1_load_1chr)(ch1, str1);
        cmp(cnt2, (u1)8);
        br(LT, DO1_SHORT);

        sub(result_tmp, cnt2, 8/str2_chr_size);
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
        mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

        if (str2_isL) {
          orr(ch1, ch1, ch1, LSL, 8);
        }
        orr(ch1, ch1, ch1, LSL, 16);
        orr(ch1, ch1, ch1, LSL, 32);
      BIND(CH1_LOOP);
        ldr(ch2, Address(str2, cnt2_neg));
        eor(ch2, ch1, ch2);
        sub(tmp1, ch2, tmp3);
        orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
        bics(tmp1, tmp1, tmp2);
        br(NE, HAS_ZERO);
        adds(cnt2_neg, cnt2_neg, 8);
        br(LT, CH1_LOOP);

        // Final pass: re-run the loop body once more at offset 0 unless the
        // offset had already advanced 8 or more past it.
        cmp(cnt2_neg, (u1)8);
        mov(cnt2_neg, 0);
        br(LT, CH1_LOOP);
        b(NOMATCH);

      BIND(HAS_ZERO);
        // Locate the first flagged element: byte-reverse, count leading
        // zeros, and convert the bit position to a byte offset.
        rev(tmp1, tmp1);
        clz(tmp1, tmp1);
        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
        b(MATCH);

      // Source shorter than 8: plain one-character-at-a-time loop.
      BIND(DO1_SHORT);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmpw(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // result_tmp is the character index the negative offset is relative to;
    // the (byte) offset is converted back to characters by the shift.
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                         Register ch, Register result,
                                         Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register
cnt1_neg = cnt1; 4801 Register ch1 = rscratch1; 4802 Register result_tmp = rscratch2; 4803 4804 cmp(cnt1, (u1)4); 4805 br(LT, DO1_SHORT); 4806 4807 orr(ch, ch, ch, LSL, 16); 4808 orr(ch, ch, ch, LSL, 32); 4809 4810 sub(cnt1, cnt1, 4); 4811 mov(result_tmp, cnt1); 4812 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 4813 sub(cnt1_neg, zr, cnt1, LSL, 1); 4814 4815 mov(tmp3, 0x0001000100010001); 4816 4817 BIND(CH1_LOOP); 4818 ldr(ch1, Address(str1, cnt1_neg)); 4819 eor(ch1, ch, ch1); 4820 sub(tmp1, ch1, tmp3); 4821 orr(tmp2, ch1, 0x7fff7fff7fff7fff); 4822 bics(tmp1, tmp1, tmp2); 4823 br(NE, HAS_ZERO); 4824 adds(cnt1_neg, cnt1_neg, 8); 4825 br(LT, CH1_LOOP); 4826 4827 cmp(cnt1_neg, (u1)8); 4828 mov(cnt1_neg, 0); 4829 br(LT, CH1_LOOP); 4830 b(NOMATCH); 4831 4832 BIND(HAS_ZERO); 4833 rev(tmp1, tmp1); 4834 clz(tmp1, tmp1); 4835 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 4836 b(MATCH); 4837 4838 BIND(DO1_SHORT); 4839 mov(result_tmp, cnt1); 4840 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 4841 sub(cnt1_neg, zr, cnt1, LSL, 1); 4842 BIND(DO1_LOOP); 4843 ldrh(ch1, Address(str1, cnt1_neg)); 4844 cmpw(ch, ch1); 4845 br(EQ, MATCH); 4846 adds(cnt1_neg, cnt1_neg, 2); 4847 br(LT, DO1_LOOP); 4848 BIND(NOMATCH); 4849 mov(result, -1); 4850 b(DONE); 4851 BIND(MATCH); 4852 add(result, result_tmp, cnt1_neg, ASR, 1); 4853 BIND(DONE); 4854 } 4855 4856 // Compare strings. 
4857 void MacroAssembler::string_compare(Register str1, Register str2, 4858 Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2, 4859 FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) { 4860 Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB, 4861 DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, 4862 SHORT_LOOP_START, TAIL_CHECK; 4863 4864 const u1 STUB_THRESHOLD = 64 + 8; 4865 bool isLL = ae == StrIntrinsicNode::LL; 4866 bool isLU = ae == StrIntrinsicNode::LU; 4867 bool isUL = ae == StrIntrinsicNode::UL; 4868 4869 bool str1_isL = isLL || isLU; 4870 bool str2_isL = isLL || isUL; 4871 4872 int str1_chr_shift = str1_isL ? 0 : 1; 4873 int str2_chr_shift = str2_isL ? 0 : 1; 4874 int str1_chr_size = str1_isL ? 1 : 2; 4875 int str2_chr_size = str2_isL ? 1 : 2; 4876 int minCharsInWord = isLL ? wordSize : wordSize/2; 4877 4878 FloatRegister vtmpZ = vtmp1, vtmp = vtmp2; 4879 chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb : 4880 (chr_insn)&MacroAssembler::ldrh; 4881 chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb : 4882 (chr_insn)&MacroAssembler::ldrh; 4883 uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw : 4884 (uxt_insn)&MacroAssembler::uxthw; 4885 4886 BLOCK_COMMENT("string_compare {"); 4887 4888 // Bizzarely, the counts are passed in bytes, regardless of whether they 4889 // are L or U strings, however the result is always in characters. 4890 if (!str1_isL) asrw(cnt1, cnt1, 1); 4891 if (!str2_isL) asrw(cnt2, cnt2, 1); 4892 4893 // Compute the minimum of the string lengths and save the difference. 
4894 subsw(result, cnt1, cnt2); 4895 cselw(cnt2, cnt1, cnt2, Assembler::LE); // min 4896 4897 // A very short string 4898 cmpw(cnt2, minCharsInWord); 4899 br(Assembler::LT, SHORT_STRING); 4900 4901 // Compare longwords 4902 // load first parts of strings and finish initialization while loading 4903 { 4904 if (str1_isL == str2_isL) { // LL or UU 4905 ldr(tmp1, Address(str1)); 4906 cmp(str1, str2); 4907 br(Assembler::EQ, DONE); 4908 ldr(tmp2, Address(str2)); 4909 cmp(cnt2, STUB_THRESHOLD); 4910 br(GE, STUB); 4911 subsw(cnt2, cnt2, minCharsInWord); 4912 br(EQ, TAIL_CHECK); 4913 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 4914 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 4915 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 4916 } else if (isLU) { 4917 ldrs(vtmp, Address(str1)); 4918 cmp(str1, str2); 4919 br(Assembler::EQ, DONE); 4920 ldr(tmp2, Address(str2)); 4921 cmp(cnt2, STUB_THRESHOLD); 4922 br(GE, STUB); 4923 subsw(cnt2, cnt2, 4); 4924 br(EQ, TAIL_CHECK); 4925 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 4926 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 4927 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 4928 zip1(vtmp, T8B, vtmp, vtmpZ); 4929 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 4930 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 4931 add(cnt1, cnt1, 4); 4932 fmovd(tmp1, vtmp); 4933 } else { // UL case 4934 ldr(tmp1, Address(str1)); 4935 cmp(str1, str2); 4936 br(Assembler::EQ, DONE); 4937 ldrs(vtmp, Address(str2)); 4938 cmp(cnt2, STUB_THRESHOLD); 4939 br(GE, STUB); 4940 subsw(cnt2, cnt2, 4); 4941 br(EQ, TAIL_CHECK); 4942 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 4943 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 4944 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 4945 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 4946 zip1(vtmp, T8B, vtmp, vtmpZ); 4947 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 4948 add(cnt1, cnt1, 8); 4949 fmovd(tmp2, vtmp); 4950 } 4951 adds(cnt2, cnt2, isUL ? 
4 : 8); 4952 br(GE, TAIL); 4953 eor(rscratch2, tmp1, tmp2); 4954 cbnz(rscratch2, DIFFERENCE); 4955 // main loop 4956 bind(NEXT_WORD); 4957 if (str1_isL == str2_isL) { 4958 ldr(tmp1, Address(str1, cnt2)); 4959 ldr(tmp2, Address(str2, cnt2)); 4960 adds(cnt2, cnt2, 8); 4961 } else if (isLU) { 4962 ldrs(vtmp, Address(str1, cnt1)); 4963 ldr(tmp2, Address(str2, cnt2)); 4964 add(cnt1, cnt1, 4); 4965 zip1(vtmp, T8B, vtmp, vtmpZ); 4966 fmovd(tmp1, vtmp); 4967 adds(cnt2, cnt2, 8); 4968 } else { // UL 4969 ldrs(vtmp, Address(str2, cnt2)); 4970 ldr(tmp1, Address(str1, cnt1)); 4971 zip1(vtmp, T8B, vtmp, vtmpZ); 4972 add(cnt1, cnt1, 8); 4973 fmovd(tmp2, vtmp); 4974 adds(cnt2, cnt2, 4); 4975 } 4976 br(GE, TAIL); 4977 4978 eor(rscratch2, tmp1, tmp2); 4979 cbz(rscratch2, NEXT_WORD); 4980 b(DIFFERENCE); 4981 bind(TAIL); 4982 eor(rscratch2, tmp1, tmp2); 4983 cbnz(rscratch2, DIFFERENCE); 4984 // Last longword. In the case where length == 4 we compare the 4985 // same longword twice, but that's still faster than another 4986 // conditional branch. 4987 if (str1_isL == str2_isL) { 4988 ldr(tmp1, Address(str1)); 4989 ldr(tmp2, Address(str2)); 4990 } else if (isLU) { 4991 ldrs(vtmp, Address(str1)); 4992 ldr(tmp2, Address(str2)); 4993 zip1(vtmp, T8B, vtmp, vtmpZ); 4994 fmovd(tmp1, vtmp); 4995 } else { // UL 4996 ldrs(vtmp, Address(str2)); 4997 ldr(tmp1, Address(str1)); 4998 zip1(vtmp, T8B, vtmp, vtmpZ); 4999 fmovd(tmp2, vtmp); 5000 } 5001 bind(TAIL_CHECK); 5002 eor(rscratch2, tmp1, tmp2); 5003 cbz(rscratch2, DONE); 5004 5005 // Find the first different characters in the longwords and 5006 // compute their difference. 5007 bind(DIFFERENCE); 5008 rev(rscratch2, rscratch2); 5009 clz(rscratch2, rscratch2); 5010 andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 5011 lsrv(tmp1, tmp1, rscratch2); 5012 (this->*ext_chr)(tmp1, tmp1); 5013 lsrv(tmp2, tmp2, rscratch2); 5014 (this->*ext_chr)(tmp2, tmp2); 5015 subw(result, tmp1, tmp2); 5016 b(DONE); 5017 } 5018 5019 bind(STUB); 5020 RuntimeAddress stub = NULL; 5021 switch(ae) { 5022 case StrIntrinsicNode::LL: 5023 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL()); 5024 break; 5025 case StrIntrinsicNode::UU: 5026 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU()); 5027 break; 5028 case StrIntrinsicNode::LU: 5029 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU()); 5030 break; 5031 case StrIntrinsicNode::UL: 5032 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL()); 5033 break; 5034 default: 5035 ShouldNotReachHere(); 5036 } 5037 assert(stub.target() != NULL, "compare_long_string stub has not been generated"); 5038 trampoline_call(stub); 5039 b(DONE); 5040 5041 bind(SHORT_STRING); 5042 // Is the minimum length zero? 5043 cbz(cnt2, DONE); 5044 // arrange code to do most branches while loading and loading next characters 5045 // while comparing previous 5046 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 5047 subs(cnt2, cnt2, 1); 5048 br(EQ, SHORT_LAST_INIT); 5049 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 5050 b(SHORT_LOOP_START); 5051 bind(SHORT_LOOP); 5052 subs(cnt2, cnt2, 1); 5053 br(EQ, SHORT_LAST); 5054 bind(SHORT_LOOP_START); 5055 (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size))); 5056 (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size))); 5057 cmp(tmp1, cnt1); 5058 br(NE, SHORT_LOOP_TAIL); 5059 subs(cnt2, cnt2, 1); 5060 br(EQ, SHORT_LAST2); 5061 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 5062 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 5063 cmp(tmp2, rscratch1); 5064 br(EQ, SHORT_LOOP); 5065 sub(result, tmp2, rscratch1); 5066 b(DONE); 5067 bind(SHORT_LOOP_TAIL); 5068 
sub(result, tmp1, cnt1); 5069 b(DONE); 5070 bind(SHORT_LAST2); 5071 cmp(tmp2, rscratch1); 5072 br(EQ, DONE); 5073 sub(result, tmp2, rscratch1); 5074 5075 b(DONE); 5076 bind(SHORT_LAST_INIT); 5077 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 5078 bind(SHORT_LAST); 5079 cmp(tmp1, cnt1); 5080 br(EQ, DONE); 5081 sub(result, tmp1, cnt1); 5082 5083 bind(DONE); 5084 5085 BLOCK_COMMENT("} string_compare"); 5086 } 5087 #endif // COMPILER2 5088 5089 // This method checks if provided byte array contains byte with highest bit set. 5090 void MacroAssembler::has_negatives(Register ary1, Register len, Register result) { 5091 // Simple and most common case of aligned small array which is not at the 5092 // end of memory page is placed here. All other cases are in stub. 5093 Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE; 5094 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 5095 assert_different_registers(ary1, len, result); 5096 5097 cmpw(len, 0); 5098 br(LE, SET_RESULT); 5099 cmpw(len, 4 * wordSize); 5100 br(GE, STUB_LONG); // size > 32 then go to stub 5101 5102 int shift = 64 - exact_log2(os::vm_page_size()); 5103 lsl(rscratch1, ary1, shift); 5104 mov(rscratch2, (size_t)(4 * wordSize) << shift); 5105 adds(rscratch2, rscratch1, rscratch2); // At end of page? 
5106 br(CS, STUB); // at the end of page then go to stub 5107 subs(len, len, wordSize); 5108 br(LT, END); 5109 5110 BIND(LOOP); 5111 ldr(rscratch1, Address(post(ary1, wordSize))); 5112 tst(rscratch1, UPPER_BIT_MASK); 5113 br(NE, SET_RESULT); 5114 subs(len, len, wordSize); 5115 br(GE, LOOP); 5116 cmpw(len, -wordSize); 5117 br(EQ, SET_RESULT); 5118 5119 BIND(END); 5120 ldr(result, Address(ary1)); 5121 sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes 5122 lslv(result, result, len); 5123 tst(result, UPPER_BIT_MASK); 5124 b(SET_RESULT); 5125 5126 BIND(STUB); 5127 RuntimeAddress has_neg = RuntimeAddress(StubRoutines::aarch64::has_negatives()); 5128 assert(has_neg.target() != NULL, "has_negatives stub has not been generated"); 5129 trampoline_call(has_neg); 5130 b(DONE); 5131 5132 BIND(STUB_LONG); 5133 RuntimeAddress has_neg_long = RuntimeAddress( 5134 StubRoutines::aarch64::has_negatives_long()); 5135 assert(has_neg_long.target() != NULL, "has_negatives stub has not been generated"); 5136 trampoline_call(has_neg_long); 5137 b(DONE); 5138 5139 BIND(SET_RESULT); 5140 cset(result, NE); // set true or false 5141 5142 BIND(DONE); 5143 } 5144 5145 void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3, 5146 Register tmp4, Register tmp5, Register result, 5147 Register cnt1, int elem_size) { 5148 Label DONE, SAME; 5149 Register tmp1 = rscratch1; 5150 Register tmp2 = rscratch2; 5151 Register cnt2 = tmp2; // cnt2 only used in array length compare 5152 int elem_per_word = wordSize/elem_size; 5153 int log_elem_size = exact_log2(elem_size); 5154 int length_offset = arrayOopDesc::length_offset_in_bytes(); 5155 int base_offset 5156 = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE); 5157 int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 
0 : 16); 5158 5159 assert(elem_size == 1 || elem_size == 2, "must be char or byte"); 5160 assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2); 5161 5162 #ifndef PRODUCT 5163 { 5164 const char kind = (elem_size == 2) ? 'U' : 'L'; 5165 char comment[64]; 5166 snprintf(comment, sizeof comment, "array_equals%c{", kind); 5167 BLOCK_COMMENT(comment); 5168 } 5169 #endif 5170 5171 // if (a1 == a2) 5172 // return true; 5173 cmpoop(a1, a2); // May have read barriers for a1 and a2. 5174 br(EQ, SAME); 5175 5176 if (UseSimpleArrayEquals) { 5177 Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL; 5178 // if (a1 == null || a2 == null) 5179 // return false; 5180 // a1 & a2 == 0 means (some-pointer is null) or 5181 // (very-rare-or-even-probably-impossible-pointer-values) 5182 // so, we can save one branch in most cases 5183 tst(a1, a2); 5184 mov(result, false); 5185 br(EQ, A_MIGHT_BE_NULL); 5186 // if (a1.length != a2.length) 5187 // return false; 5188 bind(A_IS_NOT_NULL); 5189 ldrw(cnt1, Address(a1, length_offset)); 5190 ldrw(cnt2, Address(a2, length_offset)); 5191 eorw(tmp5, cnt1, cnt2); 5192 cbnzw(tmp5, DONE); 5193 lea(a1, Address(a1, base_offset)); 5194 lea(a2, Address(a2, base_offset)); 5195 // Check for short strings, i.e. smaller than wordSize. 5196 subs(cnt1, cnt1, elem_per_word); 5197 br(Assembler::LT, SHORT); 5198 // Main 8 byte comparison loop. 5199 bind(NEXT_WORD); { 5200 ldr(tmp1, Address(post(a1, wordSize))); 5201 ldr(tmp2, Address(post(a2, wordSize))); 5202 subs(cnt1, cnt1, elem_per_word); 5203 eor(tmp5, tmp1, tmp2); 5204 cbnz(tmp5, DONE); 5205 } br(GT, NEXT_WORD); 5206 // Last longword. In the case where length == 4 we compare the 5207 // same longword twice, but that's still faster than another 5208 // conditional branch. 5209 // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when 5210 // length == 4. 
5211 if (log_elem_size > 0) 5212 lsl(cnt1, cnt1, log_elem_size); 5213 ldr(tmp3, Address(a1, cnt1)); 5214 ldr(tmp4, Address(a2, cnt1)); 5215 eor(tmp5, tmp3, tmp4); 5216 cbnz(tmp5, DONE); 5217 b(SAME); 5218 bind(A_MIGHT_BE_NULL); 5219 // in case both a1 and a2 are not-null, proceed with loads 5220 cbz(a1, DONE); 5221 cbz(a2, DONE); 5222 b(A_IS_NOT_NULL); 5223 bind(SHORT); 5224 5225 tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left. 5226 { 5227 ldrw(tmp1, Address(post(a1, 4))); 5228 ldrw(tmp2, Address(post(a2, 4))); 5229 eorw(tmp5, tmp1, tmp2); 5230 cbnzw(tmp5, DONE); 5231 } 5232 bind(TAIL03); 5233 tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left. 5234 { 5235 ldrh(tmp3, Address(post(a1, 2))); 5236 ldrh(tmp4, Address(post(a2, 2))); 5237 eorw(tmp5, tmp3, tmp4); 5238 cbnzw(tmp5, DONE); 5239 } 5240 bind(TAIL01); 5241 if (elem_size == 1) { // Only needed when comparing byte arrays. 5242 tbz(cnt1, 0, SAME); // 0-1 bytes left. 5243 { 5244 ldrb(tmp1, a1); 5245 ldrb(tmp2, a2); 5246 eorw(tmp5, tmp1, tmp2); 5247 cbnzw(tmp5, DONE); 5248 } 5249 } 5250 } else { 5251 Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT, 5252 CSET_EQ, LAST_CHECK; 5253 mov(result, false); 5254 cbz(a1, DONE); 5255 ldrw(cnt1, Address(a1, length_offset)); 5256 cbz(a2, DONE); 5257 ldrw(cnt2, Address(a2, length_offset)); 5258 // on most CPUs a2 is still "locked"(surprisingly) in ldrw and it's 5259 // faster to perform another branch before comparing a1 and a2 5260 cmp(cnt1, (u1)elem_per_word); 5261 br(LE, SHORT); // short or same 5262 ldr(tmp3, Address(pre(a1, base_offset))); 5263 subs(zr, cnt1, stubBytesThreshold); 5264 br(GE, STUB); 5265 ldr(tmp4, Address(pre(a2, base_offset))); 5266 sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size); 5267 cmp(cnt2, cnt1); 5268 br(NE, DONE); 5269 5270 // Main 16 byte comparison loop with 2 exits 5271 bind(NEXT_DWORD); { 5272 ldr(tmp1, Address(pre(a1, wordSize))); 5273 ldr(tmp2, Address(pre(a2, wordSize))); 5274 subs(cnt1, cnt1, 2 * elem_per_word); 5275 br(LE, 
TAIL); 5276 eor(tmp4, tmp3, tmp4); 5277 cbnz(tmp4, DONE); 5278 ldr(tmp3, Address(pre(a1, wordSize))); 5279 ldr(tmp4, Address(pre(a2, wordSize))); 5280 cmp(cnt1, (u1)elem_per_word); 5281 br(LE, TAIL2); 5282 cmp(tmp1, tmp2); 5283 } br(EQ, NEXT_DWORD); 5284 b(DONE); 5285 5286 bind(TAIL); 5287 eor(tmp4, tmp3, tmp4); 5288 eor(tmp2, tmp1, tmp2); 5289 lslv(tmp2, tmp2, tmp5); 5290 orr(tmp5, tmp4, tmp2); 5291 cmp(tmp5, zr); 5292 b(CSET_EQ); 5293 5294 bind(TAIL2); 5295 eor(tmp2, tmp1, tmp2); 5296 cbnz(tmp2, DONE); 5297 b(LAST_CHECK); 5298 5299 bind(STUB); 5300 ldr(tmp4, Address(pre(a2, base_offset))); 5301 cmp(cnt2, cnt1); 5302 br(NE, DONE); 5303 if (elem_size == 2) { // convert to byte counter 5304 lsl(cnt1, cnt1, 1); 5305 } 5306 eor(tmp5, tmp3, tmp4); 5307 cbnz(tmp5, DONE); 5308 RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals()); 5309 assert(stub.target() != NULL, "array_equals_long stub has not been generated"); 5310 trampoline_call(stub); 5311 b(DONE); 5312 5313 bind(EARLY_OUT); 5314 // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2) 5315 // so, if a2 == null => return false(0), else return true, so we can return a2 5316 mov(result, a2); 5317 b(DONE); 5318 bind(SHORT); 5319 cmp(cnt2, cnt1); 5320 br(NE, DONE); 5321 cbz(cnt1, SAME); 5322 sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size); 5323 ldr(tmp3, Address(a1, base_offset)); 5324 ldr(tmp4, Address(a2, base_offset)); 5325 bind(LAST_CHECK); 5326 eor(tmp4, tmp3, tmp4); 5327 lslv(tmp5, tmp4, tmp5); 5328 cmp(tmp5, zr); 5329 bind(CSET_EQ); 5330 cset(result, EQ); 5331 b(DONE); 5332 } 5333 5334 bind(SAME); 5335 mov(result, true); 5336 // That's it. 5337 bind(DONE); 5338 5339 BLOCK_COMMENT("} array_equals"); 5340 } 5341 5342 // Compare Strings 5343 5344 // For Strings we're passed the address of the first characters in a1 5345 // and a2 and the length in cnt1. 5346 // elem_size is the element size in bytes: either 1 or 2. 5347 // There are two implementations. 
For arrays >= 8 bytes, all 5348 // comparisons (including the final one, which may overlap) are 5349 // performed 8 bytes at a time. For strings < 8 bytes, we compare a 5350 // halfword, then a short, and then a byte. 5351 5352 void MacroAssembler::string_equals(Register a1, Register a2, 5353 Register result, Register cnt1, int elem_size) 5354 { 5355 Label SAME, DONE, SHORT, NEXT_WORD; 5356 Register tmp1 = rscratch1; 5357 Register tmp2 = rscratch2; 5358 Register cnt2 = tmp2; // cnt2 only used in array length compare 5359 5360 assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte"); 5361 assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2); 5362 5363 #ifndef PRODUCT 5364 { 5365 const char kind = (elem_size == 2) ? 'U' : 'L'; 5366 char comment[64]; 5367 snprintf(comment, sizeof comment, "{string_equals%c", kind); 5368 BLOCK_COMMENT(comment); 5369 } 5370 #endif 5371 5372 mov(result, false); 5373 5374 // Check for short strings, i.e. smaller than wordSize. 5375 subs(cnt1, cnt1, wordSize); 5376 br(Assembler::LT, SHORT); 5377 // Main 8 byte comparison loop. 5378 bind(NEXT_WORD); { 5379 ldr(tmp1, Address(post(a1, wordSize))); 5380 ldr(tmp2, Address(post(a2, wordSize))); 5381 subs(cnt1, cnt1, wordSize); 5382 eor(tmp1, tmp1, tmp2); 5383 cbnz(tmp1, DONE); 5384 } br(GT, NEXT_WORD); 5385 // Last longword. In the case where length == 4 we compare the 5386 // same longword twice, but that's still faster than another 5387 // conditional branch. 5388 // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when 5389 // length == 4. 5390 ldr(tmp1, Address(a1, cnt1)); 5391 ldr(tmp2, Address(a2, cnt1)); 5392 eor(tmp2, tmp1, tmp2); 5393 cbnz(tmp2, DONE); 5394 b(SAME); 5395 5396 bind(SHORT); 5397 Label TAIL03, TAIL01; 5398 5399 tbz(cnt1, 2, TAIL03); // 0-7 bytes left. 
5400 { 5401 ldrw(tmp1, Address(post(a1, 4))); 5402 ldrw(tmp2, Address(post(a2, 4))); 5403 eorw(tmp1, tmp1, tmp2); 5404 cbnzw(tmp1, DONE); 5405 } 5406 bind(TAIL03); 5407 tbz(cnt1, 1, TAIL01); // 0-3 bytes left. 5408 { 5409 ldrh(tmp1, Address(post(a1, 2))); 5410 ldrh(tmp2, Address(post(a2, 2))); 5411 eorw(tmp1, tmp1, tmp2); 5412 cbnzw(tmp1, DONE); 5413 } 5414 bind(TAIL01); 5415 if (elem_size == 1) { // Only needed when comparing 1-byte elements 5416 tbz(cnt1, 0, SAME); // 0-1 bytes left. 5417 { 5418 ldrb(tmp1, a1); 5419 ldrb(tmp2, a2); 5420 eorw(tmp1, tmp1, tmp2); 5421 cbnzw(tmp1, DONE); 5422 } 5423 } 5424 // Arrays are equal. 5425 bind(SAME); 5426 mov(result, true); 5427 5428 // That's it. 5429 bind(DONE); 5430 BLOCK_COMMENT("} string_equals"); 5431 } 5432 5433 5434 // The size of the blocks erased by the zero_blocks stub. We must 5435 // handle anything smaller than this ourselves in zero_words(). 5436 const int MacroAssembler::zero_words_block_size = 8; 5437 5438 // zero_words() is used by C2 ClearArray patterns. It is as small as 5439 // possible, handling small word counts locally and delegating 5440 // anything larger to the zero_blocks stub. It is expanded many times 5441 // in compiled code, so it is important to keep it short. 5442 5443 // ptr: Address of a buffer to be zeroed. 5444 // cnt: Count in HeapWords. 5445 // 5446 // ptr, cnt, rscratch1, and rscratch2 are clobbered. 
// Register-count variant.  Counts of at least zero_words_block_size go
// through the zero_blocks stub first; whatever is left is zeroed inline.
// The assert below pins the registers: ptr must be r10 and cnt must be r11.
void MacroAssembler::zero_words(Register ptr, Register cnt)
{
  assert(is_power_of_2(zero_words_block_size), "adjust this");
  assert(ptr == r10 && cnt == r11, "mismatch in register usage");

  BLOCK_COMMENT("zero_words {");
  cmp(cnt, (u1)zero_words_block_size);
  Label around;
  br(LO, around);
  {
    RuntimeAddress zero_blocks =  RuntimeAddress(StubRoutines::aarch64::zero_blocks());
    assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
    // Until the stub routines are complete a direct bl is emitted instead of
    // a trampoline call.
    if (StubRoutines::aarch64::complete()) {
      trampoline_call(zero_blocks);
    } else {
      bl(zero_blocks);
    }
  }
  bind(around);
  // Handle the remaining (< zero_words_block_size) words by testing the
  // individual bits of cnt: bit log2(i) set means i more words, written as
  // i/2 store-pairs.
  // NOTE(review): this presumably relies on zero_blocks leaving ptr advanced
  // and the low bits of cnt intact -- confirm against the stub.
  for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
    Label l;
    tbz(cnt, exact_log2(i), l);
    for (int j = 0; j < i; j += 2) {
      stp(zr, zr, post(ptr, 16));
    }
    bind(l);
  }
  {
    // Bit 0: one final odd word.
    Label l;
    tbz(cnt, 0, l);
    str(zr, Address(ptr));
    bind(l);
  }
  BLOCK_COMMENT("} zero_words");
}

// Immediate-count variant: the number of words is known at code-generation
// time, so the stores are either fully unrolled or emitted as a small loop.
//
// base:         Address of a buffer to be zeroed, 8 bytes aligned.
// cnt:          Immediate count in HeapWords.
//
// base itself is not modified; rscratch1 and rscratch2 are used only when
// the loop form is emitted.
#define SmallArraySize (18 * BytesPerLong)
void MacroAssembler::zero_words(Register base, u_int64_t cnt)
{
  BLOCK_COMMENT("zero_words {");
  int i = cnt & 1;  // store any odd word to start
  if (i) str(zr, Address(base));

  if (cnt <= SmallArraySize / BytesPerLong) {
    // Small: one store-pair per two words, fully unrolled.
    for (; i < (int)cnt; i += 2)
      stp(zr, zr, Address(base, i * wordSize));
  } else {
    const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
    // Peel off the words that do not fill a whole loop iteration
    // (2 * unroll words) so the loop count is an exact multiple.
    int remainder = cnt % (2 * unroll);
    for (; i < remainder; i += 2)
      stp(zr, zr, Address(base, i * wordSize));

    Label loop;
    Register cnt_reg = rscratch1;
    Register loop_base = rscratch2;
    cnt = cnt - remainder;
    mov(cnt_reg, cnt);
    // adjust base and prebias by -2 * wordSize so we can pre-increment
    add(loop_base, base, (remainder - 2) * wordSize);
    bind(loop);
    sub(cnt_reg, cnt_reg, 2 * unroll);
    for (i = 1; i < unroll; i++)
      stp(zr, zr, Address(loop_base, 2 * i * wordSize));
    // The last store of the iteration also advances loop_base.
    stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
    cbnz(cnt_reg, loop);
  }
  BLOCK_COMMENT("} zero_words");
}

// Zero blocks of memory by using DC ZVA.
//
// Aligns the base address first sufficiently for DC ZVA, then uses
// DC ZVA repeatedly for every full block.  cnt is the size to be
// zeroed in HeapWords.  Returns the count of words left to be zeroed
// in cnt.
//
// NOTE: This is intended to be used in the zero_blocks() stub.  If
// you want to use it elsewhere, note that cnt must be >= 2*zva_length.
//
// base is advanced past everything that was zeroed; rscratch1, rscratch2
// and the flags are clobbered.
void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
  Register tmp = rscratch1;
  Register tmp2 = rscratch2;
  int zva_length = VM_Version::zva_length();
  Label initial_table_end, loop_zva;
  Label fini;

  // Base must be 16 byte aligned. If not just return and let caller handle it
  tst(base, 0x0f);
  br(Assembler::NE, fini);
  // Align base with ZVA length.
  neg(tmp, base);
  andr(tmp, tmp, zva_length - 1);

  // tmp: the number of bytes to be filled to align the base with ZVA length.
  add(base, base, tmp);
  sub(cnt, cnt, tmp, Assembler::ASR, 3);
  // Computed branch into the table of store-pairs below.  Each stp is one
  // 4-byte instruction that zeroes 16 bytes, so tmp bytes of data need
  // tmp / 4 bytes of code, counted back from the end of the table.
  adr(tmp2, initial_table_end);
  sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
  br(tmp2);

  // base already points at the aligned address, hence the negative offsets.
  for (int i = -zva_length + 16; i < 0; i += 16)
    stp(zr, zr, Address(base, i));
  bind(initial_table_end);

  // Pre-bias cnt by one block so the loop can exit on the sign of the
  // running count; the bias is undone after the loop.
  sub(cnt, cnt, zva_length >> 3);
  bind(loop_zva);
  dc(Assembler::ZVA, base);
  subs(cnt, cnt, zva_length >> 3);
  add(base, base, zva_length);
  br(Assembler::GE, loop_zva);
  add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
  bind(fini);
}

// base:  Address of a buffer to be filled, 8 bytes aligned.
// cnt:   Count in 8-byte unit.
// value: Value to be filled with.
// base will point to the end of the buffer after filling.
//
// cnt, rscratch1, rscratch2 and the flags are clobbered.
void MacroAssembler::fill_words(Register base, Register cnt, Register value)
{
//  Algorithm:
//
//    scratch1 = cnt & 7;
//    cnt -= scratch1;
//    p += scratch1;
//    switch (scratch1) {
//      do {
//        cnt -= 8;
//          p[-8] = v;
//        case 7:
//          p[-7] = v;
//        case 6:
//          p[-6] = v;
//          // ...
//        case 1:
//          p[-1] = v;
//        case 0:
//          p += 8;
//      } while (cnt);
//    }
//
//  The code below applies this scheme to pairs of words: every unrolled
//  store is an stp, and the remainder mask is (unroll - 1) * 2 words.

  assert_different_registers(base, cnt, value, rscratch1, rscratch2);

  Label fini, skip, entry, loop;
  const int unroll = 8; // Number of stp instructions we'll unroll

  cbz(cnt, fini);
  // If base is only 8-byte aligned, store one word first so that the
  // store-pairs below operate on a 16-byte aligned address.
  tbz(base, 3, skip);
  str(value, Address(post(base, 8)));
  sub(cnt, cnt, 1);
  bind(skip);

  // rscratch1 = number of words (an even number below 2 * unroll) that do
  // not fill a whole unrolled iteration.
  andr(rscratch1, cnt, (unroll-1) * 2);
  sub(cnt, cnt, rscratch1);
  add(base, base, rscratch1, Assembler::LSL, 3);
  // Jump into the middle of the unrolled stores.  Each stp is 4 bytes of
  // code and covers 2 words, so rscratch1 words need rscratch1 * 2 bytes of
  // code counted back from 'entry'.
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
  br(rscratch2);

  bind(loop);
  add(base, base, unroll * 16);
  for (int i = -unroll; i < 0; i++)
    stp(value, value, Address(base, i * 16));
  bind(entry);
  subs(cnt, cnt, unroll * 2);
  br(Assembler::GE, loop);

  // Bit 0 of cnt is unaffected by the even-sized subtractions above and
  // tells whether one odd word is still outstanding.
  tbz(cnt, 0, fini);
  str(value, Address(post(base, 8)));
  bind(fini);
}

// Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
// java/lang/StringUTF16.compress.
// Narrows 16-bit chars at src to bytes at dst, stopping at the first char
// whose high byte is non-zero.
//
// src:     address of the chars; advanced while encoding.
// dst:     address of the output bytes; advanced while encoding.
// len:     number of chars; counts down, and is non-zero on exit if the
//          routine stopped early.
// result:  number of chars processed before stopping (the initial len when
//          every char fitted in a byte).
//
// Vtmp1..Vtmp4, rscratch1, rscratch2 and the flags are clobbered.
// NOTE(review): v4 and v5 are used as additional hard-coded temporaries even
// though they are not parameters -- callers presumably reserve them; confirm.
void MacroAssembler::encode_iso_array(Register src, Register dst,
                      Register len, Register result,
                      FloatRegister Vtmp1, FloatRegister Vtmp2,
                      FloatRegister Vtmp3, FloatRegister Vtmp4)
{
    Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
        NEXT_32_START, NEXT_32_PRFM_START;
    Register tmp1 = rscratch1, tmp2 = rscratch2;

      mov(result, len); // Save initial len

#ifndef BUILTIN_SIM
      cmp(len, (u1)8); // handle shortest strings first
      br(LT, LOOP_1);
      cmp(len, (u1)32);
      br(LT, NEXT_8);
      // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
      // to convert chars to bytes
      if (SoftwarePrefetchHintDistance >= 0) {
        // Two copies of the 32-char loop: this first one is used while
        // enough input remains beyond the prefetch distance; the second
        // copy (NEXT_32) finishes the rest.
        ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
        br(LE, NEXT_32_START);
        b(NEXT_32_PRFM_START);
        BIND(NEXT_32_PRFM);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_PRFM_START);
          prfm(Address(src, SoftwarePrefetchHintDistance));
          // OR the inputs together so that one uzp2 below collects the high
          // bytes of all 32 chars; uzp1 extracts the low bytes to store.
          orr(v4, T16B, Vtmp1, Vtmp2);
          orr(v5, T16B, Vtmp3, Vtmp4);
          uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
          uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
          stpq(Vtmp1, Vtmp3, dst);
          uzp2(v5, T16B, v4, v5); // high bytes
          umov(tmp2, v5, D, 1);
          fmovd(tmp1, v5);
          orr(tmp1, tmp1, tmp2);
          // A non-zero high byte somewhere in these 32 chars: redo them with
          // the finer-grained loops (src, dst and len are not yet advanced).
          cbnz(tmp1, LOOP_8);
          sub(len, len, 32);
          add(dst, dst, 32);
          add(src, src, 64);
          subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
          br(GE, NEXT_32_PRFM);
          cmp(len, (u1)32);
          br(LT, LOOP_8);
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_START);
      } else {
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
      }
      // NOTE(review): this prfm is emitted even when
      // SoftwarePrefetchHintDistance is negative -- confirm that is intended.
      prfm(Address(src, SoftwarePrefetchHintDistance));
      uzp1(v4, T16B, Vtmp1, Vtmp2);
      uzp1(v5, T16B, Vtmp3, Vtmp4);
      stpq(v4, v5, dst);
      orr(Vtmp1, T16B, Vtmp1, Vtmp2);
      orr(Vtmp3, T16B, Vtmp3, Vtmp4);
      uzp2(Vtmp1, T16B, Vtmp1, Vtmp3);  // high bytes
      umov(tmp2, Vtmp1, D, 1);
      fmovd(tmp1, Vtmp1);
      orr(tmp1, tmp1, tmp2);
      cbnz(tmp1, LOOP_8);
      sub(len, len, 32);
      add(dst, dst, 32);
      add(src, src, 64);
      cmp(len, (u1)32);
      br(GE, NEXT_32);
      cbz(len, DONE);

    BIND(LOOP_8);
      cmp(len, (u1)8);
      br(LT, LOOP_1);
    BIND(NEXT_8);
      ld1(Vtmp1, T8H, src);
      uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
      uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
      strd(Vtmp2, dst);
      fmovd(tmp1, Vtmp3);
      // Some char in this group of 8 does not fit: find it one at a time.
      cbnz(tmp1, NEXT_1);

      sub(len, len, 8);
      add(dst, dst, 8);
      add(src, src, 16);
      cmp(len, (u1)8);
      br(GE, NEXT_8);

    BIND(LOOP_1);
#endif
    cbz(len, DONE);
    BIND(NEXT_1);
      // The byte is stored before the char is tested, so the low byte of the
      // offending char is written to dst as well.
      ldrh(tmp1, Address(post(src, 2)));
      strb(tmp1, Address(post(dst, 1)));
      tst(tmp1, 0xff00);
      br(NE, SET_RESULT);
      subs(len, len, 1);
      br(GT, NEXT_1);

    BIND(SET_RESULT);
      sub(result, result, len); // Return index where we stopped
                                // Return len == 0 if we processed all
                                // characters
    BIND(DONE);
}


// Inflate byte[] array to char[].
//
// src:  address of the bytes; advanced while inflating.
// dst:  address of the output chars; advanced while inflating.
// len:  number of bytes; clobbered.
// tmp4: scratch; holds the number of 8-byte groups (len / 8).
//
// vtmp1 is zeroed and interleaved with the input bytes (zip1) to widen them;
// vtmp2 and vtmp3 hold data in flight.  The flags are clobbered.  When
// SoftwarePrefetchHintDistance >= 0, inputs of at least large_loop_threshold
// 8-byte groups are handed to the large_byte_array_inflate stub.
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
                                        FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                        Register tmp4) {
  Label big, done, after_init, to_stub;

  assert_different_registers(src, dst, len, tmp4, rscratch1);

  fmovd(vtmp1, zr);
  lsrw(tmp4, len, 3);
  // The stub path branches back here when it returns.
  // NOTE(review): presumably the stub leaves tmp4 and len describing the
  // work that is still outstanding -- confirm against the stub.
  bind(after_init);
  cbnzw(tmp4, big);
  // Short string: less than 8 bytes.
  {
    Label loop, tiny;

    cmpw(len, 4);
    br(LT, tiny);
    // Use SIMD to do 4 bytes.
    ldrs(vtmp2, post(src, 4));
    zip1(vtmp3, T8B, vtmp2, vtmp1);
    subw(len, len, 4);
    strd(vtmp3, post(dst, 8));

    cbzw(len, done);

    // Do the remaining bytes one at a time.
    bind(loop);
    ldrb(tmp4, post(src, 1));
    strh(tmp4, post(dst, 2));
    subw(len, len, 1);

    bind(tiny);
    // NOTE(review): this is the only 64-bit test of len in the routine (the
    // others use the w forms) -- confirm the upper half of len is known to be
    // zero when 'tiny' is reached directly from the cmpw above.
    cbnz(len, loop);

    b(done);
  }

  if (SoftwarePrefetchHintDistance >= 0) {
    bind(to_stub);
      RuntimeAddress stub =  RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
      assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
      trampoline_call(stub);
      b(after_init);
  }

  // Unpack the bytes 8 at a time.
  bind(big);
  {
    Label loop, around, loop_last, loop_start;

    if (SoftwarePrefetchHintDistance >= 0) {
      const int large_loop_threshold = (64 + 16)/8;
      // The first 8 bytes are loaded before the size check, i.e. also when
      // control then goes to the stub.
      ldrd(vtmp2, post(src, 8));
      andw(len, len, 7);
      cmp(tmp4, (u1)large_loop_threshold);
      br(GE, to_stub);
      b(loop_start);

      // Loop unrolled twice; loop_last handles an odd number of groups.
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      bind(loop_start);
      subs(tmp4, tmp4, 1);
      br(EQ, loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      ldrd(vtmp3, post(src, 8));
      st1(vtmp2, T8H, post(dst, 16));
      subs(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp3, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      br(NE, loop);
      b(around);
      bind(loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      st1(vtmp2, T8H, post(dst, 16));
      bind(around);
      cbz(len, done);
    } else {
      andw(len, len, 7);
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      sub(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp2, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      cbnz(tmp4, loop);
    }
  }

  // Do the tail of up to 8 bytes.
  // This re-reads the last 8 input bytes and rewrites the last 8 output
  // chars, overlapping data that was already converted.  It is only reached
  // after at least one full 8-byte group has been handled.
  add(src, src, len);
  ldrd(vtmp3, Address(src, -8));
  add(dst, dst, len, ext::uxtw, 1);
  zip1(vtmp3, T16B, vtmp3, vtmp1);
  strq(vtmp3, Address(dst, -16));

  bind(done);
}

// Compress char[] array to byte[].
5826 void MacroAssembler::char_array_compress(Register src, Register dst, Register len, 5827 FloatRegister tmp1Reg, FloatRegister tmp2Reg, 5828 FloatRegister tmp3Reg, FloatRegister tmp4Reg, 5829 Register result) { 5830 encode_iso_array(src, dst, len, result, 5831 tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg); 5832 cmp(len, zr); 5833 csel(result, result, zr, EQ); 5834 } 5835 5836 // get_thread() can be called anywhere inside generated code so we 5837 // need to save whatever non-callee save context might get clobbered 5838 // by the call to JavaThread::aarch64_get_thread_helper() or, indeed, 5839 // the call setup code. 5840 // 5841 // aarch64_get_thread_helper() clobbers only r0, r1, and flags. 5842 // 5843 void MacroAssembler::get_thread(Register dst) { 5844 RegSet saved_regs = RegSet::range(r0, r1) + lr - dst; 5845 push(saved_regs, sp); 5846 5847 mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper)); 5848 blrt(lr, 1, 0, 1); 5849 if (dst != c_rarg0) { 5850 mov(dst, c_rarg0); 5851 } 5852 5853 pop(saved_regs, sp); 5854 }