1 /* 2 * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 
23 * 24 */ 25 26 #include <sys/types.h> 27 28 #include "precompiled.hpp" 29 #include "jvm.h" 30 #include "asm/assembler.hpp" 31 #include "asm/assembler.inline.hpp" 32 #include "gc/shared/barrierSet.hpp" 33 #include "gc/shared/cardTable.hpp" 34 #include "gc/shared/barrierSetAssembler.hpp" 35 #include "gc/shared/cardTableBarrierSet.hpp" 36 #include "interpreter/interpreter.hpp" 37 #include "compiler/disassembler.hpp" 38 #include "memory/resourceArea.hpp" 39 #include "nativeInst_aarch64.hpp" 40 #include "oops/accessDecorators.hpp" 41 #include "oops/compressedOops.inline.hpp" 42 #include "oops/klass.inline.hpp" 43 #include "runtime/biasedLocking.hpp" 44 #include "runtime/icache.hpp" 45 #include "runtime/interfaceSupport.inline.hpp" 46 #include "runtime/jniHandles.inline.hpp" 47 #include "runtime/sharedRuntime.hpp" 48 #include "runtime/thread.hpp" 49 #ifdef COMPILER1 50 #include "c1/c1_LIRAssembler.hpp" 51 #endif 52 #ifdef COMPILER2 53 #include "oops/oop.hpp" 54 #include "opto/compile.hpp" 55 #include "opto/intrinsicnode.hpp" 56 #include "opto/node.hpp" 57 #endif 58 59 #ifdef PRODUCT 60 #define BLOCK_COMMENT(str) /* nothing */ 61 #define STOP(error) stop(error) 62 #else 63 #define BLOCK_COMMENT(str) block_comment(str) 64 #define STOP(error) block_comment(error); stop(error) 65 #endif 66 67 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 68 69 // Patch any kind of instruction; there may be several instructions. 70 // Return the total length (in bytes) of the instructions. 
// Parameters:
//   branch - address of the (first) instruction to patch.
//   target - address the patched instruction sequence should refer to.
// The kind of instruction is recognised from the encoding bits of the
// 32-bit word at 'branch'; an unrecognised encoding hits ShouldNotReachHere().
// The return value is the number of instructions patched (1, 2 or 3)
// multiplied by NativeInstruction::instruction_size.
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  // Default displacement: byte distance scaled down to instruction words.
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    // Here the displacement is kept in bytes (or in 4K pages once the
    // adrp path below has recomputed it), not in instruction words.
    offset = target-branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      u_int64_t dest = (u_int64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
      //
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        // The immediate field holds the in-page offset scaled by the
        // access size taken from bits 31:30 of the load/store.
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                    21, 10, offset_lo >> size);
        // NOTE(review): the alignment check runs after the second
        // instruction has already been patched.
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                   Instruction_aarch64::extract(insn, 4, 0) ==
                     Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32
        // The movk receives bits 47:32 of the target.  The adrp page delta
        // is then recomputed against an address made of the low 32 bits of
        // the target and bits 47:32 of 'branch'.
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    // The two low bits of the displacement go into bits 30:29 of the
    // instruction, the remaining bits into the signed field at bits 23:5.
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant
    // Three consecutive instructions each receive one 16-bit slice of the
    // target (bits 15:0, 31:16, 47:32) in their bits 20:5.
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}

// Patch the oop constant materialised by the instruction sequence at
// insn_addr so that it encodes 'o'.  The first instruction's encoding
// selects between the narrow (two instruction) and wide (three
// instruction) forms.  Returns the length in bytes of the patched sequence.
int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = CompressedOops::encode((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}

// Patch the two-instruction sequence at insn_addr so that it materialises
// the narrow klass value n: the upper 16 bits go into the first
// instruction, the lower 16 bits into the following movk.  Returns the
// length in bytes of the patched sequence (two instructions).
int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

// Decode the address referred to by the instruction 'insn' located at
// insn_addr.  This is the read-side counterpart of
// pd_patch_instruction_size(): it recognises the same instruction kinds
// and reassembles the address from the same bit fields.  For the
// adrp-based and move-wide forms the following instruction(s) in memory
// are read as well.  An unrecognised encoding hits ShouldNotReachHere().
address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  // NOTE(review): the mask used here (0b011011) ignores bit 29, whereas
  // pd_patch_instruction_size() tests the same field with 0b111011 --
  // confirm whether the difference is intentional.
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    // Reassemble the displacement: low two bits from bits 30:29, the
    // rest (sign-extended) from bits 23:5.
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.  (In case 3, when the movk targets the same
      // register, bits 47:32 of the result are taken from the movk.)
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110  &&
               Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
          target_page = (target_page & 0xffffffff) |
                         ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      // Bit 31 clear: this form is patched by pd_patch_instruction_size()
      // but is not decoded here.
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    u_int32_t *insns = (u_int32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // Same encoding that pd_patch_instruction_size() treats as
    // "nothing to do"; it has no target.
    return 0;
  } else {
    ShouldNotReachHere();
  }
  // Branch forms: the offset is in instruction words relative to insn_addr.
  return address(((uint64_t)insn_addr + (offset << 2)));
}

// Emit a dsb(Assembler::SY).  Neither 'thread' nor 'tmp' is used.
void MacroAssembler::serialize_memory(Register thread, Register tmp) {
  dsb(Assembler::SY);
}

// Emit a safepoint poll that branches to slow_path when a safepoint is
// pending.  With thread-local polling the word at
// Thread::polling_page_offset() is loaded and the poll bit tested;
// otherwise the global SafepointSynchronize state word is loaded and
// compared against zero.  Uses rscratch1.
void MacroAssembler::safepoint_poll(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    unsigned long offset;
    adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
    ldrw(rscratch1, Address(rscratch1, offset));
    // cbnz below relies on "not synchronized" being encoded as zero.
    assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
    cbnz(rscratch1, slow_path);
  }
}

// Just like safepoint_poll, but use an acquiring load for thread-
// local polling.
//
// We need an acquire here to ensure that any subsequent load of the
// global SafepointSynchronize::_state flag is ordered after this load
// of the local Thread::_polling page.  We don't want this poll to
// return false (i.e. not safepointing) and a later poll of the global
// SafepointSynchronize::_state spuriously to return true.
//
// This is to avoid a race when we're in a native->Java transition
// racing the code which wakes up from a safepoint.
//
void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    // ldar takes its address in a register, so form the address first.
    lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
    ldar(rscratch1, rscratch1);
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    safepoint_poll(slow_path);
  }
}

// Clear the last-Java-frame record of the current thread (rthread) by
// storing zero to the saved sp, optionally the saved fp (when clear_fp
// is true), and the saved pc.
void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp, & resp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
// Record the last Java frame in the current thread (rthread).
//   last_java_sp - stack pointer to record; if it is 'sp' it is first
//                  copied into 'scratch', and if it is not a valid
//                  register 'esp' is recorded instead.
//   last_java_fp - frame pointer to record; skipped if not valid.
//   last_java_pc - register holding the pc to record; skipped if not valid.
//   scratch      - temporary, written only when last_java_sp == sp.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
      str(last_java_pc, Address(rthread,
                                JavaThread::frame_anchor_offset()
                                + JavaFrameAnchor::last_Java_pc_offset()));
    }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

// As above, but the pc to record is given as a code address.  The
// address is materialised into 'scratch' with adr and stored, then the
// register variant is called with noreg as the pc to record sp and fp.
// A NULL last_java_pc records the current emission pc() instead.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  if (last_java_pc != NULL) {
    adr(scratch, last_java_pc);
  } else {
    // FIXME: This is almost never correct.  We should delete all
    // cases of set_last_Java_frame with last_java_pc=NULL and use the
    // correct return address instead.
    adr(scratch, pc());
  }

  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

// As above, but the pc to record is given as a label.  A bound label is
// resolved to its target address immediately.  For an unbound label a
// patch location is registered on the label before the address variant
// is called with a NULL pc.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch);
  }
}

// Emit a call to 'entry', which must lie inside the code cache.  When
// far_branches() is true the address is formed in 'tmp' with adrp + add
// and called with blr; otherwise a direct bl is emitted.  If cbuf is
// non-NULL its instruction mark is set immediately before the call
// instruction.
void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb.
    // NOTE(review): the assert above bounds the code cache by 4*G while
    // this comment says 2Gb -- confirm which limit is intended.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    blr(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}

// Emit a jump to 'entry'.  Same structure as far_call(), but emits br / b
// instead of blr / bl.
// NOTE(review): the second assert message below still says "far call".
void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    br(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}

// Emit a check of sp against the thread's reserved stack activation
// value.  If sp is below it (unsigned), nothing further happens.
// Otherwise SharedRuntime::enable_stack_reserved_zone is called with the
// thread as argument, followed by a jump to the
// throw_delayed_StackOverflowError stub.  Uses rscratch1 and c_rarg0.
void MacroAssembler::reserved_stack_check() {
    // testing if reserved zone needs to be enabled
    Label no_reserved_zone_enabling;

    ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
    cmp(sp, rscratch1);
    br(Assembler::LO, no_reserved_zone_enabling);

    enter();   // LR and FP are live.
    lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
    mov(c_rarg0, rthread);
    blr(rscratch1);
    leave();

    // We have already removed our own frame.
    // throw_delayed_StackOverflowError will think that it's been
    // called by our caller.
    lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
    br(rscratch1);
    should_not_reach_here();

    bind(no_reserved_zone_enabling);
}

// Emit the biased-locking fast path for locking obj_reg.
//   lock_reg  - used here only for the register-distinctness asserts and
//               the unused saved_mark_addr.
//   obj_reg   - the object being locked.
//   swap_reg  - holds (or receives) the object's mark word.
//   tmp_reg   - temporary; rscratch1 and rscratch2 are used as well.
//   swap_reg_contains_mark - if false, the mark word is loaded here.
//   done      - branched to when the lock has been acquired via biasing.
//   slow_case - passed to cmpxchg_obj_header as the failure label for the
//               bias-acquire and rebias attempts.
//   counters  - optional statistics counters; defaults to
//               BiasedLocking::counters() when PrintBiasedLockingStatistics
//               is set.
// Falls through (at cas_label) when the caller must continue with the
// normal CAS-based locking scheme.
// Returns the code offset of the mark word load, or -1 if
// swap_reg_contains_mark was true and no load was emitted.
int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  // NOTE(review): this check is subsumed by the wider
  // assert_different_registers a few lines below.
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  // klass_addr and saved_mark_addr are not referenced in this function.
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(tmp_reg, (u1)markOopDesc::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
  // tmp_reg is now zero iff the mark word, ignoring the age bits, equals
  // (prototype header | current thread).
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1, rscratch2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

// Emit the biased-locking unlock check: load the mark word of obj_reg
// into temp_reg and branch to 'done' if its biased-lock bits equal the
// biased lock pattern.  Falls through otherwise.
void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(temp_reg, (u1)markOopDesc::biased_lock_pattern);
  br(Assembler::EQ, done);
}

// pass_arg0..pass_arg3: move 'arg' into the corresponding C argument
// register (c_rarg0..c_rarg3) unless it is already there.
static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg ) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg ) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg ) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg ) {
    masm->mov(c_rarg3, arg);
  }
}

// Common implementation behind the call_VM variants.
//   oop_result   - if valid, receives the VM result via get_vm_result().
//   java_thread  - thread register; noreg selects rthread (and rthread is
//                  the only value the assert below accepts).
//   last_java_sp - sp to record in the frame anchor; noreg selects esp.
//   entry_point  - address of the function to call.
//   number_of_arguments - forwarded to call_VM_leaf_base.
//   check_exceptions    - if true, emit a pending-exception check that
//                  jumps to StubRoutines::forward_exception_entry().
// The thread is passed in c_rarg0.  The last Java frame is set before the
// call and reset (including fp) afterwards.  Uses rscratch1.
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
   // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  // 'l' is handed to call_VM_leaf_base below and is the label whose
  // address is recorded as the last Java pc.
  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

   // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

// call_VM_base with the default thread and last_java_sp (both noreg).
void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline.  If the code cache is small
// trampolines won't be emitted.
//
// Returns NULL if a trampoline stub was needed but could not be emitted;
// otherwise returns pc() after the call instruction.
address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  // We need a trampoline if branches are far.
  if (far_branches()) {
    bool in_scratch_emit_size = false;
#ifdef COMPILER2
    // We don't want to emit a trampoline if C2 is generating dummy
    // code during its branch shortening phase.
    CompileTask* task = ciEnv::current()->task();
    in_scratch_emit_size =
      (task != NULL && is_c2_compile(task->comp_level()) &&
       Compile::current()->in_scratch_emit_size());
#endif
    if (!in_scratch_emit_size) {
      address stub = emit_trampoline_stub(offset(), entry.target());
      if (stub == NULL) {
        return NULL; // CodeCache is full
      }
    }
  }

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
  if (!far_branches()) {
    bl(entry.target());
  } else {
    // Emits a bl whose destination is its own address.
    // NOTE(review): presumably redirected to the trampoline stub later via
    // the relocation -- confirm against the relocation code.
    bl(pc());
  }
  // just need to return a non-null address
  return pc();
}


// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)
//
// insts_call_instruction_offset is the offset of the call instruction
// within the instructions section; dest is the call target stored in the
// stub.  Returns the start address of the stub, or NULL if
// start_a_stub() failed.  Uses rscratch1 in the emitted stub.

address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  // Max stub size: alignment nop, TrampolineStub.
  address stub = start_a_stub(NativeInstruction::instruction_size
                   + NativeCallTrampolineStub::instruction_size);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call
  // - call
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  // The 64-bit destination is stored inline, right after the branch.
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}

// Normalise a C-style boolean held in x to 0 or 1, in place.
void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
  tst(x, 0xff);
  cset(x, Assembler::NE);
}

// Emit an inline-cache call to 'entry': load Universe::non_oop_word()
// into rscratch2, then emit a trampoline_call carrying a virtual-call
// relocation built from the current pc() and method_index.  Returns
// whatever trampoline_call() returns.
address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions
//
// Register arguments are moved into c_rarg1..c_rarg3 highest-numbered
// first.  The "smashed arg" asserts check that a lower-numbered argument
// does not live in a register that is about to be overwritten.

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

// Variants taking an explicit last_java_sp; these go through
// call_VM_base with rthread as the thread register.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result,
last_java_sp, entry_point, 3, check_exceptions); 931 } 932 933 934 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) { 935 ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset())); 936 str(zr, Address(java_thread, JavaThread::vm_result_offset())); 937 verify_oop(oop_result, "broken oop in call_VM_base"); 938 } 939 940 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) { 941 ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset())); 942 str(zr, Address(java_thread, JavaThread::vm_result_2_offset())); 943 } 944 945 void MacroAssembler::align(int modulus) { 946 while (offset() % modulus != 0) nop(); 947 } 948 949 // these are no-ops overridden by InterpreterMacroAssembler 950 951 void MacroAssembler::check_and_handle_earlyret(Register java_thread) { } 952 953 void MacroAssembler::check_and_handle_popframe(Register java_thread) { } 954 955 956 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr, 957 Register tmp, 958 int offset) { 959 intptr_t value = *delayed_value_addr; 960 if (value != 0) 961 return RegisterOrConstant(value + offset); 962 963 // load indirectly to solve generation ordering problem 964 ldr(tmp, ExternalAddress((address) delayed_value_addr)); 965 966 if (offset != 0) 967 add(tmp, tmp, offset); 968 969 return RegisterOrConstant(tmp); 970 } 971 972 973 void MacroAssembler:: notify(int type) { 974 if (type == bytecode_start) { 975 // set_last_Java_frame(esp, rfp, (address)NULL); 976 Assembler:: notify(type); 977 // reset_last_Java_frame(true); 978 } 979 else 980 Assembler:: notify(type); 981 } 982 983 // Look up the method for a megamorphic invokeinterface call. 984 // The target method is determined by <intf_klass, itable_index>. 985 // The receiver klass is in recv_klass. 986 // On success, the result will be in method_result, and execution falls through. 987 // On failure, execution transfers to the given label. 
988 void MacroAssembler::lookup_interface_method(Register recv_klass, 989 Register intf_klass, 990 RegisterOrConstant itable_index, 991 Register method_result, 992 Register scan_temp, 993 Label& L_no_such_interface, 994 bool return_method) { 995 assert_different_registers(recv_klass, intf_klass, scan_temp); 996 assert_different_registers(method_result, intf_klass, scan_temp); 997 assert(recv_klass != method_result || !return_method, 998 "recv_klass can be destroyed when method isn't needed"); 999 assert(itable_index.is_constant() || itable_index.as_register() == method_result, 1000 "caller must use same register for non-constant itable index as for method"); 1001 1002 // Compute start of first itableOffsetEntry (which is at the end of the vtable) 1003 int vtable_base = in_bytes(Klass::vtable_start_offset()); 1004 int itentry_off = itableMethodEntry::method_offset_in_bytes(); 1005 int scan_step = itableOffsetEntry::size() * wordSize; 1006 int vte_size = vtableEntry::size_in_bytes(); 1007 assert(vte_size == wordSize, "else adjust times_vte_scale"); 1008 1009 ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset())); 1010 1011 // %%% Could store the aligned, prescaled offset in the klassoop. 1012 // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base)); 1013 lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3))); 1014 add(scan_temp, scan_temp, vtable_base); 1015 1016 if (return_method) { 1017 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 
1018 assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 1019 // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off)); 1020 lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3))); 1021 if (itentry_off) 1022 add(recv_klass, recv_klass, itentry_off); 1023 } 1024 1025 // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) { 1026 // if (scan->interface() == intf) { 1027 // result = (klass + scan->offset() + itable_index); 1028 // } 1029 // } 1030 Label search, found_method; 1031 1032 for (int peel = 1; peel >= 0; peel--) { 1033 ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes())); 1034 cmp(intf_klass, method_result); 1035 1036 if (peel) { 1037 br(Assembler::EQ, found_method); 1038 } else { 1039 br(Assembler::NE, search); 1040 // (invert the test to fall through to found_method...) 1041 } 1042 1043 if (!peel) break; 1044 1045 bind(search); 1046 1047 // Check that the previous entry is non-null. A null entry means that 1048 // the receiver class doesn't implement the interface, and wasn't the 1049 // same as when the caller was compiled. 1050 cbz(method_result, L_no_such_interface); 1051 add(scan_temp, scan_temp, scan_step); 1052 } 1053 1054 bind(found_method); 1055 1056 // Got a hit. 
1057 if (return_method) { 1058 ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes())); 1059 ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0))); 1060 } 1061 } 1062 1063 // virtual method calling 1064 void MacroAssembler::lookup_virtual_method(Register recv_klass, 1065 RegisterOrConstant vtable_index, 1066 Register method_result) { 1067 const int base = in_bytes(Klass::vtable_start_offset()); 1068 assert(vtableEntry::size() * wordSize == 8, 1069 "adjust the scaling in the code below"); 1070 int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes(); 1071 1072 if (vtable_index.is_register()) { 1073 lea(method_result, Address(recv_klass, 1074 vtable_index.as_register(), 1075 Address::lsl(LogBytesPerWord))); 1076 ldr(method_result, Address(method_result, vtable_offset_in_bytes)); 1077 } else { 1078 vtable_offset_in_bytes += vtable_index.as_constant() * wordSize; 1079 ldr(method_result, 1080 form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0)); 1081 } 1082 } 1083 1084 void MacroAssembler::check_klass_subtype(Register sub_klass, 1085 Register super_klass, 1086 Register temp_reg, 1087 Label& L_success) { 1088 Label L_failure; 1089 check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL); 1090 check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL); 1091 bind(L_failure); 1092 } 1093 1094 1095 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 1096 Register super_klass, 1097 Register temp_reg, 1098 Label* L_success, 1099 Label* L_failure, 1100 Label* L_slow_path, 1101 RegisterOrConstant super_check_offset) { 1102 assert_different_registers(sub_klass, super_klass, temp_reg); 1103 bool must_load_sco = (super_check_offset.constant_or_zero() == -1); 1104 if (super_check_offset.is_register()) { 1105 assert_different_registers(sub_klass, super_klass, 1106 super_check_offset.as_register()); 1107 } else if (must_load_sco) { 
1108 assert(temp_reg != noreg, "supply either a temp or a register offset"); 1109 } 1110 1111 Label L_fallthrough; 1112 int label_nulls = 0; 1113 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 1114 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 1115 if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; } 1116 assert(label_nulls <= 1, "at most one NULL in the batch"); 1117 1118 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1119 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1120 Address super_check_offset_addr(super_klass, sco_offset); 1121 1122 // Hacked jmp, which may only be used just before L_fallthrough. 1123 #define final_jmp(label) \ 1124 if (&(label) == &L_fallthrough) { /*do nothing*/ } \ 1125 else b(label) /*omit semi*/ 1126 1127 // If the pointers are equal, we are done (e.g., String[] elements). 1128 // This self-check enables sharing of secondary supertype arrays among 1129 // non-primary types such as array-of-interface. Otherwise, each such 1130 // type would need its own customized SSA. 1131 // We move this check to the front of the fast path because many 1132 // type checks are in fact trivially successful in this manner, 1133 // so we get a nicely predicted branch right at the start of the check. 1134 cmp(sub_klass, super_klass); 1135 br(Assembler::EQ, *L_success); 1136 1137 // Check the supertype display: 1138 if (must_load_sco) { 1139 ldrw(temp_reg, super_check_offset_addr); 1140 super_check_offset = RegisterOrConstant(temp_reg); 1141 } 1142 Address super_check_addr(sub_klass, super_check_offset); 1143 ldr(rscratch1, super_check_addr); 1144 cmp(super_klass, rscratch1); // load displayed supertype 1145 1146 // This check has worked decisively for primary supers. 1147 // Secondary supers are sought in the super_cache ('super_cache_addr'). 1148 // (Secondary supers are interfaces and very deeply nested subtypes.) 
1149 // This works in the same check above because of a tricky aliasing 1150 // between the super_cache and the primary super display elements. 1151 // (The 'super_check_addr' can address either, as the case requires.) 1152 // Note that the cache is updated below if it does not help us find 1153 // what we need immediately. 1154 // So if it was a primary super, we can just fail immediately. 1155 // Otherwise, it's the slow path for us (no success at this point). 1156 1157 if (super_check_offset.is_register()) { 1158 br(Assembler::EQ, *L_success); 1159 subs(zr, super_check_offset.as_register(), sc_offset); 1160 if (L_failure == &L_fallthrough) { 1161 br(Assembler::EQ, *L_slow_path); 1162 } else { 1163 br(Assembler::NE, *L_failure); 1164 final_jmp(*L_slow_path); 1165 } 1166 } else if (super_check_offset.as_constant() == sc_offset) { 1167 // Need a slow path; fast failure is impossible. 1168 if (L_slow_path == &L_fallthrough) { 1169 br(Assembler::EQ, *L_success); 1170 } else { 1171 br(Assembler::NE, *L_slow_path); 1172 final_jmp(*L_success); 1173 } 1174 } else { 1175 // No slow path; it's a fast decision. 
1176 if (L_failure == &L_fallthrough) { 1177 br(Assembler::EQ, *L_success); 1178 } else { 1179 br(Assembler::NE, *L_failure); 1180 final_jmp(*L_success); 1181 } 1182 } 1183 1184 bind(L_fallthrough); 1185 1186 #undef final_jmp 1187 } 1188 1189 // These two are taken from x86, but they look generally useful 1190 1191 // scans count pointer sized words at [addr] for occurence of value, 1192 // generic 1193 void MacroAssembler::repne_scan(Register addr, Register value, Register count, 1194 Register scratch) { 1195 Label Lloop, Lexit; 1196 cbz(count, Lexit); 1197 bind(Lloop); 1198 ldr(scratch, post(addr, wordSize)); 1199 cmp(value, scratch); 1200 br(EQ, Lexit); 1201 sub(count, count, 1); 1202 cbnz(count, Lloop); 1203 bind(Lexit); 1204 } 1205 1206 // scans count 4 byte words at [addr] for occurence of value, 1207 // generic 1208 void MacroAssembler::repne_scanw(Register addr, Register value, Register count, 1209 Register scratch) { 1210 Label Lloop, Lexit; 1211 cbz(count, Lexit); 1212 bind(Lloop); 1213 ldrw(scratch, post(addr, wordSize)); 1214 cmpw(value, scratch); 1215 br(EQ, Lexit); 1216 sub(count, count, 1); 1217 cbnz(count, Lloop); 1218 bind(Lexit); 1219 } 1220 1221 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 1222 Register super_klass, 1223 Register temp_reg, 1224 Register temp2_reg, 1225 Label* L_success, 1226 Label* L_failure, 1227 bool set_cond_codes) { 1228 assert_different_registers(sub_klass, super_klass, temp_reg); 1229 if (temp2_reg != noreg) 1230 assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1); 1231 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg) 1232 1233 Label L_fallthrough; 1234 int label_nulls = 0; 1235 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 1236 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 1237 assert(label_nulls <= 1, "at most one NULL in the batch"); 1238 1239 // a couple of useful fields in sub_klass: 1240 int 
ss_offset = in_bytes(Klass::secondary_supers_offset()); 1241 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1242 Address secondary_supers_addr(sub_klass, ss_offset); 1243 Address super_cache_addr( sub_klass, sc_offset); 1244 1245 BLOCK_COMMENT("check_klass_subtype_slow_path"); 1246 1247 // Do a linear scan of the secondary super-klass chain. 1248 // This code is rarely used, so simplicity is a virtue here. 1249 // The repne_scan instruction uses fixed registers, which we must spill. 1250 // Don't worry too much about pre-existing connections with the input regs. 1251 1252 assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super) 1253 assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter) 1254 1255 RegSet pushed_registers; 1256 if (!IS_A_TEMP(r2)) pushed_registers += r2; 1257 if (!IS_A_TEMP(r5)) pushed_registers += r5; 1258 1259 if (super_klass != r0 || UseCompressedOops) { 1260 if (!IS_A_TEMP(r0)) pushed_registers += r0; 1261 } 1262 1263 push(pushed_registers, sp); 1264 1265 // Get super_klass value into r0 (even if it was in r5 or r2). 1266 if (super_klass != r0) { 1267 mov(r0, super_klass); 1268 } 1269 1270 #ifndef PRODUCT 1271 mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr); 1272 Address pst_counter_addr(rscratch2); 1273 ldr(rscratch1, pst_counter_addr); 1274 add(rscratch1, rscratch1, 1); 1275 str(rscratch1, pst_counter_addr); 1276 #endif //PRODUCT 1277 1278 // We will consult the secondary-super array. 1279 ldr(r5, secondary_supers_addr); 1280 // Load the array length. 1281 ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes())); 1282 // Skip to start of data. 1283 add(r5, r5, Array<Klass*>::base_offset_in_bytes()); 1284 1285 cmp(sp, zr); // Clear Z flag; SP is never zero 1286 // Scan R2 words at [R5] for an occurrence of R0. 1287 // Set NZ/Z based on last compare. 1288 repne_scan(r5, r0, r2, rscratch1); 1289 1290 // Unspill the temp. 
registers: 1291 pop(pushed_registers, sp); 1292 1293 br(Assembler::NE, *L_failure); 1294 1295 // Success. Cache the super we found and proceed in triumph. 1296 str(super_klass, super_cache_addr); 1297 1298 if (L_success != &L_fallthrough) { 1299 b(*L_success); 1300 } 1301 1302 #undef IS_A_TEMP 1303 1304 bind(L_fallthrough); 1305 } 1306 1307 1308 void MacroAssembler::verify_oop(Register reg, const char* s) { 1309 if (!VerifyOops) return; 1310 1311 // Pass register number to verify_oop_subroutine 1312 const char* b = NULL; 1313 { 1314 ResourceMark rm; 1315 stringStream ss; 1316 ss.print("verify_oop: %s: %s", reg->name(), s); 1317 b = code_string(ss.as_string()); 1318 } 1319 BLOCK_COMMENT("verify_oop {"); 1320 1321 stp(r0, rscratch1, Address(pre(sp, -2 * wordSize))); 1322 stp(rscratch2, lr, Address(pre(sp, -2 * wordSize))); 1323 1324 mov(r0, reg); 1325 mov(rscratch1, (address)b); 1326 1327 // call indirectly to solve generation ordering problem 1328 lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address())); 1329 ldr(rscratch2, Address(rscratch2)); 1330 blr(rscratch2); 1331 1332 ldp(rscratch2, lr, Address(post(sp, 2 * wordSize))); 1333 ldp(r0, rscratch1, Address(post(sp, 2 * wordSize))); 1334 1335 BLOCK_COMMENT("} verify_oop"); 1336 } 1337 1338 void MacroAssembler::verify_oop_addr(Address addr, const char* s) { 1339 if (!VerifyOops) return; 1340 1341 const char* b = NULL; 1342 { 1343 ResourceMark rm; 1344 stringStream ss; 1345 ss.print("verify_oop_addr: %s", s); 1346 b = code_string(ss.as_string()); 1347 } 1348 BLOCK_COMMENT("verify_oop_addr {"); 1349 1350 stp(r0, rscratch1, Address(pre(sp, -2 * wordSize))); 1351 stp(rscratch2, lr, Address(pre(sp, -2 * wordSize))); 1352 1353 // addr may contain sp so we will have to adjust it based on the 1354 // pushes that we just did. 
1355 if (addr.uses(sp)) { 1356 lea(r0, addr); 1357 ldr(r0, Address(r0, 4 * wordSize)); 1358 } else { 1359 ldr(r0, addr); 1360 } 1361 mov(rscratch1, (address)b); 1362 1363 // call indirectly to solve generation ordering problem 1364 lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address())); 1365 ldr(rscratch2, Address(rscratch2)); 1366 blr(rscratch2); 1367 1368 ldp(rscratch2, lr, Address(post(sp, 2 * wordSize))); 1369 ldp(r0, rscratch1, Address(post(sp, 2 * wordSize))); 1370 1371 BLOCK_COMMENT("} verify_oop_addr"); 1372 } 1373 1374 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot, 1375 int extra_slot_offset) { 1376 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 1377 int stackElementSize = Interpreter::stackElementSize; 1378 int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0); 1379 #ifdef ASSERT 1380 int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1); 1381 assert(offset1 - offset == stackElementSize, "correct arithmetic"); 1382 #endif 1383 if (arg_slot.is_constant()) { 1384 return Address(esp, arg_slot.as_constant() * stackElementSize 1385 + offset); 1386 } else { 1387 add(rscratch1, esp, arg_slot.as_register(), 1388 ext::uxtx, exact_log2(stackElementSize)); 1389 return Address(rscratch1, offset); 1390 } 1391 } 1392 1393 void MacroAssembler::call_VM_leaf_base(address entry_point, 1394 int number_of_arguments, 1395 Label *retaddr) { 1396 call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr); 1397 } 1398 1399 void MacroAssembler::call_VM_leaf_base1(address entry_point, 1400 int number_of_gp_arguments, 1401 int number_of_fp_arguments, 1402 ret_type type, 1403 Label *retaddr) { 1404 Label E, L; 1405 1406 stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize))); 1407 1408 // We add 1 to number_of_arguments because the thread in arg0 is 1409 // not counted 1410 mov(rscratch1, entry_point); 1411 blrt(rscratch1, number_of_gp_arguments + 1, 
number_of_fp_arguments, type); 1412 if (retaddr) 1413 bind(*retaddr); 1414 1415 ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize))); 1416 maybe_isb(); 1417 } 1418 1419 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) { 1420 call_VM_leaf_base(entry_point, number_of_arguments); 1421 } 1422 1423 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) { 1424 pass_arg0(this, arg_0); 1425 call_VM_leaf_base(entry_point, 1); 1426 } 1427 1428 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) { 1429 pass_arg0(this, arg_0); 1430 pass_arg1(this, arg_1); 1431 call_VM_leaf_base(entry_point, 2); 1432 } 1433 1434 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, 1435 Register arg_1, Register arg_2) { 1436 pass_arg0(this, arg_0); 1437 pass_arg1(this, arg_1); 1438 pass_arg2(this, arg_2); 1439 call_VM_leaf_base(entry_point, 3); 1440 } 1441 1442 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) { 1443 pass_arg0(this, arg_0); 1444 MacroAssembler::call_VM_leaf_base(entry_point, 1); 1445 } 1446 1447 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) { 1448 1449 assert(arg_0 != c_rarg1, "smashed arg"); 1450 pass_arg1(this, arg_1); 1451 pass_arg0(this, arg_0); 1452 MacroAssembler::call_VM_leaf_base(entry_point, 2); 1453 } 1454 1455 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) { 1456 assert(arg_0 != c_rarg2, "smashed arg"); 1457 assert(arg_1 != c_rarg2, "smashed arg"); 1458 pass_arg2(this, arg_2); 1459 assert(arg_0 != c_rarg1, "smashed arg"); 1460 pass_arg1(this, arg_1); 1461 pass_arg0(this, arg_0); 1462 MacroAssembler::call_VM_leaf_base(entry_point, 3); 1463 } 1464 1465 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) { 1466 assert(arg_0 != c_rarg3, "smashed arg"); 1467 
assert(arg_1 != c_rarg3, "smashed arg"); 1468 assert(arg_2 != c_rarg3, "smashed arg"); 1469 pass_arg3(this, arg_3); 1470 assert(arg_0 != c_rarg2, "smashed arg"); 1471 assert(arg_1 != c_rarg2, "smashed arg"); 1472 pass_arg2(this, arg_2); 1473 assert(arg_0 != c_rarg1, "smashed arg"); 1474 pass_arg1(this, arg_1); 1475 pass_arg0(this, arg_0); 1476 MacroAssembler::call_VM_leaf_base(entry_point, 4); 1477 } 1478 1479 void MacroAssembler::null_check(Register reg, int offset) { 1480 if (needs_explicit_null_check(offset)) { 1481 // provoke OS NULL exception if reg = NULL by 1482 // accessing M[reg] w/o changing any registers 1483 // NOTE: this is plenty to provoke a segv 1484 ldr(zr, Address(reg)); 1485 } else { 1486 // nothing to do, (later) access of M[reg + offset] 1487 // will provoke OS NULL exception if reg = NULL 1488 } 1489 } 1490 1491 // MacroAssembler protected routines needed to implement 1492 // public methods 1493 1494 void MacroAssembler::mov(Register r, Address dest) { 1495 code_section()->relocate(pc(), dest.rspec()); 1496 u_int64_t imm64 = (u_int64_t)dest.target(); 1497 movptr(r, imm64); 1498 } 1499 1500 // Move a constant pointer into r. In AArch64 mode the virtual 1501 // address space is 48 bits in size, so we only need three 1502 // instructions to create a patchable instruction sequence that can 1503 // reach anywhere. 1504 void MacroAssembler::movptr(Register r, uintptr_t imm64) { 1505 #ifndef PRODUCT 1506 { 1507 char buffer[64]; 1508 snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64); 1509 block_comment(buffer); 1510 } 1511 #endif 1512 assert(imm64 < (1ul << 48), "48-bit overflow in address constant"); 1513 movz(r, imm64 & 0xffff); 1514 imm64 >>= 16; 1515 movk(r, imm64 & 0xffff, 16); 1516 imm64 >>= 16; 1517 movk(r, imm64 & 0xffff, 32); 1518 } 1519 1520 // Macro to mov replicated immediate to vector register. 
1521 // Vd will get the following values for different arrangements in T 1522 // imm32 == hex 000000gh T8B: Vd = ghghghghghghghgh 1523 // imm32 == hex 000000gh T16B: Vd = ghghghghghghghghghghghghghghghgh 1524 // imm32 == hex 0000efgh T4H: Vd = efghefghefghefgh 1525 // imm32 == hex 0000efgh T8H: Vd = efghefghefghefghefghefghefghefgh 1526 // imm32 == hex abcdefgh T2S: Vd = abcdefghabcdefgh 1527 // imm32 == hex abcdefgh T4S: Vd = abcdefghabcdefghabcdefghabcdefgh 1528 // T1D/T2D: invalid 1529 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) { 1530 assert(T != T1D && T != T2D, "invalid arrangement"); 1531 if (T == T8B || T == T16B) { 1532 assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)"); 1533 movi(Vd, T, imm32 & 0xff, 0); 1534 return; 1535 } 1536 u_int32_t nimm32 = ~imm32; 1537 if (T == T4H || T == T8H) { 1538 assert((imm32 & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)"); 1539 imm32 &= 0xffff; 1540 nimm32 &= 0xffff; 1541 } 1542 u_int32_t x = imm32; 1543 int movi_cnt = 0; 1544 int movn_cnt = 0; 1545 while (x) { if (x & 0xff) movi_cnt++; x >>= 8; } 1546 x = nimm32; 1547 while (x) { if (x & 0xff) movn_cnt++; x >>= 8; } 1548 if (movn_cnt < movi_cnt) imm32 = nimm32; 1549 unsigned lsl = 0; 1550 while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; } 1551 if (movn_cnt < movi_cnt) 1552 mvni(Vd, T, imm32 & 0xff, lsl); 1553 else 1554 movi(Vd, T, imm32 & 0xff, lsl); 1555 imm32 >>= 8; lsl += 8; 1556 while (imm32) { 1557 while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; } 1558 if (movn_cnt < movi_cnt) 1559 bici(Vd, T, imm32 & 0xff, lsl); 1560 else 1561 orri(Vd, T, imm32 & 0xff, lsl); 1562 lsl += 8; imm32 >>= 8; 1563 } 1564 } 1565 1566 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64) 1567 { 1568 #ifndef PRODUCT 1569 { 1570 char buffer[64]; 1571 snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64); 1572 block_comment(buffer); 1573 } 1574 #endif 1575 if 
(operand_valid_for_logical_immediate(false, imm64)) { 1576 orr(dst, zr, imm64); 1577 } else { 1578 // we can use a combination of MOVZ or MOVN with 1579 // MOVK to build up the constant 1580 u_int64_t imm_h[4]; 1581 int zero_count = 0; 1582 int neg_count = 0; 1583 int i; 1584 for (i = 0; i < 4; i++) { 1585 imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL); 1586 if (imm_h[i] == 0) { 1587 zero_count++; 1588 } else if (imm_h[i] == 0xffffL) { 1589 neg_count++; 1590 } 1591 } 1592 if (zero_count == 4) { 1593 // one MOVZ will do 1594 movz(dst, 0); 1595 } else if (neg_count == 4) { 1596 // one MOVN will do 1597 movn(dst, 0); 1598 } else if (zero_count == 3) { 1599 for (i = 0; i < 4; i++) { 1600 if (imm_h[i] != 0L) { 1601 movz(dst, (u_int32_t)imm_h[i], (i << 4)); 1602 break; 1603 } 1604 } 1605 } else if (neg_count == 3) { 1606 // one MOVN will do 1607 for (int i = 0; i < 4; i++) { 1608 if (imm_h[i] != 0xffffL) { 1609 movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4)); 1610 break; 1611 } 1612 } 1613 } else if (zero_count == 2) { 1614 // one MOVZ and one MOVK will do 1615 for (i = 0; i < 3; i++) { 1616 if (imm_h[i] != 0L) { 1617 movz(dst, (u_int32_t)imm_h[i], (i << 4)); 1618 i++; 1619 break; 1620 } 1621 } 1622 for (;i < 4; i++) { 1623 if (imm_h[i] != 0L) { 1624 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1625 } 1626 } 1627 } else if (neg_count == 2) { 1628 // one MOVN and one MOVK will do 1629 for (i = 0; i < 4; i++) { 1630 if (imm_h[i] != 0xffffL) { 1631 movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4)); 1632 i++; 1633 break; 1634 } 1635 } 1636 for (;i < 4; i++) { 1637 if (imm_h[i] != 0xffffL) { 1638 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1639 } 1640 } 1641 } else if (zero_count == 1) { 1642 // one MOVZ and two MOVKs will do 1643 for (i = 0; i < 4; i++) { 1644 if (imm_h[i] != 0L) { 1645 movz(dst, (u_int32_t)imm_h[i], (i << 4)); 1646 i++; 1647 break; 1648 } 1649 } 1650 for (;i < 4; i++) { 1651 if (imm_h[i] != 0x0L) { 1652 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1653 } 1654 } 
1655 } else if (neg_count == 1) { 1656 // one MOVN and two MOVKs will do 1657 for (i = 0; i < 4; i++) { 1658 if (imm_h[i] != 0xffffL) { 1659 movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4)); 1660 i++; 1661 break; 1662 } 1663 } 1664 for (;i < 4; i++) { 1665 if (imm_h[i] != 0xffffL) { 1666 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1667 } 1668 } 1669 } else { 1670 // use a MOVZ and 3 MOVKs (makes it easier to debug) 1671 movz(dst, (u_int32_t)imm_h[0], 0); 1672 for (i = 1; i < 4; i++) { 1673 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1674 } 1675 } 1676 } 1677 } 1678 1679 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32) 1680 { 1681 #ifndef PRODUCT 1682 { 1683 char buffer[64]; 1684 snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32); 1685 block_comment(buffer); 1686 } 1687 #endif 1688 if (operand_valid_for_logical_immediate(true, imm32)) { 1689 orrw(dst, zr, imm32); 1690 } else { 1691 // we can use MOVZ, MOVN or two calls to MOVK to build up the 1692 // constant 1693 u_int32_t imm_h[2]; 1694 imm_h[0] = imm32 & 0xffff; 1695 imm_h[1] = ((imm32 >> 16) & 0xffff); 1696 if (imm_h[0] == 0) { 1697 movzw(dst, imm_h[1], 16); 1698 } else if (imm_h[0] == 0xffff) { 1699 movnw(dst, imm_h[1] ^ 0xffff, 16); 1700 } else if (imm_h[1] == 0) { 1701 movzw(dst, imm_h[0], 0); 1702 } else if (imm_h[1] == 0xffff) { 1703 movnw(dst, imm_h[0] ^ 0xffff, 0); 1704 } else { 1705 // use a MOVZ and MOVK (makes it easier to debug) 1706 movzw(dst, imm_h[0], 0); 1707 movkw(dst, imm_h[1], 16); 1708 } 1709 } 1710 } 1711 1712 // Form an address from base + offset in Rd. Rd may or may 1713 // not actually be used: you must use the Address that is returned. 1714 // It is up to you to ensure that the shift provided matches the size 1715 // of your data. 
// Rd        - scratch register, written only when the offset cannot be
//             encoded directly.
// shift     - log2 of the access size; used to test encodability and
//             alignment of byte_offset.
// Returns an Address based on either base (offset fits) or Rd.
Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
  if (Address::offset_ok_for_immed(byte_offset, shift))
    // It fits; no need for any heroics
    return Address(base, byte_offset);

  // Don't do anything clever with negative or misaligned offsets
  unsigned mask = (1 << shift) - 1;
  if (byte_offset < 0 || byte_offset & mask) {
    mov(Rd, byte_offset);
    add(Rd, base, Rd);
    return Address(Rd);
  }

  // See if we can do this with two 12-bit offsets
  {
    // Split the scaled offset into bits 12..23 (added to base via an add
    // immediate) and the remainder (left as the Address displacement).
    unsigned long word_offset = byte_offset >> shift;
    unsigned long masked_offset = word_offset & 0xfff000;
    if (Address::offset_ok_for_immed(word_offset - masked_offset)
        && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
      add(Rd, base, masked_offset << shift);
      word_offset -= masked_offset;
      return Address(Rd, word_offset << shift);
    }
  }

  // Do it the hard way
  mov(Rd, byte_offset);
  add(Rd, base, Rd);
  return Address(Rd);
}

// Atomically increment the 32-bit word at counter_addr.  With UseLSE a
// single ldadd is used (tmp holds the addend, the old value is discarded
// into zr); otherwise a load-exclusive / store-exclusive retry loop is
// emitted.  tmp and tmp2 are clobbered.
void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
  if (UseLSE) {
    mov(tmp, 1);
    ldadd(Assembler::word, tmp, zr, counter_addr);
    return;
  }
  Label retry_load;
  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
    prfm(Address(counter_addr), PSTL1STRM);
  bind(retry_load);
  // flush and load exclusive from the memory location
  ldxrw(tmp, counter_addr);
  addw(tmp, tmp, 1);
  // the store-exclusive writes its status to tmp2; a non-zero status
  // sends us back to retry
  stxrw(tmp2, tmp, counter_addr);
  cbnzw(tmp2, retry_load);
}


int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
                                    bool want_remainder, Register scratch)
{
  // Full implementation of Java idiv and irem.  The function
  // returns the (pc) offset of the div instruction - may be needed
  // for implicit exceptions.
  //
  // constraint : ra/rb =/= scratch
  //         normal case
  //
  // input : ra: dividend
  //         rb: divisor
  //
  // result: either
  //         quotient  (= ra idiv rb)
  //         remainder (= ra irem rb)
  //
  // The remainder is computed as ra - (ra / rb) * rb via msubw, with the
  // quotient held in scratch.

  assert(ra != scratch && rb != scratch, "reg cannot be scratch");

  int idivl_offset = offset();
  if (! want_remainder) {
    sdivw(result, ra, rb);
  } else {
    sdivw(scratch, ra, rb);
    Assembler::msubw(result, scratch, rb, ra);
  }

  return idivl_offset;
}

int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
                                    bool want_remainder, Register scratch)
{
  // Full implementation of Java ldiv and lrem.  The function
  // returns the (pc) offset of the div instruction - may be needed
  // for implicit exceptions.
  //
  // constraint : ra/rb =/= scratch
  //         normal case
  //
  // input : ra: dividend
  //         rb: divisor
  //
  // result: either
  //         quotient  (= ra idiv rb)
  //         remainder (= ra irem rb)
  //
  // 64-bit counterpart of corrected_idivl (sdiv / msub).

  assert(ra != scratch && rb != scratch, "reg cannot be scratch");

  int idivq_offset = offset();
  if (! want_remainder) {
    sdiv(result, ra, rb);
  } else {
    sdiv(scratch, ra, rb);
    Assembler::msub(result, scratch, rb, ra);
  }

  return idivq_offset;
}

// Emit a memory barrier for order_constraint.  If the instruction
// immediately before the current pc is the barrier recorded as the code
// buffer's last_insn, no new instruction is emitted: the existing barrier's
// kind is widened instead.
void MacroAssembler::membar(Membar_mask_bits order_constraint) {
  address prev = pc() - NativeMembar::instruction_size;
  address last = code()->last_insn();
  if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
    NativeMembar *bar = NativeMembar_at(prev);
    // We are merging two memory barrier instructions.  On AArch64 we
    // can do this simply by ORing them together.
    bar->set_kind(bar->get_kind() | order_constraint);
    BLOCK_COMMENT("merged membar");
  } else {
    // Remember where this barrier starts so that a following membar can
    // merge into it.
    code()->set_last_insn(pc());
    dmb(Assembler::barrier(order_constraint));
  }
}

// Try to fuse the load/store being emitted with the previous one.
// Returns true when merge_ldst handled it (nothing more to emit).
// Returns false when the caller must emit the plain instruction itself; in
// that case the current pc is recorded as last_insn if the address is
// base_plus_offset with a size-aligned offset, so the next access may merge.
bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
  if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
    merge_ldst(rt, adr, size_in_bytes, is_store);
    code()->clear_last_insn();
    return true;
  } else {
    assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported.");
    const unsigned mask = size_in_bytes - 1;
    if (adr.getMode() == Address::base_plus_offset &&
        (adr.offset() & mask) == 0) { // only supports base_plus_offset.
      code()->set_last_insn(pc());
    }
    return false;
  }
}

// The four wrappers below route 8-byte (ldr/str) and 4-byte (ldrw/strw)
// accesses through try_merge_ldst and fall back to the plain Assembler
// instruction when no merge happened.

void MacroAssembler::ldr(Register Rx, const Address &adr) {
  // We always try to merge two adjacent loads into one ldp.
  if (!try_merge_ldst(Rx, adr, 8, false)) {
    Assembler::ldr(Rx, adr);
  }
}

void MacroAssembler::ldrw(Register Rw, const Address &adr) {
  // We always try to merge two adjacent loads into one ldp.
  if (!try_merge_ldst(Rw, adr, 4, false)) {
    Assembler::ldrw(Rw, adr);
  }
}

void MacroAssembler::str(Register Rx, const Address &adr) {
  // We always try to merge two adjacent stores into one stp.
  if (!try_merge_ldst(Rx, adr, 8, true)) {
    Assembler::str(Rx, adr);
  }
}

void MacroAssembler::strw(Register Rw, const Address &adr) {
  // We always try to merge two adjacent stores into one stp.
1880 if (!try_merge_ldst(Rw, adr, 4, true)) { 1881 Assembler::strw(Rw, adr); 1882 } 1883 } 1884 1885 // MacroAssembler routines found actually to be needed 1886 1887 void MacroAssembler::push(Register src) 1888 { 1889 str(src, Address(pre(esp, -1 * wordSize))); 1890 } 1891 1892 void MacroAssembler::pop(Register dst) 1893 { 1894 ldr(dst, Address(post(esp, 1 * wordSize))); 1895 } 1896 1897 // Note: load_unsigned_short used to be called load_unsigned_word. 1898 int MacroAssembler::load_unsigned_short(Register dst, Address src) { 1899 int off = offset(); 1900 ldrh(dst, src); 1901 return off; 1902 } 1903 1904 int MacroAssembler::load_unsigned_byte(Register dst, Address src) { 1905 int off = offset(); 1906 ldrb(dst, src); 1907 return off; 1908 } 1909 1910 int MacroAssembler::load_signed_short(Register dst, Address src) { 1911 int off = offset(); 1912 ldrsh(dst, src); 1913 return off; 1914 } 1915 1916 int MacroAssembler::load_signed_byte(Register dst, Address src) { 1917 int off = offset(); 1918 ldrsb(dst, src); 1919 return off; 1920 } 1921 1922 int MacroAssembler::load_signed_short32(Register dst, Address src) { 1923 int off = offset(); 1924 ldrshw(dst, src); 1925 return off; 1926 } 1927 1928 int MacroAssembler::load_signed_byte32(Register dst, Address src) { 1929 int off = offset(); 1930 ldrsbw(dst, src); 1931 return off; 1932 } 1933 1934 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) { 1935 switch (size_in_bytes) { 1936 case 8: ldr(dst, src); break; 1937 case 4: ldrw(dst, src); break; 1938 case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break; 1939 case 1: is_signed ? 
load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break; 1940 default: ShouldNotReachHere(); 1941 } 1942 } 1943 1944 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) { 1945 switch (size_in_bytes) { 1946 case 8: str(src, dst); break; 1947 case 4: strw(src, dst); break; 1948 case 2: strh(src, dst); break; 1949 case 1: strb(src, dst); break; 1950 default: ShouldNotReachHere(); 1951 } 1952 } 1953 1954 void MacroAssembler::decrementw(Register reg, int value) 1955 { 1956 if (value < 0) { incrementw(reg, -value); return; } 1957 if (value == 0) { return; } 1958 if (value < (1 << 12)) { subw(reg, reg, value); return; } 1959 /* else */ { 1960 guarantee(reg != rscratch2, "invalid dst for register decrement"); 1961 movw(rscratch2, (unsigned)value); 1962 subw(reg, reg, rscratch2); 1963 } 1964 } 1965 1966 void MacroAssembler::decrement(Register reg, int value) 1967 { 1968 if (value < 0) { increment(reg, -value); return; } 1969 if (value == 0) { return; } 1970 if (value < (1 << 12)) { sub(reg, reg, value); return; } 1971 /* else */ { 1972 assert(reg != rscratch2, "invalid dst for register decrement"); 1973 mov(rscratch2, (unsigned long)value); 1974 sub(reg, reg, rscratch2); 1975 } 1976 } 1977 1978 void MacroAssembler::decrementw(Address dst, int value) 1979 { 1980 assert(!dst.uses(rscratch1), "invalid dst for address decrement"); 1981 if (dst.getMode() == Address::literal) { 1982 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 1983 lea(rscratch2, dst); 1984 dst = Address(rscratch2); 1985 } 1986 ldrw(rscratch1, dst); 1987 decrementw(rscratch1, value); 1988 strw(rscratch1, dst); 1989 } 1990 1991 void MacroAssembler::decrement(Address dst, int value) 1992 { 1993 assert(!dst.uses(rscratch1), "invalid address for decrement"); 1994 if (dst.getMode() == Address::literal) { 1995 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 1996 lea(rscratch2, dst); 1997 dst = 
Address(rscratch2); 1998 } 1999 ldr(rscratch1, dst); 2000 decrement(rscratch1, value); 2001 str(rscratch1, dst); 2002 } 2003 2004 void MacroAssembler::incrementw(Register reg, int value) 2005 { 2006 if (value < 0) { decrementw(reg, -value); return; } 2007 if (value == 0) { return; } 2008 if (value < (1 << 12)) { addw(reg, reg, value); return; } 2009 /* else */ { 2010 assert(reg != rscratch2, "invalid dst for register increment"); 2011 movw(rscratch2, (unsigned)value); 2012 addw(reg, reg, rscratch2); 2013 } 2014 } 2015 2016 void MacroAssembler::increment(Register reg, int value) 2017 { 2018 if (value < 0) { decrement(reg, -value); return; } 2019 if (value == 0) { return; } 2020 if (value < (1 << 12)) { add(reg, reg, value); return; } 2021 /* else */ { 2022 assert(reg != rscratch2, "invalid dst for register increment"); 2023 movw(rscratch2, (unsigned)value); 2024 add(reg, reg, rscratch2); 2025 } 2026 } 2027 2028 void MacroAssembler::incrementw(Address dst, int value) 2029 { 2030 assert(!dst.uses(rscratch1), "invalid dst for address increment"); 2031 if (dst.getMode() == Address::literal) { 2032 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 2033 lea(rscratch2, dst); 2034 dst = Address(rscratch2); 2035 } 2036 ldrw(rscratch1, dst); 2037 incrementw(rscratch1, value); 2038 strw(rscratch1, dst); 2039 } 2040 2041 void MacroAssembler::increment(Address dst, int value) 2042 { 2043 assert(!dst.uses(rscratch1), "invalid dst for address increment"); 2044 if (dst.getMode() == Address::literal) { 2045 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 2046 lea(rscratch2, dst); 2047 dst = Address(rscratch2); 2048 } 2049 ldr(rscratch1, dst); 2050 increment(rscratch1, value); 2051 str(rscratch1, dst); 2052 } 2053 2054 2055 void MacroAssembler::pusha() { 2056 push(0x7fffffff, sp); 2057 } 2058 2059 void MacroAssembler::popa() { 2060 pop(0x7fffffff, sp); 2061 } 2062 2063 // Push lots of registers in the bit set supplied. 
Don't push sp. 2064 // Return the number of words pushed 2065 int MacroAssembler::push(unsigned int bitset, Register stack) { 2066 int words_pushed = 0; 2067 2068 // Scan bitset to accumulate register pairs 2069 unsigned char regs[32]; 2070 int count = 0; 2071 for (int reg = 0; reg <= 30; reg++) { 2072 if (1 & bitset) 2073 regs[count++] = reg; 2074 bitset >>= 1; 2075 } 2076 regs[count++] = zr->encoding_nocheck(); 2077 count &= ~1; // Only push an even nuber of regs 2078 2079 if (count) { 2080 stp(as_Register(regs[0]), as_Register(regs[1]), 2081 Address(pre(stack, -count * wordSize))); 2082 words_pushed += 2; 2083 } 2084 for (int i = 2; i < count; i += 2) { 2085 stp(as_Register(regs[i]), as_Register(regs[i+1]), 2086 Address(stack, i * wordSize)); 2087 words_pushed += 2; 2088 } 2089 2090 assert(words_pushed == count, "oops, pushed != count"); 2091 2092 return count; 2093 } 2094 2095 int MacroAssembler::pop(unsigned int bitset, Register stack) { 2096 int words_pushed = 0; 2097 2098 // Scan bitset to accumulate register pairs 2099 unsigned char regs[32]; 2100 int count = 0; 2101 for (int reg = 0; reg <= 30; reg++) { 2102 if (1 & bitset) 2103 regs[count++] = reg; 2104 bitset >>= 1; 2105 } 2106 regs[count++] = zr->encoding_nocheck(); 2107 count &= ~1; 2108 2109 for (int i = 2; i < count; i += 2) { 2110 ldp(as_Register(regs[i]), as_Register(regs[i+1]), 2111 Address(stack, i * wordSize)); 2112 words_pushed += 2; 2113 } 2114 if (count) { 2115 ldp(as_Register(regs[0]), as_Register(regs[1]), 2116 Address(post(stack, count * wordSize))); 2117 words_pushed += 2; 2118 } 2119 2120 assert(words_pushed == count, "oops, pushed != count"); 2121 2122 return count; 2123 } 2124 #ifdef ASSERT 2125 void MacroAssembler::verify_heapbase(const char* msg) { 2126 #if 0 2127 assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed"); 2128 assert (Universe::heap() != NULL, "java heap should be initialized"); 2129 if (CheckCompressedOops) { 2130 Label ok; 2131 push(1 << 
rscratch1->encoding(), sp); // cmpptr trashes rscratch1 2132 cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr())); 2133 br(Assembler::EQ, ok); 2134 stop(msg); 2135 bind(ok); 2136 pop(1 << rscratch1->encoding(), sp); 2137 } 2138 #endif 2139 } 2140 #endif 2141 2142 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) { 2143 Label done, not_weak; 2144 cbz(value, done); // Use NULL as-is. 2145 2146 STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u); 2147 tbz(r0, 0, not_weak); // Test for jweak tag. 2148 2149 // Resolve jweak. 2150 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value, 2151 Address(value, -JNIHandles::weak_tag_value), tmp, thread); 2152 verify_oop(value); 2153 b(done); 2154 2155 bind(not_weak); 2156 // Resolve (untagged) jobject. 2157 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread); 2158 verify_oop(value); 2159 bind(done); 2160 } 2161 2162 void MacroAssembler::stop(const char* msg) { 2163 address ip = pc(); 2164 pusha(); 2165 mov(c_rarg0, (address)msg); 2166 mov(c_rarg1, (address)ip); 2167 mov(c_rarg2, sp); 2168 mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 2169 // call(c_rarg3); 2170 blrt(c_rarg3, 3, 0, 1); 2171 hlt(0); 2172 } 2173 2174 void MacroAssembler::unimplemented(const char* what) { 2175 const char* buf = NULL; 2176 { 2177 ResourceMark rm; 2178 stringStream ss; 2179 ss.print("unimplemented: %s", what); 2180 buf = code_string(ss.as_string()); 2181 } 2182 stop(buf); 2183 } 2184 2185 // If a constant does not fit in an immediate field, generate some 2186 // number of MOV instructions and then perform the operation. 
2187 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm, 2188 add_sub_imm_insn insn1, 2189 add_sub_reg_insn insn2) { 2190 assert(Rd != zr, "Rd = zr and not setting flags?"); 2191 if (operand_valid_for_add_sub_immediate((int)imm)) { 2192 (this->*insn1)(Rd, Rn, imm); 2193 } else { 2194 if (uabs(imm) < (1 << 24)) { 2195 (this->*insn1)(Rd, Rn, imm & -(1 << 12)); 2196 (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1)); 2197 } else { 2198 assert_different_registers(Rd, Rn); 2199 mov(Rd, (uint64_t)imm); 2200 (this->*insn2)(Rd, Rn, Rd, LSL, 0); 2201 } 2202 } 2203 } 2204 2205 // Seperate vsn which sets the flags. Optimisations are more restricted 2206 // because we must set the flags correctly. 2207 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm, 2208 add_sub_imm_insn insn1, 2209 add_sub_reg_insn insn2) { 2210 if (operand_valid_for_add_sub_immediate((int)imm)) { 2211 (this->*insn1)(Rd, Rn, imm); 2212 } else { 2213 assert_different_registers(Rd, Rn); 2214 assert(Rd != zr, "overflow in immediate operand"); 2215 mov(Rd, (uint64_t)imm); 2216 (this->*insn2)(Rd, Rn, Rd, LSL, 0); 2217 } 2218 } 2219 2220 2221 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) { 2222 if (increment.is_register()) { 2223 add(Rd, Rn, increment.as_register()); 2224 } else { 2225 add(Rd, Rn, increment.as_constant()); 2226 } 2227 } 2228 2229 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) { 2230 if (increment.is_register()) { 2231 addw(Rd, Rn, increment.as_register()); 2232 } else { 2233 addw(Rd, Rn, increment.as_constant()); 2234 } 2235 } 2236 2237 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) { 2238 if (decrement.is_register()) { 2239 sub(Rd, Rn, decrement.as_register()); 2240 } else { 2241 sub(Rd, Rn, decrement.as_constant()); 2242 } 2243 } 2244 2245 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) { 2246 if 
(decrement.is_register()) { 2247 subw(Rd, Rn, decrement.as_register()); 2248 } else { 2249 subw(Rd, Rn, decrement.as_constant()); 2250 } 2251 } 2252 2253 void MacroAssembler::reinit_heapbase() 2254 { 2255 if (UseCompressedOops) { 2256 if (Universe::is_fully_initialized()) { 2257 mov(rheapbase, Universe::narrow_ptrs_base()); 2258 } else { 2259 lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr())); 2260 ldr(rheapbase, Address(rheapbase)); 2261 } 2262 } 2263 } 2264 2265 // this simulates the behaviour of the x86 cmpxchg instruction using a 2266 // load linked/store conditional pair. we use the acquire/release 2267 // versions of these instructions so that we flush pending writes as 2268 // per Java semantics. 2269 2270 // n.b the x86 version assumes the old value to be compared against is 2271 // in rax and updates rax with the value located in memory if the 2272 // cmpxchg fails. we supply a register for the old value explicitly 2273 2274 // the aarch64 load linked/store conditional instructions do not 2275 // accept an offset. so, unlike x86, we must provide a plain register 2276 // to identify the memory word to be compared/exchanged rather than a 2277 // register+offset Address. 
2278 2279 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp, 2280 Label &succeed, Label *fail) { 2281 // oldv holds comparison value 2282 // newv holds value to write in exchange 2283 // addr identifies memory word to compare against/update 2284 if (UseLSE) { 2285 mov(tmp, oldv); 2286 casal(Assembler::xword, oldv, newv, addr); 2287 cmp(tmp, oldv); 2288 br(Assembler::EQ, succeed); 2289 membar(AnyAny); 2290 } else { 2291 Label retry_load, nope; 2292 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) 2293 prfm(Address(addr), PSTL1STRM); 2294 bind(retry_load); 2295 // flush and load exclusive from the memory location 2296 // and fail if it is not what we expect 2297 ldaxr(tmp, addr); 2298 cmp(tmp, oldv); 2299 br(Assembler::NE, nope); 2300 // if we store+flush with no intervening write tmp wil be zero 2301 stlxr(tmp, newv, addr); 2302 cbzw(tmp, succeed); 2303 // retry so we only ever return after a load fails to compare 2304 // ensures we don't return a stale value after a failed write. 
2305 b(retry_load); 2306 // if the memory word differs we return it in oldv and signal a fail 2307 bind(nope); 2308 membar(AnyAny); 2309 mov(oldv, tmp); 2310 } 2311 if (fail) 2312 b(*fail); 2313 } 2314 2315 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp, 2316 Label &succeed, Label *fail) { 2317 assert(oopDesc::mark_offset_in_bytes() == 0, "assumption"); 2318 cmpxchgptr(oldv, newv, obj, tmp, succeed, fail); 2319 } 2320 2321 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp, 2322 Label &succeed, Label *fail) { 2323 // oldv holds comparison value 2324 // newv holds value to write in exchange 2325 // addr identifies memory word to compare against/update 2326 // tmp returns 0/1 for success/failure 2327 if (UseLSE) { 2328 mov(tmp, oldv); 2329 casal(Assembler::word, oldv, newv, addr); 2330 cmp(tmp, oldv); 2331 br(Assembler::EQ, succeed); 2332 membar(AnyAny); 2333 } else { 2334 Label retry_load, nope; 2335 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) 2336 prfm(Address(addr), PSTL1STRM); 2337 bind(retry_load); 2338 // flush and load exclusive from the memory location 2339 // and fail if it is not what we expect 2340 ldaxrw(tmp, addr); 2341 cmp(tmp, oldv); 2342 br(Assembler::NE, nope); 2343 // if we store+flush with no intervening write tmp wil be zero 2344 stlxrw(tmp, newv, addr); 2345 cbzw(tmp, succeed); 2346 // retry so we only ever return after a load fails to compare 2347 // ensures we don't return a stale value after a failed write. 2348 b(retry_load); 2349 // if the memory word differs we return it in oldv and signal a fail 2350 bind(nope); 2351 membar(AnyAny); 2352 mov(oldv, tmp); 2353 } 2354 if (fail) 2355 b(*fail); 2356 } 2357 2358 // A generic CAS; success or failure is in the EQ flag. A weak CAS 2359 // doesn't retry and may fail spuriously. If the oldval is wanted, 2360 // Pass a register for the result, otherwise pass noreg. 
2361 2362 // Clobbers rscratch1 2363 void MacroAssembler::cmpxchg(Register addr, Register expected, 2364 Register new_val, 2365 enum operand_size size, 2366 bool acquire, bool release, 2367 bool weak, 2368 Register result) { 2369 if (result == noreg) result = rscratch1; 2370 BLOCK_COMMENT("cmpxchg {"); 2371 if (UseLSE) { 2372 mov(result, expected); 2373 lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true); 2374 compare_eq(result, expected, size); 2375 } else { 2376 Label retry_load, done; 2377 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) 2378 prfm(Address(addr), PSTL1STRM); 2379 bind(retry_load); 2380 load_exclusive(result, addr, size, acquire); 2381 compare_eq(result, expected, size); 2382 br(Assembler::NE, done); 2383 store_exclusive(rscratch1, new_val, addr, size, release); 2384 if (weak) { 2385 cmpw(rscratch1, 0u); // If the store fails, return NE to our caller. 2386 } else { 2387 cbnzw(rscratch1, retry_load); 2388 } 2389 bind(done); 2390 } 2391 BLOCK_COMMENT("} cmpxchg"); 2392 } 2393 2394 // A generic comparison. Only compares for equality, clobbers rscratch1. 2395 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) { 2396 if (size == xword) { 2397 cmp(rm, rn); 2398 } else if (size == word) { 2399 cmpw(rm, rn); 2400 } else if (size == halfword) { 2401 eorw(rscratch1, rm, rn); 2402 ands(zr, rscratch1, 0xffff); 2403 } else if (size == byte) { 2404 eorw(rscratch1, rm, rn); 2405 ands(zr, rscratch1, 0xff); 2406 } else { 2407 ShouldNotReachHere(); 2408 } 2409 } 2410 2411 2412 static bool different(Register a, RegisterOrConstant b, Register c) { 2413 if (b.is_constant()) 2414 return a != c; 2415 else 2416 return a != b.as_register() && a != c && b.as_register() != c; 2417 } 2418 2419 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz) \ 2420 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \ 2421 if (UseLSE) { \ 2422 prev = prev->is_valid() ? 
prev : zr; \ 2423 if (incr.is_register()) { \ 2424 AOP(sz, incr.as_register(), prev, addr); \ 2425 } else { \ 2426 mov(rscratch2, incr.as_constant()); \ 2427 AOP(sz, rscratch2, prev, addr); \ 2428 } \ 2429 return; \ 2430 } \ 2431 Register result = rscratch2; \ 2432 if (prev->is_valid()) \ 2433 result = different(prev, incr, addr) ? prev : rscratch2; \ 2434 \ 2435 Label retry_load; \ 2436 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) \ 2437 prfm(Address(addr), PSTL1STRM); \ 2438 bind(retry_load); \ 2439 LDXR(result, addr); \ 2440 OP(rscratch1, result, incr); \ 2441 STXR(rscratch2, rscratch1, addr); \ 2442 cbnzw(rscratch2, retry_load); \ 2443 if (prev->is_valid() && prev != result) { \ 2444 IOP(prev, rscratch1, incr); \ 2445 } \ 2446 } 2447 2448 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword) 2449 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word) 2450 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword) 2451 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word) 2452 2453 #undef ATOMIC_OP 2454 2455 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz) \ 2456 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \ 2457 if (UseLSE) { \ 2458 prev = prev->is_valid() ? prev : zr; \ 2459 AOP(sz, newv, prev, addr); \ 2460 return; \ 2461 } \ 2462 Register result = rscratch2; \ 2463 if (prev->is_valid()) \ 2464 result = different(prev, newv, addr) ? 
prev : rscratch2; \ 2465 \ 2466 Label retry_load; \ 2467 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) \ 2468 prfm(Address(addr), PSTL1STRM); \ 2469 bind(retry_load); \ 2470 LDXR(result, addr); \ 2471 STXR(rscratch1, newv, addr); \ 2472 cbnzw(rscratch1, retry_load); \ 2473 if (prev->is_valid() && prev != result) \ 2474 mov(prev, result); \ 2475 } 2476 2477 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword) 2478 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word) 2479 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword) 2480 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word) 2481 2482 #undef ATOMIC_XCHG 2483 2484 #ifndef PRODUCT 2485 extern "C" void findpc(intptr_t x); 2486 #endif 2487 2488 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) 2489 { 2490 // In order to get locks to work, we need to fake a in_VM state 2491 if (ShowMessageBoxOnError ) { 2492 JavaThread* thread = JavaThread::current(); 2493 JavaThreadState saved_state = thread->thread_state(); 2494 thread->set_thread_state(_thread_in_vm); 2495 #ifndef PRODUCT 2496 if (CountBytecodes || TraceBytecodes || StopInterpreterAt) { 2497 ttyLocker ttyl; 2498 BytecodeCounter::print(); 2499 } 2500 #endif 2501 if (os::message_box(msg, "Execution stopped, print registers?")) { 2502 ttyLocker ttyl; 2503 tty->print_cr(" pc = 0x%016lx", pc); 2504 #ifndef PRODUCT 2505 tty->cr(); 2506 findpc(pc); 2507 tty->cr(); 2508 #endif 2509 tty->print_cr(" r0 = 0x%016lx", regs[0]); 2510 tty->print_cr(" r1 = 0x%016lx", regs[1]); 2511 tty->print_cr(" r2 = 0x%016lx", regs[2]); 2512 tty->print_cr(" r3 = 0x%016lx", regs[3]); 2513 tty->print_cr(" r4 = 0x%016lx", regs[4]); 2514 tty->print_cr(" r5 = 0x%016lx", regs[5]); 2515 tty->print_cr(" r6 = 0x%016lx", regs[6]); 2516 tty->print_cr(" r7 = 0x%016lx", regs[7]); 2517 tty->print_cr(" r8 = 0x%016lx", regs[8]); 2518 tty->print_cr(" r9 = 0x%016lx", regs[9]); 2519 tty->print_cr("r10 = 0x%016lx", regs[10]); 2520 tty->print_cr("r11 = 0x%016lx", 
regs[11]); 2521 tty->print_cr("r12 = 0x%016lx", regs[12]); 2522 tty->print_cr("r13 = 0x%016lx", regs[13]); 2523 tty->print_cr("r14 = 0x%016lx", regs[14]); 2524 tty->print_cr("r15 = 0x%016lx", regs[15]); 2525 tty->print_cr("r16 = 0x%016lx", regs[16]); 2526 tty->print_cr("r17 = 0x%016lx", regs[17]); 2527 tty->print_cr("r18 = 0x%016lx", regs[18]); 2528 tty->print_cr("r19 = 0x%016lx", regs[19]); 2529 tty->print_cr("r20 = 0x%016lx", regs[20]); 2530 tty->print_cr("r21 = 0x%016lx", regs[21]); 2531 tty->print_cr("r22 = 0x%016lx", regs[22]); 2532 tty->print_cr("r23 = 0x%016lx", regs[23]); 2533 tty->print_cr("r24 = 0x%016lx", regs[24]); 2534 tty->print_cr("r25 = 0x%016lx", regs[25]); 2535 tty->print_cr("r26 = 0x%016lx", regs[26]); 2536 tty->print_cr("r27 = 0x%016lx", regs[27]); 2537 tty->print_cr("r28 = 0x%016lx", regs[28]); 2538 tty->print_cr("r30 = 0x%016lx", regs[30]); 2539 tty->print_cr("r31 = 0x%016lx", regs[31]); 2540 BREAKPOINT; 2541 } 2542 ThreadStateTransition::transition(thread, _thread_in_vm, saved_state); 2543 } else { 2544 ttyLocker ttyl; 2545 ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", 2546 msg); 2547 assert(false, "DEBUG MESSAGE: %s", msg); 2548 } 2549 } 2550 2551 #ifdef BUILTIN_SIM 2552 // routine to generate an x86 prolog for a stub function which 2553 // bootstraps into the generated ARM code which directly follows the 2554 // stub 2555 // 2556 // the argument encodes the number of general and fp registers 2557 // passed by the caller and the callng convention (currently just 2558 // the number of general registers and assumes C argument passing) 2559 2560 extern "C" { 2561 int aarch64_stub_prolog_size(); 2562 void aarch64_stub_prolog(); 2563 void aarch64_prolog(); 2564 } 2565 2566 void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type, 2567 address *prolog_ptr) 2568 { 2569 int calltype = (((ret_type & 0x3) << 8) | 2570 ((fp_arg_count & 0xf) << 4) | 2571 (gp_arg_count & 0xf)); 2572 2573 // the 
addresses for the x86 to ARM entry code we need to use 2574 address start = pc(); 2575 // printf("start = %lx\n", start); 2576 int byteCount = aarch64_stub_prolog_size(); 2577 // printf("byteCount = %x\n", byteCount); 2578 int instructionCount = (byteCount + 3)/ 4; 2579 // printf("instructionCount = %x\n", instructionCount); 2580 for (int i = 0; i < instructionCount; i++) { 2581 nop(); 2582 } 2583 2584 memcpy(start, (void*)aarch64_stub_prolog, byteCount); 2585 2586 // write the address of the setup routine and the call format at the 2587 // end of into the copied code 2588 u_int64_t *patch_end = (u_int64_t *)(start + byteCount); 2589 if (prolog_ptr) 2590 patch_end[-2] = (u_int64_t)prolog_ptr; 2591 patch_end[-1] = calltype; 2592 } 2593 #endif 2594 2595 void MacroAssembler::push_call_clobbered_registers() { 2596 int step = 4 * wordSize; 2597 push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp); 2598 sub(sp, sp, step); 2599 mov(rscratch1, -step); 2600 // Push v0-v7, v16-v31. 2601 for (int i = 31; i>= 4; i -= 4) { 2602 if (i <= v7->encoding() || i >= v16->encoding()) 2603 st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1), 2604 as_FloatRegister(i), T1D, Address(post(sp, rscratch1))); 2605 } 2606 st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2), 2607 as_FloatRegister(3), T1D, Address(sp)); 2608 } 2609 2610 void MacroAssembler::pop_call_clobbered_registers() { 2611 for (int i = 0; i < 32; i += 4) { 2612 if (i <= v7->encoding() || i >= v16->encoding()) 2613 ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), 2614 as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize))); 2615 } 2616 2617 pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp); 2618 } 2619 2620 void MacroAssembler::push_CPU_state(bool save_vectors) { 2621 int step = (save_vectors ? 
8 : 4) * wordSize; 2622 push(0x3fffffff, sp); // integer registers except lr & sp 2623 mov(rscratch1, -step); 2624 sub(sp, sp, step); 2625 for (int i = 28; i >= 4; i -= 4) { 2626 st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), 2627 as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1))); 2628 } 2629 st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp); 2630 } 2631 2632 void MacroAssembler::pop_CPU_state(bool restore_vectors) { 2633 int step = (restore_vectors ? 8 : 4) * wordSize; 2634 for (int i = 0; i <= 28; i += 4) 2635 ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), 2636 as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step))); 2637 pop(0x3fffffff, sp); // integer registers except lr & sp 2638 } 2639 2640 /** 2641 * Helpers for multiply_to_len(). 2642 */ 2643 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo, 2644 Register src1, Register src2) { 2645 adds(dest_lo, dest_lo, src1); 2646 adc(dest_hi, dest_hi, zr); 2647 adds(dest_lo, dest_lo, src2); 2648 adc(final_dest_hi, dest_hi, zr); 2649 } 2650 2651 // Generate an address from (r + r1 extend offset). "size" is the 2652 // size of the operand. The result may be in rscratch2. 2653 Address MacroAssembler::offsetted_address(Register r, Register r1, 2654 Address::extend ext, int offset, int size) { 2655 if (offset || (ext.shift() % size != 0)) { 2656 lea(rscratch2, Address(r, r1, ext)); 2657 return Address(rscratch2, offset); 2658 } else { 2659 return Address(r, r1, ext); 2660 } 2661 } 2662 2663 Address MacroAssembler::spill_address(int size, int offset, Register tmp) 2664 { 2665 assert(offset >= 0, "spill to negative address?"); 2666 // Offset reachable ? 
2667 // Not aligned - 9 bits signed offset 2668 // Aligned - 12 bits unsigned offset shifted 2669 Register base = sp; 2670 if ((offset & (size-1)) && offset >= (1<<8)) { 2671 add(tmp, base, offset & ((1<<12)-1)); 2672 base = tmp; 2673 offset &= -1<<12; 2674 } 2675 2676 if (offset >= (1<<12) * size) { 2677 add(tmp, base, offset & (((1<<12)-1)<<12)); 2678 base = tmp; 2679 offset &= ~(((1<<12)-1)<<12); 2680 } 2681 2682 return Address(base, offset); 2683 } 2684 2685 // Checks whether offset is aligned. 2686 // Returns true if it is, else false. 2687 bool MacroAssembler::merge_alignment_check(Register base, 2688 size_t size, 2689 long cur_offset, 2690 long prev_offset) const { 2691 if (AvoidUnalignedAccesses) { 2692 if (base == sp) { 2693 // Checks whether low offset if aligned to pair of registers. 2694 long pair_mask = size * 2 - 1; 2695 long offset = prev_offset > cur_offset ? cur_offset : prev_offset; 2696 return (offset & pair_mask) == 0; 2697 } else { // If base is not sp, we can't guarantee the access is aligned. 2698 return false; 2699 } 2700 } else { 2701 long mask = size - 1; 2702 // Load/store pair instruction only supports element size aligned offset. 2703 return (cur_offset & mask) == 0 && (prev_offset & mask) == 0; 2704 } 2705 } 2706 2707 // Checks whether current and previous loads/stores can be merged. 2708 // Returns true if it can be merged, else false. 
bool MacroAssembler::ldst_can_merge(Register rt,
                                    const Address &adr,
                                    size_t cur_size_in_bytes,
                                    bool is_store) const {
  // The candidate for merging is the instruction immediately before pc();
  // it must also be the instruction recorded as the last one emitted.
  address prev = pc() - NativeInstruction::instruction_size;
  address last = code()->last_insn();

  if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
    return false;
  }

  if (adr.getMode() != Address::base_plus_offset || prev != last) {
    return false;
  }

  NativeLdSt* prev_ldst = NativeLdSt_at(prev);
  size_t prev_size_in_bytes = prev_ldst->size_in_bytes();

  assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
  assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");

  // Both accesses must have the same width and the same direction.
  if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
    return false;
  }

  // Bounds applied below to the lower of the two offsets.
  long max_offset = 63 * prev_size_in_bytes;
  long min_offset = -64 * prev_size_in_bytes;

  assert(prev_ldst->is_not_pre_post_index(), "pre-index or post-index is not supported to be merged.");

  // Only same base can be merged.
  if (adr.base() != prev_ldst->base()) {
    return false;
  }

  // The two accesses must be exactly one element apart (in either order).
  long cur_offset = adr.offset();
  long prev_offset = prev_ldst->offset();
  size_t diff = abs(cur_offset - prev_offset);
  if (diff != prev_size_in_bytes) {
    return false;
  }

  // Following cases can not be merged:
  // ldr x2, [x2, #8]
  // ldr x3, [x2, #16]
  // or:
  // ldr x2, [x3, #8]
  // ldr x2, [x3, #16]
  // If t1 and t2 is the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
  if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
    return false;
  }

  long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
  // Offset range must be in ldp/stp instruction's range.
  if (low_offset > max_offset || low_offset < min_offset) {
    return false;
  }

  if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
    return true;
  }

  return false;
}

// Merge current load/store with previous load/store into ldp/stp.
//
//   rt                - register of the access that is about to be emitted
//   adr               - address of that access
//   cur_size_in_bytes - its size in bytes
//   is_store          - true for a store, false for a load
//
// The caller must have established that ldst_can_merge() holds (asserted).
void MacroAssembler::merge_ldst(Register rt,
                                const Address &adr,
                                size_t cur_size_in_bytes,
                                bool is_store) {

  assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store) == true, "cur and prev must be able to be merged.");

  Register rt_low, rt_high;
  address prev = pc() - NativeInstruction::instruction_size;
  NativeLdSt* prev_ldst = NativeLdSt_at(prev);

  long offset;

  // The register belonging to the lower offset goes first in the pair.
  if (adr.offset() < prev_ldst->offset()) {
    offset = adr.offset();
    rt_low = rt;
    rt_high = prev_ldst->target();
  } else {
    offset = prev_ldst->offset();
    rt_low = prev_ldst->target();
    rt_high = rt;
  }

  Address adr_p = Address(prev_ldst->base(), offset);
  // Overwrite previous generated binary.
  code_section()->set_end(prev);

  const int sz = prev_ldst->size_in_bytes();
  assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
  if (!is_store) {
    BLOCK_COMMENT("merged ldr pair");
    if (sz == 8) {
      ldp(rt_low, rt_high, adr_p);
    } else {
      ldpw(rt_low, rt_high, adr_p);
    }
  } else {
    BLOCK_COMMENT("merged str pair");
    if (sz == 8) {
      stp(rt_low, rt_high, adr_p);
    } else {
      stpw(rt_low, rt_high, adr_p);
    }
  }
}

/**
 * Multiply 64 bit by 64 bit first loop.
*/
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
                                           Register y, Register y_idx, Register z,
                                           Register carry, Register product,
                                           Register idx, Register kdx) {
  //
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;
  //

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  // If xstart goes negative only a single 32-bit word of x is left (L_one_x).
  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_one_x);

  lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
  ldr(x_xstart, Address(rscratch1));
  ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian

  bind(L_first_loop);
  // idx is decremented twice per iteration: two 32-bit words of y are
  // consumed as one 64-bit value; L_one_y handles a single leftover word.
  subsw(idx, idx, 1);
  br(Assembler::MI, L_first_loop_exit);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_one_y);
  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(y_idx, Address(rscratch1));
  ror(y_idx, y_idx, 32); // convert big-endian to little-endian
  bind(L_multiply);

  // AArch64 has a multiply-accumulate instruction that we can't use
  // here because it has no way to process carries, so we have to use
  // separate add and adc instructions.  Bah.
  umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
  mul(product, x_xstart, y_idx);
  adds(product, product, carry);
  adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product

  subw(kdx, kdx, 2);
  ror(product, product, 32); // back to big-endian
  str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));

  b(L_first_loop);

  bind(L_one_y);
  ldrw(y_idx, Address(y,  0));
  b(L_multiply);

  bind(L_one_x);
  ldrw(x_xstart, Address(x,  0));
  b(L_first_loop);

  bind(L_first_loop_exit);
}

/**
 * Multiply 128 bit by 128. Unrolled inner loop.
 *
 */
void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
                                             Register carry, Register carry2,
                                             Register idx, Register jdx,
                                             Register yz_idx1, Register yz_idx2,
                                             Register tmp, Register tmp3, Register tmp4,
                                             Register tmp6, Register product_hi) {

  //   jlong carry, x[], y[], z[];
  //   int kdx = ystart+1;
  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
  //     jlong carry2  = (jlong)(tmp3 >>> 64);
  //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
  //     carry  = (jlong)(tmp4 >>> 64);
  //     z[kdx+idx+1] = (jlong)tmp3;
  //     z[kdx+idx] = (jlong)tmp4;
  //   }
  //   idx += 2;
  //   if (idx > 0) {
  //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
  //     z[kdx+idx] = (jlong)yz_idx1;
  //     carry  = (jlong)(yz_idx1 >>> 64);
  //   }
  //

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;

  // jdx = idx / 4: number of iterations that each consume four 32-bit words.
  lsrw(jdx, idx, 2);

  bind(L_third_loop);

  subsw(jdx, jdx, 1);
  br(Assembler::MI, L_third_loop_exit);
  subw(idx, idx, 4);

  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));

  ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));

  lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
  ror(yz_idx2, yz_idx2, 32);

  ldp(rscratch2, rscratch1, Address(tmp6, 0));

  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
  umulh(tmp4, product_hi, yz_idx1);

  ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
  ror(rscratch2, rscratch2, 32);

  mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
  umulh(carry2, product_hi, yz_idx2);

  // propagate sum of both multiplications into carry:tmp4:tmp3
  adds(tmp3, tmp3, carry);
  adc(tmp4, tmp4, zr);
  adds(tmp3, tmp3, rscratch1);
  adcs(tmp4, tmp4, tmp);
  adc(carry, carry2, zr);
  adds(tmp4, tmp4, rscratch2);
  adc(carry, carry, zr);

  ror(tmp3, tmp3, 32); // convert little-endian to big-endian
  ror(tmp4, tmp4, 32);
  stp(tmp4, tmp3, Address(tmp6, 0));

  b(L_third_loop);
  bind (L_third_loop_exit);

  // Handle the remaining (idx & 3) words: first a pair, then a single word.
  andw (idx, idx, 0x3);
  cbz(idx, L_post_third_loop_done);

  Label L_check_1;
  subsw(idx, idx, 2);
  br(Assembler::MI, L_check_1);

  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(yz_idx1, Address(rscratch1, 0));
  ror(yz_idx1, yz_idx1, 32);
  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
  umulh(tmp4, product_hi, yz_idx1);
  lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
  ldr(yz_idx2, Address(rscratch1, 0));
  ror(yz_idx2, yz_idx2, 32);

  add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);

  ror(tmp3, tmp3, 32);
  str(tmp3, Address(rscratch1, 0));

  bind (L_check_1);

  andw (idx, idx, 0x1);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_post_third_loop_done);
  ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
  umulh(carry2, tmp4, product_hi);
  ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  add2_with_carry(carry2, tmp3, tmp4, carry);

  strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
  extr(carry, carry2, tmp3, 32);

  bind(L_post_third_loop_done);
}

/**
 * Code for BigInteger::multiplyToLen() intrinsic.
 *
 * r0: x
 * r1: xlen
 * r2: y
 * r3: ylen
 * r4:  z
 * r5: zlen
 * r10: tmp1
 * r11: tmp2
 * r12: tmp3
 * r13: tmp4
 * r14: tmp5
 * r15: tmp6
 * r16: tmp7
 *
 */
void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6, Register product_hi) {

  assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product  = xlen;
  const Register x_xstart = zlen;  // reuse register

  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;
  //

  movw(idx, ylen);      // idx = ylen;
  movw(kdx, zlen);      // kdx = xlen+ylen;
  mov(carry, zr);       // carry = 0;

  Label L_done;

  movw(xstart, xlen);
  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_done);

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);

  Label L_second_loop;
  cbzw(kdx, L_second_loop);

  Label L_carry;
  subw(kdx, kdx, 1);
  cbzw(kdx, L_carry);

  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
  lsr(carry, carry, 32);
  subw(kdx, kdx, 1);

  bind(L_carry);
  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));

  // Second and third (nested) loops.
  //
  // for (int i = xstart-1; i >= 0; i--) { // Second loop
  //   carry = 0;
  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                    (z[k] & LONG_MASK) + carry;
  //     z[k] = (int)product;
  //     carry = product >>> 32;
  //   }
  //   z[i] = (int)carry;
  // }
  //
  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi

  const Register jdx = tmp1;

  bind(L_second_loop);
  mov(carry, zr);                // carry = 0;
  movw(jdx, ylen);               // j = ystart+1

  subsw(xstart, xstart, 1);      // i = xstart-1;
  br(Assembler::MI, L_done);

  // Reserve four stack words and save z in the lowest one; ylen, x and
  // xstart are stored into the remaining three below.
  str(z, Address(pre(sp, -4 * wordSize)));

  Label L_last_x;
  lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
  subsw(xstart, xstart, 1);       // i = xstart-1;
  br(Assembler::MI, L_last_x);

  lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
  ldr(product_hi, Address(rscratch1));
  ror(product_hi, product_hi, 32);  // convert big-endian to little-endian

  Label L_third_loop_prologue;
  bind(L_third_loop_prologue);

  str(ylen, Address(sp, wordSize));
  stp(x, xstart, Address(sp, 2 * wordSize));
  multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
                          tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
  // Reload the four saved words and release the stack space.
  ldp(z, ylen, Address(post(sp, 2 * wordSize)));
  ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen

  addw(tmp3, xlen, 1);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  subsw(tmp3, tmp3, 1);
  br(Assembler::MI, L_done);

  lsr(carry, carry, 32);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  b(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  ldrw(product_hi, Address(x,  0));
  b(L_third_loop_prologue);

  bind(L_done);
}

// Code for BigInteger::mulAdd intrinsic
// out     = r0
// in      = r1
// offset  = r2  (already out.length-offset)
// len     = r3
// k       = r4
//
// pseudo code from java implementation:
// carry = 0;
// offset = out.length-offset - 1;
// for (int j=len-1; j >= 0; j--) {
//     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
//     out[offset--] = (int)product;
//     carry = product >>> 32;
// }
// return (int)carry;
//
// On exit the emitted code leaves the carry in 'out' (zero when len is zero).
void MacroAssembler::mul_add(Register out, Register in, Register offset,
      Register len, Register k) {
    Label LOOP, END;
    // pre-loop
    cmp(len, zr); // cmp, not cbz/cbnz: to use condition twice => less branches
    csel(out, zr, out, Assembler::EQ);
    br(Assembler::EQ, END);
    add(in, in, len, LSL, 2); // in[j+1] address
    add(offset, out, offset, LSL, 2); // out[offset + 1] address
    mov(out, zr); // used to keep carry now
    BIND(LOOP);
    ldrw(rscratch1, Address(pre(in, -4)));
    madd(rscratch1, rscratch1, k, out);
    ldrw(rscratch2, Address(pre(offset, -4)));
    add(rscratch1, rscratch1, rscratch2);
    strw(rscratch1, Address(offset));
    lsr(out, rscratch1, 32);
    subs(len, len, 1);
    br(Assembler::NE, LOOP);
    BIND(END);
}

/**
 * Emits code to update CRC-32 with a byte value according to constants in table
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]val       Register containing the byte to fold into the CRC.
 * @param [in]table     Register containing the table of crc constants.
3178 * 3179 * uint32_t crc; 3180 * val = crc_table[(val ^ crc) & 0xFF]; 3181 * crc = val ^ (crc >> 8); 3182 * 3183 */ 3184 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 3185 eor(val, val, crc); 3186 andr(val, val, 0xff); 3187 ldrw(val, Address(table, val, Address::lsl(2))); 3188 eor(crc, val, crc, Assembler::LSR, 8); 3189 } 3190 3191 /** 3192 * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3 3193 * 3194 * @param [in,out]crc Register containing the crc. 3195 * @param [in]v Register containing the 32-bit to fold into the CRC. 3196 * @param [in]table0 Register containing table 0 of crc constants. 3197 * @param [in]table1 Register containing table 1 of crc constants. 3198 * @param [in]table2 Register containing table 2 of crc constants. 3199 * @param [in]table3 Register containing table 3 of crc constants. 3200 * 3201 * uint32_t crc; 3202 * v = crc ^ v 3203 * crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24] 3204 * 3205 */ 3206 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp, 3207 Register table0, Register table1, Register table2, Register table3, 3208 bool upper) { 3209 eor(v, crc, v, upper ? LSR:LSL, upper ? 
32:0); 3210 uxtb(tmp, v); 3211 ldrw(crc, Address(table3, tmp, Address::lsl(2))); 3212 ubfx(tmp, v, 8, 8); 3213 ldrw(tmp, Address(table2, tmp, Address::lsl(2))); 3214 eor(crc, crc, tmp); 3215 ubfx(tmp, v, 16, 8); 3216 ldrw(tmp, Address(table1, tmp, Address::lsl(2))); 3217 eor(crc, crc, tmp); 3218 ubfx(tmp, v, 24, 8); 3219 ldrw(tmp, Address(table0, tmp, Address::lsl(2))); 3220 eor(crc, crc, tmp); 3221 } 3222 3223 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf, 3224 Register len, Register tmp0, Register tmp1, Register tmp2, 3225 Register tmp3) { 3226 Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit; 3227 assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3); 3228 3229 mvnw(crc, crc); 3230 3231 subs(len, len, 128); 3232 br(Assembler::GE, CRC_by64_pre); 3233 BIND(CRC_less64); 3234 adds(len, len, 128-32); 3235 br(Assembler::GE, CRC_by32_loop); 3236 BIND(CRC_less32); 3237 adds(len, len, 32-4); 3238 br(Assembler::GE, CRC_by4_loop); 3239 adds(len, len, 4); 3240 br(Assembler::GT, CRC_by1_loop); 3241 b(L_exit); 3242 3243 BIND(CRC_by32_loop); 3244 ldp(tmp0, tmp1, Address(post(buf, 16))); 3245 subs(len, len, 32); 3246 crc32x(crc, crc, tmp0); 3247 ldr(tmp2, Address(post(buf, 8))); 3248 crc32x(crc, crc, tmp1); 3249 ldr(tmp3, Address(post(buf, 8))); 3250 crc32x(crc, crc, tmp2); 3251 crc32x(crc, crc, tmp3); 3252 br(Assembler::GE, CRC_by32_loop); 3253 cmn(len, 32); 3254 br(Assembler::NE, CRC_less32); 3255 b(L_exit); 3256 3257 BIND(CRC_by4_loop); 3258 ldrw(tmp0, Address(post(buf, 4))); 3259 subs(len, len, 4); 3260 crc32w(crc, crc, tmp0); 3261 br(Assembler::GE, CRC_by4_loop); 3262 adds(len, len, 4); 3263 br(Assembler::LE, L_exit); 3264 BIND(CRC_by1_loop); 3265 ldrb(tmp0, Address(post(buf, 1))); 3266 subs(len, len, 1); 3267 crc32b(crc, crc, tmp0); 3268 br(Assembler::GT, CRC_by1_loop); 3269 b(L_exit); 3270 3271 BIND(CRC_by64_pre); 3272 sub(buf, buf, 8); 3273 ldp(tmp0, tmp1, Address(buf, 
8)); 3274 crc32x(crc, crc, tmp0); 3275 ldr(tmp2, Address(buf, 24)); 3276 crc32x(crc, crc, tmp1); 3277 ldr(tmp3, Address(buf, 32)); 3278 crc32x(crc, crc, tmp2); 3279 ldr(tmp0, Address(buf, 40)); 3280 crc32x(crc, crc, tmp3); 3281 ldr(tmp1, Address(buf, 48)); 3282 crc32x(crc, crc, tmp0); 3283 ldr(tmp2, Address(buf, 56)); 3284 crc32x(crc, crc, tmp1); 3285 ldr(tmp3, Address(pre(buf, 64))); 3286 3287 b(CRC_by64_loop); 3288 3289 align(CodeEntryAlignment); 3290 BIND(CRC_by64_loop); 3291 subs(len, len, 64); 3292 crc32x(crc, crc, tmp2); 3293 ldr(tmp0, Address(buf, 8)); 3294 crc32x(crc, crc, tmp3); 3295 ldr(tmp1, Address(buf, 16)); 3296 crc32x(crc, crc, tmp0); 3297 ldr(tmp2, Address(buf, 24)); 3298 crc32x(crc, crc, tmp1); 3299 ldr(tmp3, Address(buf, 32)); 3300 crc32x(crc, crc, tmp2); 3301 ldr(tmp0, Address(buf, 40)); 3302 crc32x(crc, crc, tmp3); 3303 ldr(tmp1, Address(buf, 48)); 3304 crc32x(crc, crc, tmp0); 3305 ldr(tmp2, Address(buf, 56)); 3306 crc32x(crc, crc, tmp1); 3307 ldr(tmp3, Address(pre(buf, 64))); 3308 br(Assembler::GE, CRC_by64_loop); 3309 3310 // post-loop 3311 crc32x(crc, crc, tmp2); 3312 crc32x(crc, crc, tmp3); 3313 3314 sub(len, len, 64); 3315 add(buf, buf, 8); 3316 cmn(len, 128); 3317 br(Assembler::NE, CRC_less64); 3318 BIND(L_exit); 3319 mvnw(crc, crc); 3320 } 3321 3322 /** 3323 * @param crc register containing existing CRC (32-bit) 3324 * @param buf register pointing to input byte buffer (byte*) 3325 * @param len register containing number of bytes 3326 * @param table register that will contain address of CRC table 3327 * @param tmp scratch register 3328 */ 3329 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, 3330 Register table0, Register table1, Register table2, Register table3, 3331 Register tmp, Register tmp2, Register tmp3) { 3332 Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit; 3333 unsigned long offset; 3334 3335 if (UseCRC32) { 3336 kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, 
table3); 3337 return; 3338 } 3339 3340 mvnw(crc, crc); 3341 3342 adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset); 3343 if (offset) add(table0, table0, offset); 3344 add(table1, table0, 1*256*sizeof(juint)); 3345 add(table2, table0, 2*256*sizeof(juint)); 3346 add(table3, table0, 3*256*sizeof(juint)); 3347 3348 if (UseNeon) { 3349 cmp(len, (u1)64); 3350 br(Assembler::LT, L_by16); 3351 eor(v16, T16B, v16, v16); 3352 3353 Label L_fold; 3354 3355 add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants 3356 3357 ld1(v0, v1, T2D, post(buf, 32)); 3358 ld1r(v4, T2D, post(tmp, 8)); 3359 ld1r(v5, T2D, post(tmp, 8)); 3360 ld1r(v6, T2D, post(tmp, 8)); 3361 ld1r(v7, T2D, post(tmp, 8)); 3362 mov(v16, T4S, 0, crc); 3363 3364 eor(v0, T16B, v0, v16); 3365 sub(len, len, 64); 3366 3367 BIND(L_fold); 3368 pmull(v22, T8H, v0, v5, T8B); 3369 pmull(v20, T8H, v0, v7, T8B); 3370 pmull(v23, T8H, v0, v4, T8B); 3371 pmull(v21, T8H, v0, v6, T8B); 3372 3373 pmull2(v18, T8H, v0, v5, T16B); 3374 pmull2(v16, T8H, v0, v7, T16B); 3375 pmull2(v19, T8H, v0, v4, T16B); 3376 pmull2(v17, T8H, v0, v6, T16B); 3377 3378 uzp1(v24, T8H, v20, v22); 3379 uzp2(v25, T8H, v20, v22); 3380 eor(v20, T16B, v24, v25); 3381 3382 uzp1(v26, T8H, v16, v18); 3383 uzp2(v27, T8H, v16, v18); 3384 eor(v16, T16B, v26, v27); 3385 3386 ushll2(v22, T4S, v20, T8H, 8); 3387 ushll(v20, T4S, v20, T4H, 8); 3388 3389 ushll2(v18, T4S, v16, T8H, 8); 3390 ushll(v16, T4S, v16, T4H, 8); 3391 3392 eor(v22, T16B, v23, v22); 3393 eor(v18, T16B, v19, v18); 3394 eor(v20, T16B, v21, v20); 3395 eor(v16, T16B, v17, v16); 3396 3397 uzp1(v17, T2D, v16, v20); 3398 uzp2(v21, T2D, v16, v20); 3399 eor(v17, T16B, v17, v21); 3400 3401 ushll2(v20, T2D, v17, T4S, 16); 3402 ushll(v16, T2D, v17, T2S, 16); 3403 3404 eor(v20, T16B, v20, v22); 3405 eor(v16, T16B, v16, v18); 3406 3407 uzp1(v17, T2D, v20, v16); 3408 uzp2(v21, T2D, v20, v16); 3409 eor(v28, T16B, v17, v21); 3410 3411 pmull(v22, T8H, v1, v5, T8B); 3412 pmull(v20, T8H, 
v1, v7, T8B); 3413 pmull(v23, T8H, v1, v4, T8B); 3414 pmull(v21, T8H, v1, v6, T8B); 3415 3416 pmull2(v18, T8H, v1, v5, T16B); 3417 pmull2(v16, T8H, v1, v7, T16B); 3418 pmull2(v19, T8H, v1, v4, T16B); 3419 pmull2(v17, T8H, v1, v6, T16B); 3420 3421 ld1(v0, v1, T2D, post(buf, 32)); 3422 3423 uzp1(v24, T8H, v20, v22); 3424 uzp2(v25, T8H, v20, v22); 3425 eor(v20, T16B, v24, v25); 3426 3427 uzp1(v26, T8H, v16, v18); 3428 uzp2(v27, T8H, v16, v18); 3429 eor(v16, T16B, v26, v27); 3430 3431 ushll2(v22, T4S, v20, T8H, 8); 3432 ushll(v20, T4S, v20, T4H, 8); 3433 3434 ushll2(v18, T4S, v16, T8H, 8); 3435 ushll(v16, T4S, v16, T4H, 8); 3436 3437 eor(v22, T16B, v23, v22); 3438 eor(v18, T16B, v19, v18); 3439 eor(v20, T16B, v21, v20); 3440 eor(v16, T16B, v17, v16); 3441 3442 uzp1(v17, T2D, v16, v20); 3443 uzp2(v21, T2D, v16, v20); 3444 eor(v16, T16B, v17, v21); 3445 3446 ushll2(v20, T2D, v16, T4S, 16); 3447 ushll(v16, T2D, v16, T2S, 16); 3448 3449 eor(v20, T16B, v22, v20); 3450 eor(v16, T16B, v16, v18); 3451 3452 uzp1(v17, T2D, v20, v16); 3453 uzp2(v21, T2D, v20, v16); 3454 eor(v20, T16B, v17, v21); 3455 3456 shl(v16, T2D, v28, 1); 3457 shl(v17, T2D, v20, 1); 3458 3459 eor(v0, T16B, v0, v16); 3460 eor(v1, T16B, v1, v17); 3461 3462 subs(len, len, 32); 3463 br(Assembler::GE, L_fold); 3464 3465 mov(crc, 0); 3466 mov(tmp, v0, T1D, 0); 3467 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3468 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3469 mov(tmp, v0, T1D, 1); 3470 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3471 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3472 mov(tmp, v1, T1D, 0); 3473 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3474 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3475 mov(tmp, v1, T1D, 1); 3476 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3477 update_word_crc32(crc, tmp, 
tmp2, table0, table1, table2, table3, true); 3478 3479 add(len, len, 32); 3480 } 3481 3482 BIND(L_by16); 3483 subs(len, len, 16); 3484 br(Assembler::GE, L_by16_loop); 3485 adds(len, len, 16-4); 3486 br(Assembler::GE, L_by4_loop); 3487 adds(len, len, 4); 3488 br(Assembler::GT, L_by1_loop); 3489 b(L_exit); 3490 3491 BIND(L_by4_loop); 3492 ldrw(tmp, Address(post(buf, 4))); 3493 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3); 3494 subs(len, len, 4); 3495 br(Assembler::GE, L_by4_loop); 3496 adds(len, len, 4); 3497 br(Assembler::LE, L_exit); 3498 BIND(L_by1_loop); 3499 subs(len, len, 1); 3500 ldrb(tmp, Address(post(buf, 1))); 3501 update_byte_crc32(crc, tmp, table0); 3502 br(Assembler::GT, L_by1_loop); 3503 b(L_exit); 3504 3505 align(CodeEntryAlignment); 3506 BIND(L_by16_loop); 3507 subs(len, len, 16); 3508 ldp(tmp, tmp3, Address(post(buf, 16))); 3509 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false); 3510 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true); 3511 update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false); 3512 update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true); 3513 br(Assembler::GE, L_by16_loop); 3514 adds(len, len, 16-4); 3515 br(Assembler::GE, L_by4_loop); 3516 adds(len, len, 4); 3517 br(Assembler::GT, L_by1_loop); 3518 BIND(L_exit); 3519 mvnw(crc, crc); 3520 } 3521 3522 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf, 3523 Register len, Register tmp0, Register tmp1, Register tmp2, 3524 Register tmp3) { 3525 Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit; 3526 assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3); 3527 3528 subs(len, len, 128); 3529 br(Assembler::GE, CRC_by64_pre); 3530 BIND(CRC_less64); 3531 adds(len, len, 128-32); 3532 br(Assembler::GE, CRC_by32_loop); 3533 BIND(CRC_less32); 3534 adds(len, len, 32-4); 3535 br(Assembler::GE, 
CRC_by4_loop); 3536 adds(len, len, 4); 3537 br(Assembler::GT, CRC_by1_loop); 3538 b(L_exit); 3539 3540 BIND(CRC_by32_loop); 3541 ldp(tmp0, tmp1, Address(post(buf, 16))); 3542 subs(len, len, 32); 3543 crc32cx(crc, crc, tmp0); 3544 ldr(tmp2, Address(post(buf, 8))); 3545 crc32cx(crc, crc, tmp1); 3546 ldr(tmp3, Address(post(buf, 8))); 3547 crc32cx(crc, crc, tmp2); 3548 crc32cx(crc, crc, tmp3); 3549 br(Assembler::GE, CRC_by32_loop); 3550 cmn(len, 32); 3551 br(Assembler::NE, CRC_less32); 3552 b(L_exit); 3553 3554 BIND(CRC_by4_loop); 3555 ldrw(tmp0, Address(post(buf, 4))); 3556 subs(len, len, 4); 3557 crc32cw(crc, crc, tmp0); 3558 br(Assembler::GE, CRC_by4_loop); 3559 adds(len, len, 4); 3560 br(Assembler::LE, L_exit); 3561 BIND(CRC_by1_loop); 3562 ldrb(tmp0, Address(post(buf, 1))); 3563 subs(len, len, 1); 3564 crc32cb(crc, crc, tmp0); 3565 br(Assembler::GT, CRC_by1_loop); 3566 b(L_exit); 3567 3568 BIND(CRC_by64_pre); 3569 sub(buf, buf, 8); 3570 ldp(tmp0, tmp1, Address(buf, 8)); 3571 crc32cx(crc, crc, tmp0); 3572 ldr(tmp2, Address(buf, 24)); 3573 crc32cx(crc, crc, tmp1); 3574 ldr(tmp3, Address(buf, 32)); 3575 crc32cx(crc, crc, tmp2); 3576 ldr(tmp0, Address(buf, 40)); 3577 crc32cx(crc, crc, tmp3); 3578 ldr(tmp1, Address(buf, 48)); 3579 crc32cx(crc, crc, tmp0); 3580 ldr(tmp2, Address(buf, 56)); 3581 crc32cx(crc, crc, tmp1); 3582 ldr(tmp3, Address(pre(buf, 64))); 3583 3584 b(CRC_by64_loop); 3585 3586 align(CodeEntryAlignment); 3587 BIND(CRC_by64_loop); 3588 subs(len, len, 64); 3589 crc32cx(crc, crc, tmp2); 3590 ldr(tmp0, Address(buf, 8)); 3591 crc32cx(crc, crc, tmp3); 3592 ldr(tmp1, Address(buf, 16)); 3593 crc32cx(crc, crc, tmp0); 3594 ldr(tmp2, Address(buf, 24)); 3595 crc32cx(crc, crc, tmp1); 3596 ldr(tmp3, Address(buf, 32)); 3597 crc32cx(crc, crc, tmp2); 3598 ldr(tmp0, Address(buf, 40)); 3599 crc32cx(crc, crc, tmp3); 3600 ldr(tmp1, Address(buf, 48)); 3601 crc32cx(crc, crc, tmp0); 3602 ldr(tmp2, Address(buf, 56)); 3603 crc32cx(crc, crc, tmp1); 3604 ldr(tmp3, Address(pre(buf, 
64))); 3605 br(Assembler::GE, CRC_by64_loop); 3606 3607 // post-loop 3608 crc32cx(crc, crc, tmp2); 3609 crc32cx(crc, crc, tmp3); 3610 3611 sub(len, len, 64); 3612 add(buf, buf, 8); 3613 cmn(len, 128); 3614 br(Assembler::NE, CRC_less64); 3615 BIND(L_exit); 3616 } 3617 3618 /** 3619 * @param crc register containing existing CRC (32-bit) 3620 * @param buf register pointing to input byte buffer (byte*) 3621 * @param len register containing number of bytes 3622 * @param table register that will contain address of CRC table 3623 * @param tmp scratch register 3624 */ 3625 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len, 3626 Register table0, Register table1, Register table2, Register table3, 3627 Register tmp, Register tmp2, Register tmp3) { 3628 kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3); 3629 } 3630 3631 3632 SkipIfEqual::SkipIfEqual( 3633 MacroAssembler* masm, const bool* flag_addr, bool value) { 3634 _masm = masm; 3635 unsigned long offset; 3636 _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset); 3637 _masm->ldrb(rscratch1, Address(rscratch1, offset)); 3638 _masm->cbzw(rscratch1, _label); 3639 } 3640 3641 SkipIfEqual::~SkipIfEqual() { 3642 _masm->bind(_label); 3643 } 3644 3645 void MacroAssembler::addptr(const Address &dst, int32_t src) { 3646 Address adr; 3647 switch(dst.getMode()) { 3648 case Address::base_plus_offset: 3649 // This is the expected mode, although we allow all the other 3650 // forms below. 
3651 adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord); 3652 break; 3653 default: 3654 lea(rscratch2, dst); 3655 adr = Address(rscratch2); 3656 break; 3657 } 3658 ldr(rscratch1, adr); 3659 add(rscratch1, rscratch1, src); 3660 str(rscratch1, adr); 3661 } 3662 3663 void MacroAssembler::cmpptr(Register src1, Address src2) { 3664 unsigned long offset; 3665 adrp(rscratch1, src2, offset); 3666 ldr(rscratch1, Address(rscratch1, offset)); 3667 cmp(src1, rscratch1); 3668 } 3669 3670 void MacroAssembler::cmpoop(Register obj1, Register obj2) { 3671 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3672 bs->obj_equals(this, obj1, obj2); 3673 } 3674 3675 void MacroAssembler::load_klass(Register dst, Register src) { 3676 if (UseCompressedClassPointers) { 3677 ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes())); 3678 decode_klass_not_null(dst); 3679 } else { 3680 ldr(dst, Address(src, oopDesc::klass_offset_in_bytes())); 3681 } 3682 } 3683 3684 // ((OopHandle)result).resolve(); 3685 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) { 3686 // OopHandle::resolve is an indirection. 
3687 access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg); 3688 } 3689 3690 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) { 3691 const int mirror_offset = in_bytes(Klass::java_mirror_offset()); 3692 ldr(dst, Address(rmethod, Method::const_offset())); 3693 ldr(dst, Address(dst, ConstMethod::constants_offset())); 3694 ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes())); 3695 ldr(dst, Address(dst, mirror_offset)); 3696 resolve_oop_handle(dst, tmp); 3697 } 3698 3699 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) { 3700 if (UseCompressedClassPointers) { 3701 ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes())); 3702 if (Universe::narrow_klass_base() == NULL) { 3703 cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift()); 3704 return; 3705 } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0 3706 && Universe::narrow_klass_shift() == 0) { 3707 // Only the bottom 32 bits matter 3708 cmpw(trial_klass, tmp); 3709 return; 3710 } 3711 decode_klass_not_null(tmp); 3712 } else { 3713 ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes())); 3714 } 3715 cmp(trial_klass, tmp); 3716 } 3717 3718 void MacroAssembler::load_prototype_header(Register dst, Register src) { 3719 load_klass(dst, src); 3720 ldr(dst, Address(dst, Klass::prototype_header_offset())); 3721 } 3722 3723 void MacroAssembler::store_klass(Register dst, Register src) { 3724 // FIXME: Should this be a store release? concurrent gcs assumes 3725 // klass length is valid if klass field is not null. 
// NOTE(review): the statements down to the first closing brace are the tail of a
// definition whose header lies above this chunk; they are reproduced unchanged.
  if (UseCompressedClassPointers) {
    encode_klass_not_null(src);
    strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
  } else {
    str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
  }
}

// Emit a 32-bit store of src into the klass-gap slot of the object addressed
// by dst. Nothing is emitted unless UseCompressedClassPointers is set.
void MacroAssembler::store_klass_gap(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    // Store to klass gap in destination
    strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
  }
}

// Emit code that compresses the oop in s into d.
// Algorithm must match CompressedOops::encode.
void MacroAssembler::encode_heap_oop(Register d, Register s) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
#endif
  verify_oop(s, "broken oop in encode_heap_oop");
  if (Universe::narrow_oop_base() == NULL) {
    // Zero-based encoding: only the shift (if any) is needed.
    if (Universe::narrow_oop_shift() != 0) {
      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
      lsr(d, s, LogMinObjAlignmentInBytes);
    } else {
      // Emitted even when d == s.
      mov(d, s);
    }
  } else {
    // d = s - rheapbase, setting flags; if s was below rheapbase (unsigned
    // compare, condition HS false) the result is replaced by zero before the
    // shift -- presumably so that a null s encodes as 0.
    subs(d, s, rheapbase);
    csel(d, d, zr, Assembler::HS);
    lsr(d, d, LogMinObjAlignmentInBytes);

    /*  Old algorithm: is this any worse?
    Label nonnull;
    cbnz(r, nonnull);
    sub(r, r, rheapbase);
    bind(nonnull);
    lsr(r, r, LogMinObjAlignmentInBytes);
    */
  }
}

// In-place variant: emit code that compresses the oop in r.
// Debug builds (ASSERT) verify the heap base and, when CheckCompressedOops is
// set, emit a runtime check that stops if r is zero.
void MacroAssembler::encode_heap_oop_not_null(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    cbnz(r, ok);
    stop("null oop passed to encode_heap_oop_not_null");
    bind(ok);
  }
#endif
  verify_oop(r, "broken oop in encode_heap_oop_not_null");
  if (Universe::narrow_oop_base() != NULL) {
    sub(r, r, rheapbase);
  }
  if (Universe::narrow_oop_shift() != 0) {
    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    lsr(r, r, LogMinObjAlignmentInBytes);
  }
}

// Two-register variant: emit code that compresses the oop in src into dst.
// 'data' tracks which register currently holds the intermediate value so that
// each step reads the result of the previous one.
void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    cbnz(src, ok);
    stop("null oop passed to encode_heap_oop_not_null2");
    bind(ok);
  }
#endif
  verify_oop(src, "broken oop in encode_heap_oop_not_null2");

  Register data = src;
  if (Universe::narrow_oop_base() != NULL) {
    sub(dst, src, rheapbase);
    data = dst;
  }
  if (Universe::narrow_oop_shift() != 0) {
    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    lsr(dst, data, LogMinObjAlignmentInBytes);
    data = dst;
  }
  // Neither step above emitted anything: fall back to a plain move
  // (emitted even when dst == src).
  if (data == src)
    mov(dst, src);
}

// Emit code that expands the compressed oop in s into d. A zero value in s
// is left as zero when a heap base is in use.
void MacroAssembler::decode_heap_oop(Register d, Register s) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
#endif
  if (Universe::narrow_oop_base() == NULL) {
    // With a zero shift and d == s nothing needs to be emitted; otherwise a
    // single lsl does the shift and/or the register move.
    if (Universe::narrow_oop_shift() != 0 || d != s) {
      lsl(d, s, Universe::narrow_oop_shift());
    }
  } else {
    Label done;
    if (d != s)
      mov(d, s);
    // Skip the base addition for a zero input so that d stays zero.
    cbz(s, done);
    add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
    bind(done);
  }
  verify_oop(d, "broken oop in decode_heap_oop");
}

// In-place variant: emit code that expands the compressed oop in r.
// No null check is emitted.
void MacroAssembler::decode_heap_oop_not_null(Register r) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (Universe::narrow_oop_shift() != 0) {
    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    if (Universe::narrow_oop_base() != NULL) {
      add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
    } else {
      add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
    }
  } else {
    // Zero shift: nothing is emitted; the base is asserted to be NULL.
    assert (Universe::narrow_oop_base() == NULL, "sanity");
  }
}

// Two-register variant: emit code that expands the compressed oop in src
// into dst. No null check is emitted.
void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (Universe::narrow_oop_shift() != 0) {
    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    if (Universe::narrow_oop_base() != NULL) {
      add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
    } else {
      add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
    }
  } else {
    assert (Universe::narrow_oop_base() == NULL, "sanity");
    if (dst != src) {
      mov(dst, src);
    }
  }
}

// Emit code that compresses the klass pointer in src into dst.
// Four strategies are tried in order: no base, XOR with the base, low-32-bit
// truncation, and finally an explicit subtract of the base.
void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
  if (Universe::narrow_klass_base() == NULL) {
    if (Universe::narrow_klass_shift() != 0) {
      assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
      lsr(dst, src, LogKlassAlignmentInBytes);
    } else {
      if (dst != src) mov(dst, src);
    }
    return;
  }

  if (use_XOR_for_compressed_class_base) {
    if (Universe::narrow_klass_shift() != 0) {
      eor(dst, src, (uint64_t)Universe::narrow_klass_base());
      lsr(dst, dst, LogKlassAlignmentInBytes);
    } else {
      eor(dst, src, (uint64_t)Universe::narrow_klass_base());
    }
    return;
  }

  // Base has all-zero low 32 bits and there is no shift: a 32-bit move
  // is all that gets emitted.
  if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
      && Universe::narrow_klass_shift() == 0) {
    movw(dst, src);
    return;
  }

#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
#endif

  // General case: materialise the base in a register. When dst aliases src
  // there is no free register, so rheapbase is borrowed and restored by
  // reinit_heapbase() afterwards.
  Register rbase = dst;
  if (dst == src) rbase = rheapbase;
  mov(rbase, (uint64_t)Universe::narrow_klass_base());
  sub(dst, src, rbase);
  if (Universe::narrow_klass_shift() != 0) {
    assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
    lsr(dst, dst, LogKlassAlignmentInBytes);
  }
  if (dst == src) reinit_heapbase();
}

// In-place variant; forwards to the two-register form with dst == src.
void MacroAssembler::encode_klass_not_null(Register r) {
  encode_klass_not_null(r, r);
}

// Emit code that expands the compressed klass pointer in src into dst.
// Mirrors the strategy selection of encode_klass_not_null(dst, src).
void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
  Register rbase = dst;
  assert (UseCompressedClassPointers, "should only be used for compressed headers");

  if (Universe::narrow_klass_base() == NULL) {
    if (Universe::narrow_klass_shift() != 0) {
      assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
      lsl(dst, src, LogKlassAlignmentInBytes);
    } else {
      if (dst != src) mov(dst, src);
    }
    return;
  }

  if (use_XOR_for_compressed_class_base) {
    if (Universe::narrow_klass_shift() != 0) {
      lsl(dst, src, LogKlassAlignmentInBytes);
      eor(dst, dst, (uint64_t)Universe::narrow_klass_base());
    } else {
      eor(dst, src, (uint64_t)Universe::narrow_klass_base());
    }
    return;
  }

  // Base has all-zero low 32 bits and there is no shift: insert the upper
  // half of the base with movk (shift 32) above the narrow value.
  if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
      && Universe::narrow_klass_shift() == 0) {
    if (dst != src)
      movw(dst, src);
    movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32);
    return;
  }

  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  // General case: when dst aliases src, rheapbase is borrowed to hold the
  // base and restored by reinit_heapbase() afterwards.
  if (dst == src) rbase = rheapbase;
  mov(rbase, (uint64_t)Universe::narrow_klass_base());
  if (Universe::narrow_klass_shift() != 0) {
    assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
    add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
  } else {
    add(dst, rbase, src);
  }
  if (dst == src) reinit_heapbase();
}

// In-place variant; forwards to the two-register form with dst == src.
void  MacroAssembler::decode_klass_not_null(Register r) {
  decode_klass_not_null(r, r);
}

// Emit a movz/movk pair that loads a narrow-oop constant into dst, tagged
// with an oop relocation for obj.
void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
#ifdef ASSERT
  {
    ThreadInVMfromUnknown tiv;
    assert (UseCompressedOops, "should only be used for compressed oops");
    assert (Universe::heap() != NULL, "java heap should be initialized");
    assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
    assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
  }
#endif
  int oop_index = oop_recorder()->find_index(obj);
  InstructionMark im(this);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  code_section()->relocate(inst_mark(), rspec);
  // 0xDEAD / 0xBEEF are emitted as the immediates here -- presumably
  // placeholders that are patched later through the relocation; confirm
  // against the relocation code.
  movz(dst, 0xDEAD, 16);
  movk(dst, 0xBEEF);
}

// Emit a movz/movk pair that loads the narrow encoding of klass k into dst,
// tagged with a metadata relocation for k.
void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int index = oop_recorder()->find_index(k);
  assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");

  InstructionMark im(this);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  code_section()->relocate(inst_mark(), rspec);
  narrowKlass nk = Klass::encode_klass(k);
  // Upper 16 bits first (movz, shift 16), then the lower 16 bits (movk).
  movz(dst, (nk >> 16), 16);
  movk(dst, nk & 0xffff);
}

// Emit a load of 'type' from src into dst through the current barrier set's
// assembler. With AS_RAW set, the base-class BarrierSetAssembler::load_at is
// called non-virtually, bypassing any override in the active barrier set.
void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
                                    Register dst, Address src,
                                    Register tmp1, Register thread_tmp) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  decorators = AccessInternal::decorator_fixup(decorators);
  bool as_raw = (decorators & AS_RAW) != 0;
  if (as_raw) {
    bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  } else {
    bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  }
}

// Emit a store of 'type' from src to dst through the current barrier set's
// assembler. With AS_RAW set, the base-class BarrierSetAssembler::store_at is
// called non-virtually, bypassing any override in the active barrier set.
void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
                                     Address dst, Register src,
                                     Register tmp1, Register thread_tmp) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  decorators = AccessInternal::decorator_fixup(decorators);
  bool as_raw = (decorators & AS_RAW) != 0;
  if (as_raw) {
    bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  } else {
    bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  }
}

// Forward a resolve of obj to the current barrier set's assembler.
void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
  // Use stronger ACCESS_WRITE|ACCESS_READ by default.
  if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
    decorators |= ACCESS_READ | ACCESS_WRITE;
  }
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  return bs->resolve(this, decorators, obj);
}

// Load an oop from src into dst; adds IN_HEAP to the caller's decorators.
void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
                                   Register thread_tmp, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
}

// As load_heap_oop, additionally passing IS_NOT_NULL.
void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
                                            Register thread_tmp, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
}

// Store the oop in src to dst; adds IN_HEAP to the caller's decorators.
void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
                                    Register thread_tmp, DecoratorSet decorators) {
  access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
}

// Used for storing NULLs.
// Passes noreg as the source (and as both temporaries) to access_store_at.
void MacroAssembler::store_heap_oop_null(Address dst) {
  access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
}

// Allocate a fresh metadata index for obj in the oop recorder and return a
// literal Address for obj carrying the matching metadata relocation.
Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->allocate_metadata_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return Address((address)obj, rspec);
}

// Move an oop into a register.  immediate is true if we want
// immediate instructions, i.e. we are not going to patch this
// instruction while the code is being executed by another thread.  In
// that case we can use move immediates rather than the constant pool.
4067 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) { 4068 int oop_index; 4069 if (obj == NULL) { 4070 oop_index = oop_recorder()->allocate_oop_index(obj); 4071 } else { 4072 #ifdef ASSERT 4073 { 4074 ThreadInVMfromUnknown tiv; 4075 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop"); 4076 } 4077 #endif 4078 oop_index = oop_recorder()->find_index(obj); 4079 } 4080 RelocationHolder rspec = oop_Relocation::spec(oop_index); 4081 if (! immediate) { 4082 address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address 4083 ldr_constant(dst, Address(dummy, rspec)); 4084 } else 4085 mov(dst, Address((address)obj, rspec)); 4086 } 4087 4088 // Move a metadata address into a register. 4089 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) { 4090 int oop_index; 4091 if (obj == NULL) { 4092 oop_index = oop_recorder()->allocate_metadata_index(obj); 4093 } else { 4094 oop_index = oop_recorder()->find_index(obj); 4095 } 4096 RelocationHolder rspec = metadata_Relocation::spec(oop_index); 4097 mov(dst, Address((address)obj, rspec)); 4098 } 4099 4100 Address MacroAssembler::constant_oop_address(jobject obj) { 4101 #ifdef ASSERT 4102 { 4103 ThreadInVMfromUnknown tiv; 4104 assert(oop_recorder() != NULL, "this assembler needs an OopRecorder"); 4105 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop"); 4106 } 4107 #endif 4108 int oop_index = oop_recorder()->find_index(obj); 4109 return Address((address)obj, oop_Relocation::spec(oop_index)); 4110 } 4111 4112 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes. 
4113 void MacroAssembler::tlab_allocate(Register obj, 4114 Register var_size_in_bytes, 4115 int con_size_in_bytes, 4116 Register t1, 4117 Register t2, 4118 Label& slow_case) { 4119 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 4120 bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case); 4121 } 4122 4123 // Defines obj, preserves var_size_in_bytes 4124 void MacroAssembler::eden_allocate(Register obj, 4125 Register var_size_in_bytes, 4126 int con_size_in_bytes, 4127 Register t1, 4128 Label& slow_case) { 4129 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 4130 bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case); 4131 } 4132 4133 // Zero words; len is in bytes 4134 // Destroys all registers except addr 4135 // len must be a nonzero multiple of wordSize 4136 void MacroAssembler::zero_memory(Register addr, Register len, Register t1) { 4137 assert_different_registers(addr, len, t1, rscratch1, rscratch2); 4138 4139 #ifdef ASSERT 4140 { Label L; 4141 tst(len, BytesPerWord - 1); 4142 br(Assembler::EQ, L); 4143 stop("len is not a multiple of BytesPerWord"); 4144 bind(L); 4145 } 4146 #endif 4147 4148 #ifndef PRODUCT 4149 block_comment("zero memory"); 4150 #endif 4151 4152 Label loop; 4153 Label entry; 4154 4155 // Algorithm: 4156 // 4157 // scratch1 = cnt & 7; 4158 // cnt -= scratch1; 4159 // p += scratch1; 4160 // switch (scratch1) { 4161 // do { 4162 // cnt -= 8; 4163 // p[-8] = 0; 4164 // case 7: 4165 // p[-7] = 0; 4166 // case 6: 4167 // p[-6] = 0; 4168 // // ... 
4169 // case 1: 4170 // p[-1] = 0; 4171 // case 0: 4172 // p += 8; 4173 // } while (cnt); 4174 // } 4175 4176 const int unroll = 8; // Number of str(zr) instructions we'll unroll 4177 4178 lsr(len, len, LogBytesPerWord); 4179 andr(rscratch1, len, unroll - 1); // tmp1 = cnt % unroll 4180 sub(len, len, rscratch1); // cnt -= unroll 4181 // t1 always points to the end of the region we're about to zero 4182 add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord); 4183 adr(rscratch2, entry); 4184 sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2); 4185 br(rscratch2); 4186 bind(loop); 4187 sub(len, len, unroll); 4188 for (int i = -unroll; i < 0; i++) 4189 Assembler::str(zr, Address(t1, i * wordSize)); 4190 bind(entry); 4191 add(t1, t1, unroll * wordSize); 4192 cbnz(len, loop); 4193 } 4194 4195 void MacroAssembler::verify_tlab() { 4196 #ifdef ASSERT 4197 if (UseTLAB && VerifyOops) { 4198 Label next, ok; 4199 4200 stp(rscratch2, rscratch1, Address(pre(sp, -16))); 4201 4202 ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset()))); 4203 ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset()))); 4204 cmp(rscratch2, rscratch1); 4205 br(Assembler::HS, next); 4206 STOP("assert(top >= start)"); 4207 should_not_reach_here(); 4208 4209 bind(next); 4210 ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset()))); 4211 ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset()))); 4212 cmp(rscratch2, rscratch1); 4213 br(Assembler::HS, ok); 4214 STOP("assert(top <= end)"); 4215 should_not_reach_here(); 4216 4217 bind(ok); 4218 ldp(rscratch2, rscratch1, Address(post(sp, 16))); 4219 } 4220 #endif 4221 } 4222 4223 // Writes to stack successive pages until offset reached to check for 4224 // stack overflow + shadow pages. This clobbers tmp. 
void MacroAssembler::bang_stack_size(Register size, Register tmp) {
  assert_different_registers(tmp, size, rscratch1);
  mov(tmp, sp);
  // Bang stack for total size given plus shadow page size.
  // Bang one page at a time because large size can bang beyond yellow and
  // red zones.
  Label loop;
  mov(rscratch1, os::vm_page_size());
  bind(loop);
  // Step tmp down one page, subtract one page from size (32-bit, setting
  // flags), and store to the new page; repeat while size is still > 0.
  lea(tmp, Address(tmp, -os::vm_page_size()));
  subsw(size, size, rscratch1);
  str(size, Address(tmp));
  br(Assembler::GT, loop);

  // Bang down shadow pages too.
  // At this point, (tmp-0) is the last address touched, so don't
  // touch it again.  (It was touched as (tmp-pagesize) but then tmp
  // was post-decremented.)  Skip this address by starting at i=1, and
  // touch a few more pages below.  N.B.  It is important to touch all
  // the way down to and including i=StackShadowPages.
  // NOTE(review): the loop below is written with i starting at 0 and an upper
  // bound reduced by one, so the "i=1" wording above does not match the code
  // literally; tmp is decremented before each store.
  for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
    // this could be any sized move but this can be a debugging crumb
    // so the bigger the better.
    lea(tmp, Address(tmp, -os::vm_page_size()));
    str(size, Address(tmp));
  }
}


// Move the address of the polling page into dest.
// With thread-local polling the address is loaded from the current thread;
// otherwise it is formed with adrp from 'page' using relocation type 'rtype'.
void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ldr(dest, Address(rthread, Thread::polling_page_offset()));
  } else {
    unsigned long off;
    adrp(dest, Address(page, rtype), off);
    assert(off == 0, "polling page must be page aligned");
  }
}

// Move the address of the polling page into r, then read the polling
// page.
// Returns the value of the two-argument read_polling_page overload.
address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
  get_polling_page(r, page, rtype);
  return read_polling_page(r, rtype);
}

// Read the polling page.  The address of the polling page must
// already be in r.
// Emits a 32-bit load into zr tagged with relocation 'rtype' and returns the
// instruction mark taken for that load.
address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
  InstructionMark im(this);
  code_section()->relocate(inst_mark(), rtype);
  ldrw(zr, Address(r, 0));
  return inst_mark();
}

// Emit an ADRP (plus a MOVK when needed) that puts the 4K page address of
// dest.target() into reg1.
//   reg1        - destination register
//   dest        - literal address with relocation info
//   byte_offset - out: the low 12 bits of the target, i.e. its offset within
//                 the page, which the caller must add separately
void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
  relocInfo::relocType rtype = dest.rspec().reloc()->type();
  // Page numbers (address >> 12) of both ends of the code cache and of the target.
  unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
  unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
  unsigned long dest_page = (unsigned long)dest.target() >> 12;
  long offset_low = dest_page - low_page;
  long offset_high = dest_page - high_page;

  assert(is_valid_AArch64_address(dest.target()), "bad address");
  assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");

  InstructionMark im(this);
  code_section()->relocate(inst_mark(), dest.rspec());
  // 8143067: Ensure that the adrp can reach the dest from anywhere within
  // the code cache so that if it is relocated we know it will still reach
  if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
    _adrp(reg1, dest.target());
  } else {
    // Out of range: aim the ADRP at an address built from the low 32 bits of
    // the target and bits 32..47 of the current pc, then overwrite bits
    // 32..47 of the result with the target's own via MOVK.
    unsigned long target = (unsigned long)dest.target();
    unsigned long adrp_target
      = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);

    _adrp(reg1, (address)adrp_target);
    movk(reg1, target >> 32, 32);
  }
  byte_offset = (unsigned long)dest.target() & 0xfff;
}

// Load the card table's byte_map_base into reg.
// The barrier set is cast to CardTableBarrierSet without a check, so this
// presumably must only be used when a card-table barrier set is active --
// confirm against callers.
void MacroAssembler::load_byte_map_base(Register reg) {
  jbyte *byte_map_base =
    ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();

  if (is_valid_AArch64_address((address)byte_map_base)) {
    // Strictly speaking the byte_map_base isn't an address at all,
    // and it might even be negative.
    unsigned long offset;
    adrp(reg, ExternalAddress((address)byte_map_base), offset);
    // We expect offset to be zero with most collectors.
    if (offset != 0) {
      add(reg, reg, offset);
    }
  } else {
    mov(reg, (uint64_t)byte_map_base);
  }
}

// Emit a frame prologue for a frame of 'framesize' bytes, saving rfp and lr
// in the top two words of the frame. Three size classes are handled:
//   small  : one sub plus an stp at an immediate offset from sp
//   medium : pre-indexed stp, then a sub with an immediate
//   large  : pre-indexed stp, then the amount goes through rscratch1
// rfp is only updated when PreserveFramePointer is set.
void MacroAssembler::build_frame(int framesize) {
  assert(framesize > 0, "framesize must be > 0");
  if (framesize < ((1 << 9) + 2 * wordSize)) {
    sub(sp, sp, framesize);
    stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
    if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
  } else {
    stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
    if (PreserveFramePointer) mov(rfp, sp);
    if (framesize < ((1 << 12) + 2 * wordSize))
      sub(sp, sp, framesize - 2 * wordSize);
    else {
      mov(rscratch1, framesize - 2 * wordSize);
      sub(sp, sp, rscratch1);
    }
  }
}

// Emit the epilogue matching build_frame(framesize): release the frame and
// reload rfp and lr, using the same three size classes.
void MacroAssembler::remove_frame(int framesize) {
  assert(framesize > 0, "framesize must be > 0");
  if (framesize < ((1 << 9) + 2 * wordSize)) {
    ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
    add(sp, sp, framesize);
  } else {
    if (framesize < ((1 << 12) + 2 * wordSize))
      add(sp, sp, framesize - 2 * wordSize);
    else {
      mov(rscratch1, framesize - 2 * wordSize);
      add(sp, sp, rscratch1);
    }
    ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
  }
}

#ifdef COMPILER2
// Pointer to a MacroAssembler load instruction; used below to pick byte /
// halfword / word / doubleword loads according to the string encodings.
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

// Search for str1 in str2 and return index or -1
//   str2, cnt2 - source string and its length
//   str1, cnt1 - pattern string and its length
//   icnt1      - pattern length when known at code-generation time, or -1
//   ae         - StrIntrinsicNode encoding pair (LL / UL / LU / UU)
//   result     - receives the index or -1
// str1, str2, cnt1, cnt2 and the tmp registers are overwritten (several are
// aliased under other names below); rscratch1/rscratch2 are used as ch1/ch2.
void MacroAssembler::string_indexof(Register str2, Register str1,
                                    Register cnt2, Register cnt1,
                                    Register tmp1, Register tmp2,
                                    Register tmp3, Register tmp4,
                                    Register tmp5, Register tmp6,
                                    int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    // result_tmp = cnt2 - cnt1: the last source index at which a match can start.
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

// The Boyer Moore algorithm is based on the description here:-
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with 2 shift rules. The 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:-
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has few java-specific optimizations.
//
// #define ASIZE 256
//
//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
//       int i, j;
//       unsigned c;
//       unsigned char bc[ASIZE];
//
//       /* Preprocessing */
//       for (i = 0; i < ASIZE; ++i)
//          bc[i] = m;
//       for (i = 0; i < m - 1; ) {
//          c = x[i];
//          ++i;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef PATTERN_STRING_IS_LATIN1
//          bc[c] = m - i;
//          #else
//          if (c < ASIZE) bc[c] = m - i;
//          #endif
//       }
//
//       /* Searching */
//       j = 0;
//       while (j <= n - m) {
//          c = y[i+j];
//          if (x[m-1] == c)
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//          if (i < 0) return j;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c< 256) always true. Remove branch
//          j += bc[y[j+m-1]];
//          #endif
//          #ifndef PATTERN_STRING_IS_UTF
//          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += 1
//          #endif
//          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
//          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += m
//          #endif
//       }
//    }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >=8, so, we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
    // UL case. We'll re-read last character in inner pre-loop code to have
    // single outer pre-loop load
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // amount of bytes stored per instruction
    // Reserve the ASIZE-byte bad-character table on the stack and fill every
    // byte with the low byte of cnt1 (v0 was set by the dup above).
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);

      sub(cnt1tmp, cnt1, 1);
      mov(tmp5, str2);            // remember the original str2 for the final index
      add(str2end, str2, result_tmp, LSL, str2_chr_shift);
      sub(ch2, cnt1, 1);
      mov(tmp3, str1);
    // For each pattern character except the last, record its distance from
    // the end of the pattern (ch2 counts down from cnt1-1) in the table.
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        // Characters >= ASIZE have no table slot.
        subs(zr, ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);

      add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
      if (str1_isL == str2_isL) {
        // load last 8 bytes (8LL/4UU symbols)
        ldr(tmp6, Address(tmp6, -wordSize));
      } else {
        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
        // convert Latin1 to UTF. We'll have to wait until load completed, but
        // it's still faster than per-character loads+checks
        lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
        ubfx(ch1, tmp6, 8, 8); // str1[N-2]
        ubfx(ch2, tmp6, 16, 8); // str1[N-3]
        andr(tmp6, tmp6, 0xFF); // str1[N-4]
        orr(ch2, ch1, ch2, LSL, 16);
        orr(tmp6, tmp6, tmp3, LSL, 48);
        orr(tmp6, tmp6, ch2, LSL, 16);
      }
    // Outer loop: compare the source character aligned with the last pattern
    // character; on mismatch skip ahead using the table.
    BIND(BMLOOPSTR2);
      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
      if (str1_isL == str2_isL) {
        // re-init tmp3. It's for free because it's executed in parallel with
        // load above. Alternative is to initialize it before loop, but it'll
        // affect performance on in-order systems with 2 or more ld/st pipelines
        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
      }
      if (!isL) { // UU/UL case
        lsl(ch2, cnt1tmp, 1); // offset in bytes
      }
      cmp(tmp3, skipch);
      br(NE, BMSKIP);
      // Last characters match: compare the last register-full in one go.
      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
      mov(ch1, tmp6);
      if (isL) {
        b(BMLOOPSTR1_AFTER_LOAD);
      } else {
        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
        b(BMLOOPSTR1_CMP);
      }
    // Inner loop: walk backwards through the rest of the pattern one
    // character at a time.
    BIND(BMLOOPSTR1);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
      cmp(ch1, ch2);
      br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
      if (!isL) {
        // if we've met UTF symbol while searching Latin1 pattern, then we can
        // skip cnt1 symbols
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
        subs(zr, skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
      add(str2, str2, result_tmp, LSL, str2_chr_shift);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      add(sp, sp, ASIZE);           // release the table
      b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
      cmp(ch1, ch2);
      br(NE, BMSKIP);
    BIND(BMMATCH);
      // Index = (current str2 - original str2), converted from bytes to
      // characters when the source is UTF-16.
      sub(result, str2, tmp5);
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE);           // release the table
      b(DONE);

    // Patterns that failed the Boyer-Moore preconditions: hand off to the
    // out-of-line linear-search stub unless the pattern is shorter than 16.
    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = NULL;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
       assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
    }
    trampoline_call(stub);
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

        cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
        br(LT, DOSHORT);
      // Generic linear scan. Both strings are addressed from their ends with
      // negative byte offsets that count up towards zero.
      BIND(LINEAR_MEDIUM);
        (this->*str1_load_1chr)(first, Address(str1));
        lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
        sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      // Find the first pattern character in the source.
      BIND(FIRST_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmp(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      // First character matched: compare the remaining pattern characters.
      BIND(STR1_LOOP);
        adds(cnt1tmp, cnt1_neg, str1_chr_size);
        add(cnt2tmp, cnt2_neg, str2_chr_size);
        br(GE, MATCH);

      BIND(STR1_NEXT);
        (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        adds(cnt1tmp, cnt1tmp, str1_chr_size);
        add(cnt2tmp, cnt2tmp, str2_chr_size);
        br(LT, STR1_NEXT);
        b(MATCH);

      BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        // Dispatch on pattern length: <2 -> DO1, >2 -> DO3, ==2 falls
        // through to DO2.
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    // Constant pattern length 4: compare four characters per step with a
    // single load.
    if (icnt1 == 4) {
      Label CH1_LOOP;

        (this->*load_4chr)(ch1, str1);
        sub(result_tmp, cnt2, 4);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(CH1_LOOP);
        (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
      }

    // Pattern length 2: compare two characters per step with a single load.
    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      Label CH1_LOOP;

      BIND(DO2);
        (this->*load_2chr)(ch1, str1);
        if (icnt1 == 2) {
          sub(result_tmp, cnt2, 2);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(CH1_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    // Pattern length 3: match the first two characters with one load, then
    // check the third separately.
    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
        (this->*load_2chr)(first, str1);
        (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
        if (icnt1 == 3) {
          sub(result_tmp, cnt2, 3);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(FIRST_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmpw(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        b(MATCH);
    }

    // Pattern length 1: scan eight source bytes at a time.
    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
        (this->*str1_load_1chr)(ch1, str1);
        cmp(cnt2, (u1)8);
        br(LT, DO1_SHORT);

        sub(result_tmp, cnt2, 8/str2_chr_size);
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
        mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

        // Replicate the pattern character into every lane of ch1.
        if (str2_isL) {
          orr(ch1, ch1, ch1, LSL, 8);
        }
        orr(ch1, ch1, ch1, LSL, 16);
        orr(ch1, ch1, ch1, LSL, 32);
      BIND(CH1_LOOP);
        ldr(ch2, Address(str2, cnt2_neg));
        // After the XOR a matching lane is zero; the sub / orr / bics
        // sequence leaves tmp1 non-zero iff some lane of ch2 is zero.
        eor(ch2, ch1, ch2);
        sub(tmp1, ch2, tmp3);
        orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
        bics(tmp1, tmp1, tmp2);
        br(NE, HAS_ZERO);
        adds(cnt2_neg, cnt2_neg, 8);
        br(LT, CH1_LOOP);

        // One more pass over the final 8 bytes (offset 0) unless the offset
        // has already advanced by a full 8 past it.
        cmp(cnt2_neg, (u1)8);
        mov(cnt2_neg, 0);
        br(LT, CH1_LOOP);
        b(NOMATCH);

      BIND(HAS_ZERO);
        // Byte-reverse and count leading zeros to locate the first matching
        // lane; >> 3 converts the bit position into a byte offset.
        rev(tmp1, tmp1);
        clz(tmp1, tmp1);
        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
        b(MATCH);

      // Source shorter than 8: plain one-character-at-a-time loop.
      BIND(DO1_SHORT);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmpw(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // result = result_tmp + (negative byte offset converted to characters).
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                         Register ch, Register result,
                                         Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register
cnt1_neg = cnt1; 4805 Register ch1 = rscratch1; 4806 Register result_tmp = rscratch2; 4807 4808 cmp(cnt1, (u1)4); 4809 br(LT, DO1_SHORT); 4810 4811 orr(ch, ch, ch, LSL, 16); 4812 orr(ch, ch, ch, LSL, 32); 4813 4814 sub(cnt1, cnt1, 4); 4815 mov(result_tmp, cnt1); 4816 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 4817 sub(cnt1_neg, zr, cnt1, LSL, 1); 4818 4819 mov(tmp3, 0x0001000100010001); 4820 4821 BIND(CH1_LOOP); 4822 ldr(ch1, Address(str1, cnt1_neg)); 4823 eor(ch1, ch, ch1); 4824 sub(tmp1, ch1, tmp3); 4825 orr(tmp2, ch1, 0x7fff7fff7fff7fff); 4826 bics(tmp1, tmp1, tmp2); 4827 br(NE, HAS_ZERO); 4828 adds(cnt1_neg, cnt1_neg, 8); 4829 br(LT, CH1_LOOP); 4830 4831 cmp(cnt1_neg, (u1)8); 4832 mov(cnt1_neg, 0); 4833 br(LT, CH1_LOOP); 4834 b(NOMATCH); 4835 4836 BIND(HAS_ZERO); 4837 rev(tmp1, tmp1); 4838 clz(tmp1, tmp1); 4839 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 4840 b(MATCH); 4841 4842 BIND(DO1_SHORT); 4843 mov(result_tmp, cnt1); 4844 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 4845 sub(cnt1_neg, zr, cnt1, LSL, 1); 4846 BIND(DO1_LOOP); 4847 ldrh(ch1, Address(str1, cnt1_neg)); 4848 cmpw(ch, ch1); 4849 br(EQ, MATCH); 4850 adds(cnt1_neg, cnt1_neg, 2); 4851 br(LT, DO1_LOOP); 4852 BIND(NOMATCH); 4853 mov(result, -1); 4854 b(DONE); 4855 BIND(MATCH); 4856 add(result, result_tmp, cnt1_neg, ASR, 1); 4857 BIND(DONE); 4858 } 4859 4860 // Compare strings. 
// Emit code for a lexicographic comparison of two strings.
//
// str1, str2: addresses of the first characters (clobbered)
// cnt1, cnt2: lengths in bytes on entry (clobbered)
// result:     receives the difference of the first pair of characters that
//             differ, or, if none differ within the shorter length, the
//             difference of the two lengths in characters
// tmp1, tmp2: scratch; rscratch1 and rscratch2 are clobbered as well
// vtmp1, vtmp2: vector scratch, used by the mixed-encoding (LU/UL) paths;
//             vtmp3 is not referenced in this body
// ae:         StrIntrinsicNode encoding combination (LL, LU, UL or UU)
void MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  const u1 STUB_THRESHOLD = 64 + 8;
  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  // vtmpZ is zeroed and zipped with 1-byte characters to widen them to
  // 2-byte characters in the LU/UL paths.
  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LT, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      // Identical addresses: nothing to compare, result already holds the
      // length difference.
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, STUB_THRESHOLD);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      // Point both strings at their last longword and turn cnt2 into a
      // negative byte offset from there.
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, STUB_THRESHOLD);
      br(GE, STUB);
      subsw(cnt2, cnt2, 4);
      br(EQ, TAIL_CHECK);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      // Interleave the four loaded bytes with zero bytes: 4 x 16-bit chars.
      zip1(vtmp, T8B, vtmp, vtmpZ);
      // cnt1 and cnt2 become separate negative byte offsets because the two
      // strings advance at different byte rates.
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldrs(vtmp, Address(str2));
      cmp(cnt2, STUB_THRESHOLD);
      br(GE, STUB);
      subsw(cnt2, cnt2, 4);
      br(EQ, TAIL_CHECK);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFFERENCE);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFFERENCE);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFFERENCE);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFFERENCE);
    // rev + clz gives the bit position of the lowest-addressed differing
    // byte; the mask rounds it down to a character boundary.
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  // Strings with at least STUB_THRESHOLD characters in common are handed to
  // an out-of-line stub selected by encoding.
  bind(STUB);
    RuntimeAddress stub = NULL;
    switch(ae) {
      case StrIntrinsicNode::LL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
        break;
      case StrIntrinsicNode::UU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
        break;
      case StrIntrinsicNode::LU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
        break;
      case StrIntrinsicNode::UL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
        break;
      default:
        ShouldNotReachHere();
    }
    assert(stub.target() != NULL, "compare_long_string stub has not been generated");
    trampoline_call(stub);
    b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // arrange code to do most branches while loading and loading next characters
  // while comparing previous
  // The loop is unrolled by two: one character pair lives in (tmp1, cnt1),
  // the other in (tmp2, rscratch1).  cnt1 is free to hold a character here
  // because the length difference is already saved in result.
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}
#endif // COMPILER2

// This method checks if provided byte array contains byte with highest bit set.
//
// ary1:   address of the first byte (clobbered on the inline path)
// len:    number of bytes (clobbered on the inline path)
// result: on the inline path, set to 1 if a byte with its top bit set was
//         found and 0 otherwise; on the stub paths the value is whatever
//         the called stub leaves (not visible here)
// rscratch1 and rscratch2 are clobbered.
void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
    // Simple and most common case of aligned small array which is not at the
    // end of memory page is placed here. All other cases are in stub.
    Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
    const uint64_t UPPER_BIT_MASK=0x8080808080808080;
    assert_different_registers(ary1, len, result);

    // SET_RESULT materializes the NE flag, so a zero length yields 0.
    cmpw(len, 0);
    br(LE, SET_RESULT);
    cmpw(len, 4 * wordSize);
    br(GE, STUB_LONG); // size >= 32 bytes then go to stub

    // Shift the in-page offset of ary1 into the top bits; adding the
    // equally shifted 32 then carries out exactly when ary1 + 32 reaches
    // or crosses the end of the page.
    int shift = 64 - exact_log2(os::vm_page_size());
    lsl(rscratch1, ary1, shift);
    mov(rscratch2, (size_t)(4 * wordSize) << shift);
    adds(rscratch2, rscratch1, rscratch2);  // At end of page?
    br(CS, STUB); // at the end of page then go to stub
    subs(len, len, wordSize);
    br(LT, END);

  BIND(LOOP);
    ldr(rscratch1, Address(post(ary1, wordSize)));
    tst(rscratch1, UPPER_BIT_MASK);
    br(NE, SET_RESULT);
    subs(len, len, wordSize);
    br(GE, LOOP);
    // len == -wordSize means the length was a multiple of 8 and every word
    // has been checked; EQ makes SET_RESULT produce 0.
    cmpw(len, -wordSize);
    br(EQ, SET_RESULT);

  BIND(END);
    // Fewer than 8 bytes remain.  Load a full word anyway (the page check
    // above keeps it inside the page) and shift the bytes beyond the end
    // of the array out of the register before testing.
    ldr(result, Address(ary1));
    sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
    lslv(result, result, len);
    tst(result, UPPER_BIT_MASK);
    b(SET_RESULT);

  BIND(STUB);
    RuntimeAddress has_neg =  RuntimeAddress(StubRoutines::aarch64::has_negatives());
    assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
    trampoline_call(has_neg);
    b(DONE);

  BIND(STUB_LONG);
    RuntimeAddress has_neg_long =  RuntimeAddress(
            StubRoutines::aarch64::has_negatives_long());
    assert(has_neg_long.target() != NULL, "has_negatives stub has not been generated");
    trampoline_call(has_neg_long);
    b(DONE);

  BIND(SET_RESULT);
    cset(result, NE); // set true or false

  BIND(DONE);
}

// Emit code that compares two byte[] or char[] arrays for equality.
//
// a1, a2:    array references, either may be null (clobbered)
// tmp3, tmp4, tmp5: scratch registers
// result:    receives true if both references are the same or the arrays
//            have equal length and contents, false otherwise (on the
//            large-array path the value is left by the called stub)
// cnt1:      scratch, holds the element count
// elem_size: element size in bytes, 1 or 2
// rscratch1 and rscratch2 are clobbered.
void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
                                   Register tmp4, Register tmp5, Register result,
                                   Register cnt1, int elem_size) {
  Label DONE, SAME;
  Register tmp1 = rscratch1;
  Register tmp2 = rscratch2;
  Register cnt2 = tmp2;  // cnt2 only used in array length compare
  int elem_per_word = wordSize/elem_size;
  int log_elem_size = exact_log2(elem_size);
  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset
    = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
  int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);

  assert(elem_size == 1 || elem_size == 2, "must be char or byte");
  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);

#ifndef PRODUCT
  {
    const char kind = (elem_size == 2) ? 'U' : 'L';
    char comment[64];
    snprintf(comment, sizeof comment, "array_equals%c{", kind);
    BLOCK_COMMENT(comment);
  }
#endif

  // if (a1 == a2)
  //     return true;
  cmpoop(a1, a2); // May have read barriers for a1 and a2.
  br(EQ, SAME);

  if (UseSimpleArrayEquals) {
    Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
    // if (a1 == null || a2 == null)
    //     return false;
    // a1 & a2 == 0 means (some-pointer is null) or
    // (very-rare-or-even-probably-impossible-pointer-values)
    // so, we can save one branch in most cases
    tst(a1, a2);
    mov(result, false);
    br(EQ, A_MIGHT_BE_NULL);
    // if (a1.length != a2.length)
    //      return false;
    bind(A_IS_NOT_NULL);
    ldrw(cnt1, Address(a1, length_offset));
    ldrw(cnt2, Address(a2, length_offset));
    eorw(tmp5, cnt1, cnt2);
    cbnzw(tmp5, DONE);
    lea(a1, Address(a1, base_offset));
    lea(a2, Address(a2, base_offset));
    // Check for short strings, i.e. smaller than wordSize.
    subs(cnt1, cnt1, elem_per_word);
    br(Assembler::LT, SHORT);
    // Main 8 byte comparison loop.
    bind(NEXT_WORD); {
      ldr(tmp1, Address(post(a1, wordSize)));
      ldr(tmp2, Address(post(a2, wordSize)));
      subs(cnt1, cnt1, elem_per_word);
      eor(tmp5, tmp1, tmp2);
      cbnz(tmp5, DONE);
    } br(GT, NEXT_WORD);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
    // length == 4.
    if (log_elem_size > 0)
      lsl(cnt1, cnt1, log_elem_size);
    ldr(tmp3, Address(a1, cnt1));
    ldr(tmp4, Address(a2, cnt1));
    eor(tmp5, tmp3, tmp4);
    cbnz(tmp5, DONE);
    b(SAME);
    bind(A_MIGHT_BE_NULL);
    // in case both a1 and a2 are not-null, proceed with loads
    cbz(a1, DONE);
    cbz(a2, DONE);
    b(A_IS_NOT_NULL);
    bind(SHORT);

    // cnt1 is (length - elem_per_word) here; its low bits still encode how
    // many 4-, 2- and 1-byte pieces remain.
    tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
    {
      ldrw(tmp1, Address(post(a1, 4)));
      ldrw(tmp2, Address(post(a2, 4)));
      eorw(tmp5, tmp1, tmp2);
      cbnzw(tmp5, DONE);
    }
    bind(TAIL03);
    tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
    {
      ldrh(tmp3, Address(post(a1, 2)));
      ldrh(tmp4, Address(post(a2, 2)));
      eorw(tmp5, tmp3, tmp4);
      cbnzw(tmp5, DONE);
    }
    bind(TAIL01);
    if (elem_size == 1) { // Only needed when comparing byte arrays.
      tbz(cnt1, 0, SAME); // 0-1 bytes left.
      {
        ldrb(tmp1, a1);
        ldrb(tmp2, a2);
        eorw(tmp5, tmp1, tmp2);
        cbnzw(tmp5, DONE);
      }
    }
  } else {
    Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
        CSET_EQ, LAST_CHECK;
    mov(result, false);
    cbz(a1, DONE);
    ldrw(cnt1, Address(a1, length_offset));
    cbz(a2, DONE);
    ldrw(cnt2, Address(a2, length_offset));
    // on most CPUs a2 is still "locked"(surprisingly) in ldrw and it's
    // faster to perform another branch before comparing a1 and a2
    cmp(cnt1, (u1)elem_per_word);
    br(LE, SHORT); // short or same
    ldr(tmp3, Address(pre(a1, base_offset)));
    subs(zr, cnt1, stubBytesThreshold);
    br(GE, STUB);
    ldr(tmp4, Address(pre(a2, base_offset)));
    // tmp5 = -(length in bits); lslv uses only the low 6 bits of it, so
    // shifting by tmp5 later discards the bytes of the last word that lie
    // beyond the end of the array.
    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
    cmp(cnt2, cnt1);
    br(NE, DONE);

    // Main 16 byte comparison loop with 2 exits
    bind(NEXT_DWORD); {
      ldr(tmp1, Address(pre(a1, wordSize)));
      ldr(tmp2, Address(pre(a2, wordSize)));
      subs(cnt1, cnt1, 2 * elem_per_word);
      br(LE, TAIL);
      eor(tmp4, tmp3, tmp4);
      cbnz(tmp4, DONE);
      ldr(tmp3, Address(pre(a1, wordSize)));
      ldr(tmp4, Address(pre(a2, wordSize)));
      cmp(cnt1, (u1)elem_per_word);
      br(LE, TAIL2);
      cmp(tmp1, tmp2);
    } br(EQ, NEXT_DWORD);
    b(DONE);

    bind(TAIL);
    eor(tmp4, tmp3, tmp4);
    eor(tmp2, tmp1, tmp2);
    lslv(tmp2, tmp2, tmp5);
    orr(tmp5, tmp4, tmp2);
    cmp(tmp5, zr);
    b(CSET_EQ);

    bind(TAIL2);
    eor(tmp2, tmp1, tmp2);
    cbnz(tmp2, DONE);
    b(LAST_CHECK);

    bind(STUB);
    ldr(tmp4, Address(pre(a2, base_offset)));
    cmp(cnt2, cnt1);
    br(NE, DONE);
    if (elem_size == 2) { // convert to byte counter
      lsl(cnt1, cnt1, 1);
    }
    eor(tmp5, tmp3, tmp4);
    cbnz(tmp5, DONE);
    RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
    assert(stub.target() != NULL, "array_equals_long stub has not been generated");
    trampoline_call(stub);
    b(DONE);

    // NOTE(review): no branch in this function targets EARLY_OUT and the
    // instruction before it is an unconditional branch, so the two
    // instructions emitted below appear to be unreachable.
    bind(EARLY_OUT);
    // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
    // so, if a2 == null => return false(0), else return true, so we can return a2
    mov(result, a2);
    b(DONE);
    bind(SHORT);
    cmp(cnt2, cnt1);
    br(NE, DONE);
    cbz(cnt1, SAME);
    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
    ldr(tmp3, Address(a1, base_offset));
    ldr(tmp4, Address(a2, base_offset));
    bind(LAST_CHECK);
    eor(tmp4, tmp3, tmp4);
    lslv(tmp5, tmp4, tmp5);
    cmp(tmp5, zr);
    bind(CSET_EQ);
    cset(result, EQ);
    b(DONE);
  }

  bind(SAME);
  mov(result, true);
  // That's it.
  bind(DONE);

  BLOCK_COMMENT("} array_equals");
}

// Compare Strings

// For Strings we're passed the address of the first characters in a1
// and a2 and the length in cnt1.
// elem_size is the element size in bytes: either 1 or 2.
// There are two implementations.  For arrays >= 8 bytes, all
// comparisons (including the final one, which may overlap) are
// performed 8 bytes at a time.  For strings < 8 bytes, we compare a
// word, then a halfword, and then a byte.
//
// The code below treats cnt1 as a byte count: it subtracts wordSize per
// 8-byte load and tests bits 2, 1 and 0 for the 4-, 2- and 1-byte pieces.
// result receives true if the two ranges are equal and false otherwise.
// a1, a2, cnt1, rscratch1 and rscratch2 are clobbered.

void MacroAssembler::string_equals(Register a1, Register a2,
                                   Register result, Register cnt1, int elem_size)
{
  Label SAME, DONE, SHORT, NEXT_WORD;
  Register tmp1 = rscratch1;
  Register tmp2 = rscratch2;
  Register cnt2 = tmp2;  // cnt2 only used in array length compare

  assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);

#ifndef PRODUCT
  {
    const char kind = (elem_size == 2) ? 'U' : 'L';
    char comment[64];
    snprintf(comment, sizeof comment, "{string_equals%c", kind);
    BLOCK_COMMENT(comment);
  }
#endif

  mov(result, false);

  // Check for short strings, i.e. smaller than wordSize.
  subs(cnt1, cnt1, wordSize);
  br(Assembler::LT, SHORT);
  // Main 8 byte comparison loop.
  bind(NEXT_WORD); {
    ldr(tmp1, Address(post(a1, wordSize)));
    ldr(tmp2, Address(post(a2, wordSize)));
    subs(cnt1, cnt1, wordSize);
    eor(tmp1, tmp1, tmp2);
    cbnz(tmp1, DONE);
  } br(GT, NEXT_WORD);
  // Last longword.  In the case where length == 4 we compare the
  // same longword twice, but that's still faster than another
  // conditional branch.
  // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
  // length == 4.
  ldr(tmp1, Address(a1, cnt1));
  ldr(tmp2, Address(a2, cnt1));
  eor(tmp2, tmp1, tmp2);
  cbnz(tmp2, DONE);
  b(SAME);

  bind(SHORT);
  Label TAIL03, TAIL01;

  tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
  {
    ldrw(tmp1, Address(post(a1, 4)));
    ldrw(tmp2, Address(post(a2, 4)));
    eorw(tmp1, tmp1, tmp2);
    cbnzw(tmp1, DONE);
  }
  bind(TAIL03);
  tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
  {
    ldrh(tmp1, Address(post(a1, 2)));
    ldrh(tmp2, Address(post(a2, 2)));
    eorw(tmp1, tmp1, tmp2);
    cbnzw(tmp1, DONE);
  }
  bind(TAIL01);
  if (elem_size == 1) { // Only needed when comparing 1-byte elements
    tbz(cnt1, 0, SAME); // 0-1 bytes left.
    {
      ldrb(tmp1, a1);
      ldrb(tmp2, a2);
      eorw(tmp1, tmp1, tmp2);
      cbnzw(tmp1, DONE);
    }
  }
  // Arrays are equal.
  bind(SAME);
  mov(result, true);

  // That's it.
  bind(DONE);
  BLOCK_COMMENT("} string_equals");
}


// The size of the blocks erased by the zero_blocks stub.  We must
// handle anything smaller than this ourselves in zero_words().
const int MacroAssembler::zero_words_block_size = 8;

// zero_words() is used by C2 ClearArray patterns.  It is as small as
// possible, handling small word counts locally and delegating
// anything larger to the zero_blocks stub.  It is expanded many times
// in compiled code, so it is important to keep it short.

// ptr:   Address of a buffer to be zeroed.
// cnt:   Count in HeapWords.
//
// ptr, cnt, rscratch1, and rscratch2 are clobbered.
5451 void MacroAssembler::zero_words(Register ptr, Register cnt) 5452 { 5453 assert(is_power_of_2(zero_words_block_size), "adjust this"); 5454 assert(ptr == r10 && cnt == r11, "mismatch in register usage"); 5455 5456 BLOCK_COMMENT("zero_words {"); 5457 cmp(cnt, (u1)zero_words_block_size); 5458 Label around; 5459 br(LO, around); 5460 { 5461 RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks()); 5462 assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated"); 5463 if (StubRoutines::aarch64::complete()) { 5464 trampoline_call(zero_blocks); 5465 } else { 5466 bl(zero_blocks); 5467 } 5468 } 5469 bind(around); 5470 for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) { 5471 Label l; 5472 tbz(cnt, exact_log2(i), l); 5473 for (int j = 0; j < i; j += 2) { 5474 stp(zr, zr, post(ptr, 16)); 5475 } 5476 bind(l); 5477 } 5478 { 5479 Label l; 5480 tbz(cnt, 0, l); 5481 str(zr, Address(ptr)); 5482 bind(l); 5483 } 5484 BLOCK_COMMENT("} zero_words"); 5485 } 5486 5487 // base: Address of a buffer to be zeroed, 8 bytes aligned. 5488 // cnt: Immediate count in HeapWords. 
// Largest size, in bytes, that the constant-count zero_words() below
// expands into straight-line stores; larger counts use a loop.
#define SmallArraySize (18 * BytesPerLong)
// Emit code that zeroes a compile-time constant number (cnt) of HeapWords
// starting at base.  rscratch1 and rscratch2 are used only on the loop
// path; base itself is not modified.
void MacroAssembler::zero_words(Register base, u_int64_t cnt)
{
  BLOCK_COMMENT("zero_words {");
  int i = cnt & 1;  // store any odd word to start
  if (i) str(zr, Address(base));

  if (cnt <= SmallArraySize / BytesPerLong) {
    // Small count: fully unrolled pairs of words.
    for (; i < (int)cnt; i += 2)
      stp(zr, zr, Address(base, i * wordSize));
  } else {
    const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
    // Words that do not fill a whole loop iteration are stored up front.
    int remainder = cnt % (2 * unroll);
    for (; i < remainder; i += 2)
      stp(zr, zr, Address(base, i * wordSize));

    Label loop;
    Register cnt_reg = rscratch1;
    Register loop_base = rscratch2;
    cnt = cnt - remainder;
    mov(cnt_reg, cnt);
    // adjust base and prebias by -2 * wordSize so we can pre-increment
    add(loop_base, base, (remainder - 2) * wordSize);
    bind(loop);
    sub(cnt_reg, cnt_reg, 2 * unroll);
    // Each iteration zeroes 2 * unroll words: unroll - 1 stores at fixed
    // offsets, then one pre-indexed store that also advances loop_base.
    for (i = 1; i < unroll; i++)
      stp(zr, zr, Address(loop_base, 2 * i * wordSize));
    stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
    cbnz(cnt_reg, loop);
  }
  BLOCK_COMMENT("} zero_words");
}

// Zero blocks of memory by using DC ZVA.
//
// Aligns the base address first sufficiently for DC ZVA, then uses
// DC ZVA repeatedly for every full block.  cnt is the size to be
// zeroed in HeapWords.  Returns the count of words left to be zeroed
// in cnt.
//
// NOTE: This is intended to be used in the zero_blocks() stub.  If
// you want to use it elsewhere, note that cnt must be >= 2*zva_length.
//
// base is advanced past everything that was zeroed; rscratch1 and
// rscratch2 are clobbered.
void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
  Register tmp = rscratch1;
  Register tmp2 = rscratch2;
  int zva_length = VM_Version::zva_length();
  Label initial_table_end, loop_zva;
  Label fini;

  // Base must be 16 byte aligned. If not just return and let caller handle it
  tst(base, 0x0f);
  br(Assembler::NE, fini);
  // Align base with ZVA length.
  neg(tmp, base);
  andr(tmp, tmp, zva_length - 1);

  // tmp: the number of bytes to be filled to align the base with ZVA length.
  add(base, base, tmp);
  sub(cnt, cnt, tmp, Assembler::ASR, 3);
  // Computed branch into the stp table below: every stp zeroes 16 bytes,
  // and the LSR 2 scaling (tmp / 16 * 4) assumes each stp occupies 4 bytes
  // of code, so we jump back tmp / 4 bytes from the end of the table.
  adr(tmp2, initial_table_end);
  sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
  br(tmp2);

  // base has already been advanced, hence the negative offsets.
  for (int i = -zva_length + 16; i < 0; i += 16)
    stp(zr, zr, Address(base, i));
  bind(initial_table_end);

  // Pre-subtract one block so the loop can exit on the sign of cnt.
  sub(cnt, cnt, zva_length >> 3);
  bind(loop_zva);
  dc(Assembler::ZVA, base);
  subs(cnt, cnt, zva_length >> 3);
  add(base, base, zva_length);
  br(Assembler::GE, loop_zva);
  add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
  bind(fini);
}

// base:   Address of a buffer to be filled, 8 bytes aligned.
// cnt:    Count in 8-byte unit.
// value:  Value to be filled with.
// base will point to the end of the buffer after filling.
// cnt, rscratch1 and rscratch2 are clobbered.
void MacroAssembler::fill_words(Register base, Register cnt, Register value)
{
  //  Algorithm:
  //
  //    scratch1 = cnt & 7;
  //    cnt -= scratch1;
  //    p += scratch1;
  //    switch (scratch1) {
  //      do {
  //        cnt -= 8;
  //          p[-8] = v;
  //        case 7:
  //          p[-7] = v;
  //        case 6:
  //          p[-6] = v;
  //          // ...
  //        case 1:
  //          p[-1] = v;
  //        case 0:
  //          p += 8;
  //      } while (cnt);
  //    }
  //
  //  The code below applies this scheme to pairs of words: each unrolled
  //  slot is one stp, and a possible odd word is stored separately.

  assert_different_registers(base, cnt, value, rscratch1, rscratch2);

  Label fini, skip, entry, loop;
  const int unroll = 8; // Number of stp instructions we'll unroll

  cbz(cnt, fini);
  // If base is only 8-byte aligned, store one word first so the stp loop
  // runs on a 16-byte aligned address.
  tbz(base, 3, skip);
  str(value, Address(post(base, 8)));
  sub(cnt, cnt, 1);
  bind(skip);

  // rscratch1 = number of words (even, at most 14) to store on the first,
  // partial pass through the unrolled stores.
  andr(rscratch1, cnt, (unroll-1) * 2);
  sub(cnt, cnt, rscratch1);
  add(base, base, rscratch1, Assembler::LSL, 3);
  // Jump backwards from 'entry' by one stp per two words; the LSL 1
  // scaling (2 code bytes per word) assumes each stp occupies 4 bytes.
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
  br(rscratch2);

  bind(loop);
  add(base, base, unroll * 16);
  for (int i = -unroll; i < 0; i++)
    stp(value, value, Address(base, i * 16));
  bind(entry);
  subs(cnt, cnt, unroll * 2);
  br(Assembler::GE, loop);

  // Bit 0 of cnt is unaffected by the even subtractions above and tells
  // us whether one last word is still to be stored.
  tbz(cnt, 0, fini);
  str(value, Address(post(base, 8)));
  bind(fini);
}

// Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
// java/lang/StringUTF16.compress.
//
// Copies 16-bit characters from src to dst as single bytes until len
// characters have been copied or a character with a non-zero high byte is
// met.
//
// src, dst: source and destination addresses (clobbered)
// len:      number of characters; on exit 0 if everything was processed
// result:   number of characters processed
// Vtmp1..Vtmp4: vector scratch.  NOTE: the 32-character paths also write
//           v4 and v5 directly.  rscratch1 and rscratch2 are clobbered.
void MacroAssembler::encode_iso_array(Register src, Register dst,
                      Register len, Register result,
                      FloatRegister Vtmp1, FloatRegister Vtmp2,
                      FloatRegister Vtmp3, FloatRegister Vtmp4)
{
    Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
        NEXT_32_START, NEXT_32_PRFM_START;
    Register tmp1 = rscratch1, tmp2 = rscratch2;

      mov(result, len); // Save initial len

#ifndef BUILTIN_SIM
      cmp(len, (u1)8); // handle shortest strings first
      br(LT, LOOP_1);
      cmp(len, (u1)32);
      br(LT, NEXT_8);
      // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
      // to convert chars to bytes
      if (SoftwarePrefetchHintDistance >= 0) {
        // Two 32-character loops: NEXT_32_PRFM issues a prefetch and is
        // used while more than the prefetch distance remains; NEXT_32
        // handles the rest.
        ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
        br(LE, NEXT_32_START);
        b(NEXT_32_PRFM_START);
        BIND(NEXT_32_PRFM);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_PRFM_START);
          prfm(Address(src, SoftwarePrefetchHintDistance));
          orr(v4, T16B, Vtmp1, Vtmp2);
          orr(v5, T16B, Vtmp3, Vtmp4);
          uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
          uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
          // The bytes are stored before the high-byte check; on failure
          // the slower loops redo this group from the unchanged src/dst.
          stpq(Vtmp1, Vtmp3, dst);
          uzp2(v5, T16B, v4, v5); // high bytes
          umov(tmp2, v5, D, 1);
          fmovd(tmp1, v5);
          orr(tmp1, tmp1, tmp2);
          cbnz(tmp1, LOOP_8);
          sub(len, len, 32);
          add(dst, dst, 32);
          add(src, src, 64);
          subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
          br(GE, NEXT_32_PRFM);
          cmp(len, (u1)32);
          br(LT, LOOP_8);
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_START);
      } else {
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
      }
      prfm(Address(src, SoftwarePrefetchHintDistance));
      uzp1(v4, T16B, Vtmp1, Vtmp2);
      uzp1(v5, T16B, Vtmp3, Vtmp4);
      stpq(v4, v5, dst);
      orr(Vtmp1, T16B, Vtmp1, Vtmp2);
      orr(Vtmp3, T16B, Vtmp3, Vtmp4);
      uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
      umov(tmp2, Vtmp1, D, 1);
      fmovd(tmp1, Vtmp1);
      orr(tmp1, tmp1, tmp2);
      cbnz(tmp1, LOOP_8);
      sub(len, len, 32);
      add(dst, dst, 32);
      add(src, src, 64);
      cmp(len, (u1)32);
      br(GE, NEXT_32);
      cbz(len, DONE);

    BIND(LOOP_8);
      cmp(len, (u1)8);
      br(LT, LOOP_1);
    BIND(NEXT_8);
      ld1(Vtmp1, T8H, src);
      uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
      uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
      strd(Vtmp2, dst);
      fmovd(tmp1, Vtmp3);
      // A non-zero high byte among these 8 characters: let the
      // one-at-a-time loop find the exact position.
      cbnz(tmp1, NEXT_1);

      sub(len, len, 8);
      add(dst, dst, 8);
      add(src, src, 16);
      cmp(len, (u1)8);
      br(GE, NEXT_8);

    BIND(LOOP_1);
#endif
    cbz(len, DONE);
    BIND(NEXT_1);
      ldrh(tmp1, Address(post(src, 2)));
      strb(tmp1, Address(post(dst, 1)));
      tst(tmp1, 0xff00);
      br(NE, SET_RESULT);
      subs(len, len, 1);
      br(GT, NEXT_1);

    BIND(SET_RESULT);
      sub(result, result, len); // Return index where we stopped
                                // Return len == 0 if we processed all
                                // characters
    BIND(DONE);
}


// Inflate byte[] array to char[].
//
// src, dst: source and destination addresses (clobbered)
// len:      number of bytes to inflate (clobbered)
// vtmp1:    set to zero and interleaved with the source bytes to widen them
// vtmp2, vtmp3: vector scratch
// tmp4:     scratch, holds the number of 8-byte groups
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
                                        FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                        Register tmp4) {
  Label big, done, after_init, to_stub;

  assert_different_registers(src, dst, len, tmp4, rscratch1);

  fmovd(vtmp1, zr);
  lsrw(tmp4, len, 3);
  // The large-array stub path branches back here after its call.
  bind(after_init);
  cbnzw(tmp4, big);
  // Short string: less than 8 bytes.
  {
    Label loop, tiny;

    cmpw(len, 4);
    br(LT, tiny);
    // Use SIMD to do 4 bytes.
    ldrs(vtmp2, post(src, 4));
    zip1(vtmp3, T8B, vtmp2, vtmp1);
    subw(len, len, 4);
    strd(vtmp3, post(dst, 8));

    cbzw(len, done);

    // Do the remaining bytes one at a time.
    bind(loop);
    ldrb(tmp4, post(src, 1));
    strh(tmp4, post(dst, 2));
    subw(len, len, 1);

    bind(tiny);
    cbnz(len, loop);

    b(done);
  }

  if (SoftwarePrefetchHintDistance >= 0) {
    bind(to_stub);
      RuntimeAddress stub =  RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
      assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
      trampoline_call(stub);
      b(after_init);
  }

  // Unpack the bytes 8 at a time.
  bind(big);
  {
    Label loop, around, loop_last, loop_start;

    if (SoftwarePrefetchHintDistance >= 0) {
      const int large_loop_threshold = (64 + 16)/8;
      ldrd(vtmp2, post(src, 8));
      // From here on len is only the sub-8-byte remainder; the number of
      // full 8-byte groups is in tmp4.
      andw(len, len, 7);
      cmp(tmp4, (u1)large_loop_threshold);
      br(GE, to_stub);
      b(loop_start);

      // Two groups per iteration, with the loads issued ahead of the
      // stores of the previous group.
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      bind(loop_start);
      subs(tmp4, tmp4, 1);
      br(EQ, loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      ldrd(vtmp3, post(src, 8));
      st1(vtmp2, T8H, post(dst, 16));
      subs(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp3, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      br(NE, loop);
      b(around);
      bind(loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      st1(vtmp2, T8H, post(dst, 16));
      bind(around);
      cbz(len, done);
    } else {
      andw(len, len, 7);
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      sub(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp2, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      cbnz(tmp4, loop);
    }
  }

  // Do the tail of up to 8 bytes.
  // The last 8 source bytes are re-read and the last 16 destination bytes
  // rewritten, overlapping data that has already been inflated.
  add(src, src, len);
  ldrd(vtmp3, Address(src, -8));
  add(dst, dst, len, ext::uxtw, 1);
  zip1(vtmp3, T16B, vtmp3, vtmp1);
  strq(vtmp3, Address(dst, -16));

  bind(done);
}

// Compress char[] array to byte[].
5830 void MacroAssembler::char_array_compress(Register src, Register dst, Register len, 5831 FloatRegister tmp1Reg, FloatRegister tmp2Reg, 5832 FloatRegister tmp3Reg, FloatRegister tmp4Reg, 5833 Register result) { 5834 encode_iso_array(src, dst, len, result, 5835 tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg); 5836 cmp(len, zr); 5837 csel(result, result, zr, EQ); 5838 } 5839 5840 // get_thread() can be called anywhere inside generated code so we 5841 // need to save whatever non-callee save context might get clobbered 5842 // by the call to JavaThread::aarch64_get_thread_helper() or, indeed, 5843 // the call setup code. 5844 // 5845 // aarch64_get_thread_helper() clobbers only r0, r1, and flags. 5846 // 5847 void MacroAssembler::get_thread(Register dst) { 5848 RegSet saved_regs = RegSet::range(r0, r1) + lr - dst; 5849 push(saved_regs, sp); 5850 5851 mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper)); 5852 blrt(lr, 1, 0, 1); 5853 if (dst != c_rarg0) { 5854 mov(dst, c_rarg0); 5855 } 5856 5857 pop(saved_regs, sp); 5858 }