1 /* 2 * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 
23 * 24 */ 25 26 #include <sys/types.h> 27 28 #include "precompiled.hpp" 29 #include "jvm.h" 30 #include "asm/assembler.hpp" 31 #include "asm/assembler.inline.hpp" 32 #include "gc/shared/barrierSet.hpp" 33 #include "gc/shared/cardTable.hpp" 34 #include "gc/shared/barrierSetAssembler.hpp" 35 #include "gc/shared/cardTableBarrierSet.hpp" 36 #include "interpreter/interpreter.hpp" 37 #include "compiler/disassembler.hpp" 38 #include "memory/resourceArea.hpp" 39 #include "nativeInst_aarch64.hpp" 40 #include "oops/accessDecorators.hpp" 41 #include "oops/compressedOops.inline.hpp" 42 #include "oops/klass.inline.hpp" 43 #include "runtime/biasedLocking.hpp" 44 #include "runtime/icache.hpp" 45 #include "runtime/interfaceSupport.inline.hpp" 46 #include "runtime/jniHandles.inline.hpp" 47 #include "runtime/sharedRuntime.hpp" 48 #include "runtime/thread.hpp" 49 #ifdef COMPILER1 50 #include "c1/c1_LIRAssembler.hpp" 51 #endif 52 #ifdef COMPILER2 53 #include "oops/oop.hpp" 54 #include "opto/compile.hpp" 55 #include "opto/intrinsicnode.hpp" 56 #include "opto/node.hpp" 57 #endif 58 59 #ifdef PRODUCT 60 #define BLOCK_COMMENT(str) /* nothing */ 61 #define STOP(error) stop(error) 62 #else 63 #define BLOCK_COMMENT(str) block_comment(str) 64 #define STOP(error) block_comment(error); stop(error) 65 #endif 66 67 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 68 69 // Patch any kind of instruction; there may be several instructions. 70 // Return the total length (in bytes) of the instructions. 
// Patch the instruction sequence starting at 'branch' so that it refers to
// 'target'.  Decodes the first instruction to recognize which reachable-code
// idiom it belongs to (literal load, branch, adrp sequence, movz/movk/movk
// constant, polling-page load) and rewrites the immediate fields in place.
// Returns the total number of bytes patched.
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  long offset = (target - branch) >> 2;   // word offset for PC-relative forms
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target-branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);  // 1 => adrp, 0 => adr
    if (shift) {
      u_int64_t dest = (u_int64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;     // offset within the 4K target page
      offset = adr_page - pc_page;           // page delta for the adrp itself

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
      //
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
          Instruction_aarch64::extract(insn, 4, 0) ==
          Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        // The ldr/str immediate is scaled by the access size.
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        // The adrp covers bits 32..47 via the movk; recompute the page delta
        // against the low 32 bits of the target combined with the high bits
        // the branch site already shares.
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    // adrp/adr split the 21-bit immediate: low 2 bits in [30:29], rest in [23:5].
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant: movz + movk + movk (16 bits per instruction)
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}

// Patch an oop constant embedded in a movz/movk sequence at 'insn_addr'.
// Returns the number of bytes patched.
int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = CompressedOops::encode((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}

// Patch a narrow-klass constant embedded in a movz/movk pair at 'insn_addr'.
// Returns the number of bytes patched.
int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

// Inverse of pd_patch_instruction_size: decode the instruction (sequence)
// at 'insn_addr' and return the address it refers to.
address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;  // adrp vs adr
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
          Instruction_aarch64::extract(insn, 4, 0) ==
          Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        // Type 3 (movk of bits 32..47) or type 4 (bare page-aligned adrp).
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
            Instruction_aarch64::extract(insn, 4, 0) ==
            Instruction_aarch64::extract(insn2, 4, 0)) {
          target_page = (target_page & 0xffffffff) |
                        ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      // Plain adr is not used for relocated addresses here.
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    u_int32_t *insns = (u_int32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // Polling page load: carries no target.
    return 0;
  } else {
    ShouldNotReachHere();
  }
  return address(((uint64_t)insn_addr + (offset << 2)));
}

// Emit a safepoint poll: branch to 'slow_path' if a safepoint is pending.
// Uses the thread-local polling word when enabled, otherwise the global
// SafepointSynchronize state.  Clobbers rscratch1.
void MacroAssembler::safepoint_poll(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    unsigned long offset;
    adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
    ldrw(rscratch1, Address(rscratch1, offset));
    assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
    cbnz(rscratch1, slow_path);
  }
}

// Just like safepoint_poll, but use an acquiring load for thread-
// local polling.
//
// We need an acquire here to ensure that any subsequent load of the
// global SafepointSynchronize::_state flag is ordered after this load
// of the local Thread::_polling page.  We don't want this poll to
// return false (i.e. not safepointing) and a later poll of the global
// SafepointSynchronize::_state spuriously to return true.
//
// This is to avoid a race when we're in a native->Java transition
// racing the code which wakes up from a safepoint.
//
// Acquiring variant of safepoint_poll (see the block comment above).
// Clobbers rscratch1.
void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    // ldar gives the acquire semantics the comment above requires.
    lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
    ldar(rscratch1, rscratch1);
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    // Global-state polling already loads from a fixed address; the plain
    // poll is sufficient in that configuration.
    safepoint_poll(slow_path);
  }
}

// Clear the recorded last-Java-frame anchor in the current JavaThread.
// 'clear_fp' additionally zeroes the saved frame pointer.
void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp & esp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
345 void MacroAssembler::set_last_Java_frame(Register last_java_sp, 346 Register last_java_fp, 347 Register last_java_pc, 348 Register scratch) { 349 350 if (last_java_pc->is_valid()) { 351 str(last_java_pc, Address(rthread, 352 JavaThread::frame_anchor_offset() 353 + JavaFrameAnchor::last_Java_pc_offset())); 354 } 355 356 // determine last_java_sp register 357 if (last_java_sp == sp) { 358 mov(scratch, sp); 359 last_java_sp = scratch; 360 } else if (!last_java_sp->is_valid()) { 361 last_java_sp = esp; 362 } 363 364 str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset())); 365 366 // last_java_fp is optional 367 if (last_java_fp->is_valid()) { 368 str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset())); 369 } 370 } 371 372 void MacroAssembler::set_last_Java_frame(Register last_java_sp, 373 Register last_java_fp, 374 address last_java_pc, 375 Register scratch) { 376 if (last_java_pc != NULL) { 377 adr(scratch, last_java_pc); 378 } else { 379 // FIXME: This is almost never correct. We should delete all 380 // cases of set_last_Java_frame with last_java_pc=NULL and use the 381 // correct return address instead. 
382 adr(scratch, pc()); 383 } 384 385 str(scratch, Address(rthread, 386 JavaThread::frame_anchor_offset() 387 + JavaFrameAnchor::last_Java_pc_offset())); 388 389 set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch); 390 } 391 392 void MacroAssembler::set_last_Java_frame(Register last_java_sp, 393 Register last_java_fp, 394 Label &L, 395 Register scratch) { 396 if (L.is_bound()) { 397 set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch); 398 } else { 399 InstructionMark im(this); 400 L.add_patch_at(code(), locator()); 401 set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch); 402 } 403 } 404 405 void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) { 406 assert(ReservedCodeCacheSize < 4*G, "branch out of range"); 407 assert(CodeCache::find_blob(entry.target()) != NULL, 408 "destination of far call not found in code cache"); 409 if (far_branches()) { 410 unsigned long offset; 411 // We can use ADRP here because we know that the total size of 412 // the code cache cannot exceed 2Gb. 413 adrp(tmp, entry, offset); 414 add(tmp, tmp, offset); 415 if (cbuf) cbuf->set_insts_mark(); 416 blr(tmp); 417 } else { 418 if (cbuf) cbuf->set_insts_mark(); 419 bl(entry); 420 } 421 } 422 423 void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) { 424 assert(ReservedCodeCacheSize < 4*G, "branch out of range"); 425 assert(CodeCache::find_blob(entry.target()) != NULL, 426 "destination of far call not found in code cache"); 427 if (far_branches()) { 428 unsigned long offset; 429 // We can use ADRP here because we know that the total size of 430 // the code cache cannot exceed 2Gb. 
431 adrp(tmp, entry, offset); 432 add(tmp, tmp, offset); 433 if (cbuf) cbuf->set_insts_mark(); 434 br(tmp); 435 } else { 436 if (cbuf) cbuf->set_insts_mark(); 437 b(entry); 438 } 439 } 440 441 void MacroAssembler::reserved_stack_check() { 442 // testing if reserved zone needs to be enabled 443 Label no_reserved_zone_enabling; 444 445 ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset())); 446 cmp(sp, rscratch1); 447 br(Assembler::LO, no_reserved_zone_enabling); 448 449 enter(); // LR and FP are live. 450 lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone)); 451 mov(c_rarg0, rthread); 452 blr(rscratch1); 453 leave(); 454 455 // We have already removed our own frame. 456 // throw_delayed_StackOverflowError will think that it's been 457 // called by our caller. 458 lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry())); 459 br(rscratch1); 460 should_not_reach_here(); 461 462 bind(no_reserved_zone_enabling); 463 } 464 465 int MacroAssembler::biased_locking_enter(Register lock_reg, 466 Register obj_reg, 467 Register swap_reg, 468 Register tmp_reg, 469 bool swap_reg_contains_mark, 470 Label& done, 471 Label* slow_case, 472 BiasedLockingCounters* counters) { 473 assert(UseBiasedLocking, "why call this otherwise?"); 474 assert_different_registers(lock_reg, obj_reg, swap_reg); 475 476 if (PrintBiasedLockingStatistics && counters == NULL) 477 counters = BiasedLocking::counters(); 478 479 assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg); 480 assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout"); 481 Address mark_addr (obj_reg, oopDesc::mark_offset_in_bytes()); 482 Address klass_addr (obj_reg, oopDesc::klass_offset_in_bytes()); 483 Address saved_mark_addr(lock_reg, 0); 484 485 // Biased locking 486 // See whether the lock is currently biased toward our 
thread and 487 // whether the epoch is still valid 488 // Note that the runtime guarantees sufficient alignment of JavaThread 489 // pointers to allow age to be placed into low bits 490 // First check to see whether biasing is even enabled for this object 491 Label cas_label; 492 int null_check_offset = -1; 493 if (!swap_reg_contains_mark) { 494 null_check_offset = offset(); 495 ldr(swap_reg, mark_addr); 496 } 497 andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place); 498 cmp(tmp_reg, (u1)markOopDesc::biased_lock_pattern); 499 br(Assembler::NE, cas_label); 500 // The bias pattern is present in the object's header. Need to check 501 // whether the bias owner and the epoch are both still current. 502 load_prototype_header(tmp_reg, obj_reg); 503 orr(tmp_reg, tmp_reg, rthread); 504 eor(tmp_reg, swap_reg, tmp_reg); 505 andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place)); 506 if (counters != NULL) { 507 Label around; 508 cbnz(tmp_reg, around); 509 atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2); 510 b(done); 511 bind(around); 512 } else { 513 cbz(tmp_reg, done); 514 } 515 516 Label try_revoke_bias; 517 Label try_rebias; 518 519 // At this point we know that the header has the bias pattern and 520 // that we are not the bias owner in the current epoch. We need to 521 // figure out more details about the state of the header in order to 522 // know what operations can be legally performed on the object's 523 // header. 524 525 // If the low three bits in the xor result aren't clear, that means 526 // the prototype header is no longer biased and we have to revoke 527 // the bias on this object. 528 andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place); 529 cbnz(rscratch1, try_revoke_bias); 530 531 // Biasing is still enabled for this data type. 
See whether the 532 // epoch of the current bias is still valid, meaning that the epoch 533 // bits of the mark word are equal to the epoch bits of the 534 // prototype header. (Note that the prototype header's epoch bits 535 // only change at a safepoint.) If not, attempt to rebias the object 536 // toward the current thread. Note that we must be absolutely sure 537 // that the current epoch is invalid in order to do this because 538 // otherwise the manipulations it performs on the mark word are 539 // illegal. 540 andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place); 541 cbnz(rscratch1, try_rebias); 542 543 // The epoch of the current bias is still valid but we know nothing 544 // about the owner; it might be set or it might be clear. Try to 545 // acquire the bias of the object using an atomic operation. If this 546 // fails we will go in to the runtime to revoke the object's bias. 547 // Note that we first construct the presumed unbiased header so we 548 // don't accidentally blow away another thread's valid bias. 549 { 550 Label here; 551 mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place); 552 andr(swap_reg, swap_reg, rscratch1); 553 orr(tmp_reg, swap_reg, rthread); 554 cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case); 555 // If the biasing toward our thread failed, this means that 556 // another thread succeeded in biasing it toward itself and we 557 // need to revoke that bias. The revocation will occur in the 558 // interpreter runtime in the slow case. 559 bind(here); 560 if (counters != NULL) { 561 atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()), 562 tmp_reg, rscratch1, rscratch2); 563 } 564 } 565 b(done); 566 567 bind(try_rebias); 568 // At this point we know the epoch has expired, meaning that the 569 // current "bias owner", if any, is actually invalid. 
Under these 570 // circumstances _only_, we are allowed to use the current header's 571 // value as the comparison value when doing the cas to acquire the 572 // bias in the current epoch. In other words, we allow transfer of 573 // the bias from one thread to another directly in this situation. 574 // 575 // FIXME: due to a lack of registers we currently blow away the age 576 // bits in this situation. Should attempt to preserve them. 577 { 578 Label here; 579 load_prototype_header(tmp_reg, obj_reg); 580 orr(tmp_reg, rthread, tmp_reg); 581 cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case); 582 // If the biasing toward our thread failed, then another thread 583 // succeeded in biasing it toward itself and we need to revoke that 584 // bias. The revocation will occur in the runtime in the slow case. 585 bind(here); 586 if (counters != NULL) { 587 atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()), 588 tmp_reg, rscratch1, rscratch2); 589 } 590 } 591 b(done); 592 593 bind(try_revoke_bias); 594 // The prototype mark in the klass doesn't have the bias bit set any 595 // more, indicating that objects of this data type are not supposed 596 // to be biased any more. We are going to try to reset the mark of 597 // this object to the prototype value and fall through to the 598 // CAS-based locking scheme. Note that if our CAS fails, it means 599 // that another thread raced us for the privilege of revoking the 600 // bias of this particular object, so it's okay to continue in the 601 // normal locking code. 602 // 603 // FIXME: due to a lack of registers we currently blow away the age 604 // bits in this situation. Should attempt to preserve them. 
605 { 606 Label here, nope; 607 load_prototype_header(tmp_reg, obj_reg); 608 cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope); 609 bind(here); 610 611 // Fall through to the normal CAS-based lock, because no matter what 612 // the result of the above CAS, some thread must have succeeded in 613 // removing the bias bit from the object's header. 614 if (counters != NULL) { 615 atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg, 616 rscratch1, rscratch2); 617 } 618 bind(nope); 619 } 620 621 bind(cas_label); 622 623 return null_check_offset; 624 } 625 626 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) { 627 assert(UseBiasedLocking, "why call this otherwise?"); 628 629 // Check for biased locking unlock case, which is a no-op 630 // Note: we do not have to check the thread ID for two reasons. 631 // First, the interpreter checks for IllegalMonitorStateException at 632 // a higher level. Second, if the bias was revoked while we held the 633 // lock, the object could not be rebiased toward another thread, so 634 // the bias bit would be clear. 
635 ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 636 andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 637 cmp(temp_reg, (u1)markOopDesc::biased_lock_pattern); 638 br(Assembler::EQ, done); 639 } 640 641 static void pass_arg0(MacroAssembler* masm, Register arg) { 642 if (c_rarg0 != arg ) { 643 masm->mov(c_rarg0, arg); 644 } 645 } 646 647 static void pass_arg1(MacroAssembler* masm, Register arg) { 648 if (c_rarg1 != arg ) { 649 masm->mov(c_rarg1, arg); 650 } 651 } 652 653 static void pass_arg2(MacroAssembler* masm, Register arg) { 654 if (c_rarg2 != arg ) { 655 masm->mov(c_rarg2, arg); 656 } 657 } 658 659 static void pass_arg3(MacroAssembler* masm, Register arg) { 660 if (c_rarg3 != arg ) { 661 masm->mov(c_rarg3, arg); 662 } 663 } 664 665 void MacroAssembler::call_VM_base(Register oop_result, 666 Register java_thread, 667 Register last_java_sp, 668 address entry_point, 669 int number_of_arguments, 670 bool check_exceptions) { 671 // determine java_thread register 672 if (!java_thread->is_valid()) { 673 java_thread = rthread; 674 } 675 676 // determine last_java_sp register 677 if (!last_java_sp->is_valid()) { 678 last_java_sp = esp; 679 } 680 681 // debugging support 682 assert(number_of_arguments >= 0 , "cannot have negative number of arguments"); 683 assert(java_thread == rthread, "unexpected register"); 684 #ifdef ASSERT 685 // TraceBytecodes does not use r12 but saves it over the call, so don't verify 686 // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?"); 687 #endif // ASSERT 688 689 assert(java_thread != oop_result , "cannot use the same register for java_thread & oop_result"); 690 assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp"); 691 692 // push java thread (becomes first argument of C function) 693 694 mov(c_rarg0, java_thread); 695 696 // set last Java frame before call 697 assert(last_java_sp 
!= rfp, "can't use rfp"); 698 699 Label l; 700 set_last_Java_frame(last_java_sp, rfp, l, rscratch1); 701 702 // do the call, remove parameters 703 MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l); 704 705 // reset last Java frame 706 // Only interpreter should have to clear fp 707 reset_last_Java_frame(true); 708 709 // C++ interp handles this in the interpreter 710 check_and_handle_popframe(java_thread); 711 check_and_handle_earlyret(java_thread); 712 713 if (check_exceptions) { 714 // check for pending exceptions (java_thread is set upon return) 715 ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset()))); 716 Label ok; 717 cbz(rscratch1, ok); 718 lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry())); 719 br(rscratch1); 720 bind(ok); 721 } 722 723 // get oop result if there is one and reset the value in the thread 724 if (oop_result->is_valid()) { 725 get_vm_result(oop_result, java_thread); 726 } 727 } 728 729 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) { 730 call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions); 731 } 732 733 // Maybe emit a call via a trampoline. If the code cache is small 734 // trampolines won't be emitted. 735 736 address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) { 737 assert(JavaThread::current()->is_Compiler_thread(), "just checking"); 738 assert(entry.rspec().type() == relocInfo::runtime_call_type 739 || entry.rspec().type() == relocInfo::opt_virtual_call_type 740 || entry.rspec().type() == relocInfo::static_call_type 741 || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type"); 742 743 // We need a trampoline if branches are far. 
744 if (far_branches()) { 745 bool in_scratch_emit_size = false; 746 #ifdef COMPILER2 747 // We don't want to emit a trampoline if C2 is generating dummy 748 // code during its branch shortening phase. 749 CompileTask* task = ciEnv::current()->task(); 750 in_scratch_emit_size = 751 (task != NULL && is_c2_compile(task->comp_level()) && 752 Compile::current()->in_scratch_emit_size()); 753 #endif 754 if (!in_scratch_emit_size) { 755 address stub = emit_trampoline_stub(offset(), entry.target()); 756 if (stub == NULL) { 757 return NULL; // CodeCache is full 758 } 759 } 760 } 761 762 if (cbuf) cbuf->set_insts_mark(); 763 relocate(entry.rspec()); 764 if (!far_branches()) { 765 bl(entry.target()); 766 } else { 767 bl(pc()); 768 } 769 // just need to return a non-null address 770 return pc(); 771 } 772 773 774 // Emit a trampoline stub for a call to a target which is too far away. 775 // 776 // code sequences: 777 // 778 // call-site: 779 // branch-and-link to <destination> or <trampoline stub> 780 // 781 // Related trampoline stub for this call site in the stub section: 782 // load the call target from the constant pool 783 // branch (LR still points to the call site above) 784 785 address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset, 786 address dest) { 787 // Max stub size: alignment nop, TrampolineStub. 788 address stub = start_a_stub(NativeInstruction::instruction_size 789 + NativeCallTrampolineStub::instruction_size); 790 if (stub == NULL) { 791 return NULL; // CodeBuffer::expand failed 792 } 793 794 // Create a trampoline stub relocation which relates this trampoline stub 795 // with the call instruction at insts_call_instruction_offset in the 796 // instructions code-section. 
797 align(wordSize); 798 relocate(trampoline_stub_Relocation::spec(code()->insts()->start() 799 + insts_call_instruction_offset)); 800 const int stub_start_offset = offset(); 801 802 // Now, create the trampoline stub's code: 803 // - load the call 804 // - call 805 Label target; 806 ldr(rscratch1, target); 807 br(rscratch1); 808 bind(target); 809 assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset, 810 "should be"); 811 emit_int64((int64_t)dest); 812 813 const address stub_start_addr = addr_at(stub_start_offset); 814 815 assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline"); 816 817 end_a_stub(); 818 return stub_start_addr; 819 } 820 821 void MacroAssembler::c2bool(Register x) { 822 // implements x == 0 ? 0 : 1 823 // note: must only look at least-significant byte of x 824 // since C-style booleans are stored in one byte 825 // only! (was bug) 826 tst(x, 0xff); 827 cset(x, Assembler::NE); 828 } 829 830 address MacroAssembler::ic_call(address entry, jint method_index) { 831 RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index); 832 // address const_ptr = long_constant((jlong)Universe::non_oop_word()); 833 // unsigned long offset; 834 // ldr_constant(rscratch2, const_ptr); 835 movptr(rscratch2, (uintptr_t)Universe::non_oop_word()); 836 return trampoline_call(Address(entry, rh)); 837 } 838 839 // Implementation of call_VM versions 840 841 void MacroAssembler::call_VM(Register oop_result, 842 address entry_point, 843 bool check_exceptions) { 844 call_VM_helper(oop_result, entry_point, 0, check_exceptions); 845 } 846 847 void MacroAssembler::call_VM(Register oop_result, 848 address entry_point, 849 Register arg_1, 850 bool check_exceptions) { 851 pass_arg1(this, arg_1); 852 call_VM_helper(oop_result, entry_point, 1, check_exceptions); 853 } 854 855 void MacroAssembler::call_VM(Register oop_result, 856 address entry_point, 857 Register arg_1, 858 Register arg_2, 859 bool check_exceptions) { 
860 assert(arg_1 != c_rarg2, "smashed arg"); 861 pass_arg2(this, arg_2); 862 pass_arg1(this, arg_1); 863 call_VM_helper(oop_result, entry_point, 2, check_exceptions); 864 } 865 866 void MacroAssembler::call_VM(Register oop_result, 867 address entry_point, 868 Register arg_1, 869 Register arg_2, 870 Register arg_3, 871 bool check_exceptions) { 872 assert(arg_1 != c_rarg3, "smashed arg"); 873 assert(arg_2 != c_rarg3, "smashed arg"); 874 pass_arg3(this, arg_3); 875 876 assert(arg_1 != c_rarg2, "smashed arg"); 877 pass_arg2(this, arg_2); 878 879 pass_arg1(this, arg_1); 880 call_VM_helper(oop_result, entry_point, 3, check_exceptions); 881 } 882 883 void MacroAssembler::call_VM(Register oop_result, 884 Register last_java_sp, 885 address entry_point, 886 int number_of_arguments, 887 bool check_exceptions) { 888 call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions); 889 } 890 891 void MacroAssembler::call_VM(Register oop_result, 892 Register last_java_sp, 893 address entry_point, 894 Register arg_1, 895 bool check_exceptions) { 896 pass_arg1(this, arg_1); 897 call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions); 898 } 899 900 void MacroAssembler::call_VM(Register oop_result, 901 Register last_java_sp, 902 address entry_point, 903 Register arg_1, 904 Register arg_2, 905 bool check_exceptions) { 906 907 assert(arg_1 != c_rarg2, "smashed arg"); 908 pass_arg2(this, arg_2); 909 pass_arg1(this, arg_1); 910 call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions); 911 } 912 913 void MacroAssembler::call_VM(Register oop_result, 914 Register last_java_sp, 915 address entry_point, 916 Register arg_1, 917 Register arg_2, 918 Register arg_3, 919 bool check_exceptions) { 920 assert(arg_1 != c_rarg3, "smashed arg"); 921 assert(arg_2 != c_rarg3, "smashed arg"); 922 pass_arg3(this, arg_3); 923 assert(arg_1 != c_rarg2, "smashed arg"); 924 pass_arg2(this, arg_2); 925 pass_arg1(this, arg_1); 926 call_VM(oop_result, 
last_java_sp, entry_point, 3, check_exceptions); 927 } 928 929 930 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) { 931 ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset())); 932 str(zr, Address(java_thread, JavaThread::vm_result_offset())); 933 verify_oop(oop_result, "broken oop in call_VM_base"); 934 } 935 936 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) { 937 ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset())); 938 str(zr, Address(java_thread, JavaThread::vm_result_2_offset())); 939 } 940 941 void MacroAssembler::align(int modulus) { 942 while (offset() % modulus != 0) nop(); 943 } 944 945 // these are no-ops overridden by InterpreterMacroAssembler 946 947 void MacroAssembler::check_and_handle_earlyret(Register java_thread) { } 948 949 void MacroAssembler::check_and_handle_popframe(Register java_thread) { } 950 951 952 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr, 953 Register tmp, 954 int offset) { 955 intptr_t value = *delayed_value_addr; 956 if (value != 0) 957 return RegisterOrConstant(value + offset); 958 959 // load indirectly to solve generation ordering problem 960 ldr(tmp, ExternalAddress((address) delayed_value_addr)); 961 962 if (offset != 0) 963 add(tmp, tmp, offset); 964 965 return RegisterOrConstant(tmp); 966 } 967 968 969 void MacroAssembler:: notify(int type) { 970 if (type == bytecode_start) { 971 // set_last_Java_frame(esp, rfp, (address)NULL); 972 Assembler:: notify(type); 973 // reset_last_Java_frame(true); 974 } 975 else 976 Assembler:: notify(type); 977 } 978 979 // Look up the method for a megamorphic invokeinterface call. 980 // The target method is determined by <intf_klass, itable_index>. 981 // The receiver klass is in recv_klass. 982 // On success, the result will be in method_result, and execution falls through. 983 // On failure, execution transfers to the given label. 
984 void MacroAssembler::lookup_interface_method(Register recv_klass, 985 Register intf_klass, 986 RegisterOrConstant itable_index, 987 Register method_result, 988 Register scan_temp, 989 Label& L_no_such_interface, 990 bool return_method) { 991 assert_different_registers(recv_klass, intf_klass, scan_temp); 992 assert_different_registers(method_result, intf_klass, scan_temp); 993 assert(recv_klass != method_result || !return_method, 994 "recv_klass can be destroyed when method isn't needed"); 995 assert(itable_index.is_constant() || itable_index.as_register() == method_result, 996 "caller must use same register for non-constant itable index as for method"); 997 998 // Compute start of first itableOffsetEntry (which is at the end of the vtable) 999 int vtable_base = in_bytes(Klass::vtable_start_offset()); 1000 int itentry_off = itableMethodEntry::method_offset_in_bytes(); 1001 int scan_step = itableOffsetEntry::size() * wordSize; 1002 int vte_size = vtableEntry::size_in_bytes(); 1003 assert(vte_size == wordSize, "else adjust times_vte_scale"); 1004 1005 ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset())); 1006 1007 // %%% Could store the aligned, prescaled offset in the klassoop. 1008 // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base)); 1009 lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3))); 1010 add(scan_temp, scan_temp, vtable_base); 1011 1012 if (return_method) { 1013 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 
1014 assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 1015 // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off)); 1016 lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3))); 1017 if (itentry_off) 1018 add(recv_klass, recv_klass, itentry_off); 1019 } 1020 1021 // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) { 1022 // if (scan->interface() == intf) { 1023 // result = (klass + scan->offset() + itable_index); 1024 // } 1025 // } 1026 Label search, found_method; 1027 1028 for (int peel = 1; peel >= 0; peel--) { 1029 ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes())); 1030 cmp(intf_klass, method_result); 1031 1032 if (peel) { 1033 br(Assembler::EQ, found_method); 1034 } else { 1035 br(Assembler::NE, search); 1036 // (invert the test to fall through to found_method...) 1037 } 1038 1039 if (!peel) break; 1040 1041 bind(search); 1042 1043 // Check that the previous entry is non-null. A null entry means that 1044 // the receiver class doesn't implement the interface, and wasn't the 1045 // same as when the caller was compiled. 1046 cbz(method_result, L_no_such_interface); 1047 add(scan_temp, scan_temp, scan_step); 1048 } 1049 1050 bind(found_method); 1051 1052 // Got a hit. 
1053 if (return_method) { 1054 ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes())); 1055 ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0))); 1056 } 1057 } 1058 1059 // virtual method calling 1060 void MacroAssembler::lookup_virtual_method(Register recv_klass, 1061 RegisterOrConstant vtable_index, 1062 Register method_result) { 1063 const int base = in_bytes(Klass::vtable_start_offset()); 1064 assert(vtableEntry::size() * wordSize == 8, 1065 "adjust the scaling in the code below"); 1066 int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes(); 1067 1068 if (vtable_index.is_register()) { 1069 lea(method_result, Address(recv_klass, 1070 vtable_index.as_register(), 1071 Address::lsl(LogBytesPerWord))); 1072 ldr(method_result, Address(method_result, vtable_offset_in_bytes)); 1073 } else { 1074 vtable_offset_in_bytes += vtable_index.as_constant() * wordSize; 1075 ldr(method_result, 1076 form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0)); 1077 } 1078 } 1079 1080 void MacroAssembler::check_klass_subtype(Register sub_klass, 1081 Register super_klass, 1082 Register temp_reg, 1083 Label& L_success) { 1084 Label L_failure; 1085 check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL); 1086 check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL); 1087 bind(L_failure); 1088 } 1089 1090 1091 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 1092 Register super_klass, 1093 Register temp_reg, 1094 Label* L_success, 1095 Label* L_failure, 1096 Label* L_slow_path, 1097 RegisterOrConstant super_check_offset) { 1098 assert_different_registers(sub_klass, super_klass, temp_reg); 1099 bool must_load_sco = (super_check_offset.constant_or_zero() == -1); 1100 if (super_check_offset.is_register()) { 1101 assert_different_registers(sub_klass, super_klass, 1102 super_check_offset.as_register()); 1103 } else if (must_load_sco) { 
1104 assert(temp_reg != noreg, "supply either a temp or a register offset"); 1105 } 1106 1107 Label L_fallthrough; 1108 int label_nulls = 0; 1109 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 1110 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 1111 if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; } 1112 assert(label_nulls <= 1, "at most one NULL in the batch"); 1113 1114 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1115 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1116 Address super_check_offset_addr(super_klass, sco_offset); 1117 1118 // Hacked jmp, which may only be used just before L_fallthrough. 1119 #define final_jmp(label) \ 1120 if (&(label) == &L_fallthrough) { /*do nothing*/ } \ 1121 else b(label) /*omit semi*/ 1122 1123 // If the pointers are equal, we are done (e.g., String[] elements). 1124 // This self-check enables sharing of secondary supertype arrays among 1125 // non-primary types such as array-of-interface. Otherwise, each such 1126 // type would need its own customized SSA. 1127 // We move this check to the front of the fast path because many 1128 // type checks are in fact trivially successful in this manner, 1129 // so we get a nicely predicted branch right at the start of the check. 1130 cmp(sub_klass, super_klass); 1131 br(Assembler::EQ, *L_success); 1132 1133 // Check the supertype display: 1134 if (must_load_sco) { 1135 ldrw(temp_reg, super_check_offset_addr); 1136 super_check_offset = RegisterOrConstant(temp_reg); 1137 } 1138 Address super_check_addr(sub_klass, super_check_offset); 1139 ldr(rscratch1, super_check_addr); 1140 cmp(super_klass, rscratch1); // load displayed supertype 1141 1142 // This check has worked decisively for primary supers. 1143 // Secondary supers are sought in the super_cache ('super_cache_addr'). 1144 // (Secondary supers are interfaces and very deeply nested subtypes.) 
1145 // This works in the same check above because of a tricky aliasing 1146 // between the super_cache and the primary super display elements. 1147 // (The 'super_check_addr' can address either, as the case requires.) 1148 // Note that the cache is updated below if it does not help us find 1149 // what we need immediately. 1150 // So if it was a primary super, we can just fail immediately. 1151 // Otherwise, it's the slow path for us (no success at this point). 1152 1153 if (super_check_offset.is_register()) { 1154 br(Assembler::EQ, *L_success); 1155 subs(zr, super_check_offset.as_register(), sc_offset); 1156 if (L_failure == &L_fallthrough) { 1157 br(Assembler::EQ, *L_slow_path); 1158 } else { 1159 br(Assembler::NE, *L_failure); 1160 final_jmp(*L_slow_path); 1161 } 1162 } else if (super_check_offset.as_constant() == sc_offset) { 1163 // Need a slow path; fast failure is impossible. 1164 if (L_slow_path == &L_fallthrough) { 1165 br(Assembler::EQ, *L_success); 1166 } else { 1167 br(Assembler::NE, *L_slow_path); 1168 final_jmp(*L_success); 1169 } 1170 } else { 1171 // No slow path; it's a fast decision. 
1172 if (L_failure == &L_fallthrough) { 1173 br(Assembler::EQ, *L_success); 1174 } else { 1175 br(Assembler::NE, *L_failure); 1176 final_jmp(*L_success); 1177 } 1178 } 1179 1180 bind(L_fallthrough); 1181 1182 #undef final_jmp 1183 } 1184 1185 // These two are taken from x86, but they look generally useful 1186 1187 // scans count pointer sized words at [addr] for occurence of value, 1188 // generic 1189 void MacroAssembler::repne_scan(Register addr, Register value, Register count, 1190 Register scratch) { 1191 Label Lloop, Lexit; 1192 cbz(count, Lexit); 1193 bind(Lloop); 1194 ldr(scratch, post(addr, wordSize)); 1195 cmp(value, scratch); 1196 br(EQ, Lexit); 1197 sub(count, count, 1); 1198 cbnz(count, Lloop); 1199 bind(Lexit); 1200 } 1201 1202 // scans count 4 byte words at [addr] for occurence of value, 1203 // generic 1204 void MacroAssembler::repne_scanw(Register addr, Register value, Register count, 1205 Register scratch) { 1206 Label Lloop, Lexit; 1207 cbz(count, Lexit); 1208 bind(Lloop); 1209 ldrw(scratch, post(addr, wordSize)); 1210 cmpw(value, scratch); 1211 br(EQ, Lexit); 1212 sub(count, count, 1); 1213 cbnz(count, Lloop); 1214 bind(Lexit); 1215 } 1216 1217 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 1218 Register super_klass, 1219 Register temp_reg, 1220 Register temp2_reg, 1221 Label* L_success, 1222 Label* L_failure, 1223 bool set_cond_codes) { 1224 assert_different_registers(sub_klass, super_klass, temp_reg); 1225 if (temp2_reg != noreg) 1226 assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1); 1227 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg) 1228 1229 Label L_fallthrough; 1230 int label_nulls = 0; 1231 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 1232 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 1233 assert(label_nulls <= 1, "at most one NULL in the batch"); 1234 1235 // a couple of useful fields in sub_klass: 1236 int 
ss_offset = in_bytes(Klass::secondary_supers_offset()); 1237 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1238 Address secondary_supers_addr(sub_klass, ss_offset); 1239 Address super_cache_addr( sub_klass, sc_offset); 1240 1241 BLOCK_COMMENT("check_klass_subtype_slow_path"); 1242 1243 // Do a linear scan of the secondary super-klass chain. 1244 // This code is rarely used, so simplicity is a virtue here. 1245 // The repne_scan instruction uses fixed registers, which we must spill. 1246 // Don't worry too much about pre-existing connections with the input regs. 1247 1248 assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super) 1249 assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter) 1250 1251 RegSet pushed_registers; 1252 if (!IS_A_TEMP(r2)) pushed_registers += r2; 1253 if (!IS_A_TEMP(r5)) pushed_registers += r5; 1254 1255 if (super_klass != r0 || UseCompressedOops) { 1256 if (!IS_A_TEMP(r0)) pushed_registers += r0; 1257 } 1258 1259 push(pushed_registers, sp); 1260 1261 // Get super_klass value into r0 (even if it was in r5 or r2). 1262 if (super_klass != r0) { 1263 mov(r0, super_klass); 1264 } 1265 1266 #ifndef PRODUCT 1267 mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr); 1268 Address pst_counter_addr(rscratch2); 1269 ldr(rscratch1, pst_counter_addr); 1270 add(rscratch1, rscratch1, 1); 1271 str(rscratch1, pst_counter_addr); 1272 #endif //PRODUCT 1273 1274 // We will consult the secondary-super array. 1275 ldr(r5, secondary_supers_addr); 1276 // Load the array length. 1277 ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes())); 1278 // Skip to start of data. 1279 add(r5, r5, Array<Klass*>::base_offset_in_bytes()); 1280 1281 cmp(sp, zr); // Clear Z flag; SP is never zero 1282 // Scan R2 words at [R5] for an occurrence of R0. 1283 // Set NZ/Z based on last compare. 1284 repne_scan(r5, r0, r2, rscratch1); 1285 1286 // Unspill the temp. 
registers: 1287 pop(pushed_registers, sp); 1288 1289 br(Assembler::NE, *L_failure); 1290 1291 // Success. Cache the super we found and proceed in triumph. 1292 str(super_klass, super_cache_addr); 1293 1294 if (L_success != &L_fallthrough) { 1295 b(*L_success); 1296 } 1297 1298 #undef IS_A_TEMP 1299 1300 bind(L_fallthrough); 1301 } 1302 1303 1304 void MacroAssembler::verify_oop(Register reg, const char* s) { 1305 if (!VerifyOops) return; 1306 1307 // Pass register number to verify_oop_subroutine 1308 const char* b = NULL; 1309 { 1310 ResourceMark rm; 1311 stringStream ss; 1312 ss.print("verify_oop: %s: %s", reg->name(), s); 1313 b = code_string(ss.as_string()); 1314 } 1315 BLOCK_COMMENT("verify_oop {"); 1316 1317 stp(r0, rscratch1, Address(pre(sp, -2 * wordSize))); 1318 stp(rscratch2, lr, Address(pre(sp, -2 * wordSize))); 1319 1320 mov(r0, reg); 1321 mov(rscratch1, (address)b); 1322 1323 // call indirectly to solve generation ordering problem 1324 lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address())); 1325 ldr(rscratch2, Address(rscratch2)); 1326 blr(rscratch2); 1327 1328 ldp(rscratch2, lr, Address(post(sp, 2 * wordSize))); 1329 ldp(r0, rscratch1, Address(post(sp, 2 * wordSize))); 1330 1331 BLOCK_COMMENT("} verify_oop"); 1332 } 1333 1334 void MacroAssembler::verify_oop_addr(Address addr, const char* s) { 1335 if (!VerifyOops) return; 1336 1337 const char* b = NULL; 1338 { 1339 ResourceMark rm; 1340 stringStream ss; 1341 ss.print("verify_oop_addr: %s", s); 1342 b = code_string(ss.as_string()); 1343 } 1344 BLOCK_COMMENT("verify_oop_addr {"); 1345 1346 stp(r0, rscratch1, Address(pre(sp, -2 * wordSize))); 1347 stp(rscratch2, lr, Address(pre(sp, -2 * wordSize))); 1348 1349 // addr may contain sp so we will have to adjust it based on the 1350 // pushes that we just did. 
1351 if (addr.uses(sp)) { 1352 lea(r0, addr); 1353 ldr(r0, Address(r0, 4 * wordSize)); 1354 } else { 1355 ldr(r0, addr); 1356 } 1357 mov(rscratch1, (address)b); 1358 1359 // call indirectly to solve generation ordering problem 1360 lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address())); 1361 ldr(rscratch2, Address(rscratch2)); 1362 blr(rscratch2); 1363 1364 ldp(rscratch2, lr, Address(post(sp, 2 * wordSize))); 1365 ldp(r0, rscratch1, Address(post(sp, 2 * wordSize))); 1366 1367 BLOCK_COMMENT("} verify_oop_addr"); 1368 } 1369 1370 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot, 1371 int extra_slot_offset) { 1372 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 1373 int stackElementSize = Interpreter::stackElementSize; 1374 int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0); 1375 #ifdef ASSERT 1376 int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1); 1377 assert(offset1 - offset == stackElementSize, "correct arithmetic"); 1378 #endif 1379 if (arg_slot.is_constant()) { 1380 return Address(esp, arg_slot.as_constant() * stackElementSize 1381 + offset); 1382 } else { 1383 add(rscratch1, esp, arg_slot.as_register(), 1384 ext::uxtx, exact_log2(stackElementSize)); 1385 return Address(rscratch1, offset); 1386 } 1387 } 1388 1389 void MacroAssembler::call_VM_leaf_base(address entry_point, 1390 int number_of_arguments, 1391 Label *retaddr) { 1392 call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr); 1393 } 1394 1395 void MacroAssembler::call_VM_leaf_base1(address entry_point, 1396 int number_of_gp_arguments, 1397 int number_of_fp_arguments, 1398 ret_type type, 1399 Label *retaddr) { 1400 Label E, L; 1401 1402 stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize))); 1403 1404 // We add 1 to number_of_arguments because the thread in arg0 is 1405 // not counted 1406 mov(rscratch1, entry_point); 1407 blrt(rscratch1, number_of_gp_arguments + 1, 
number_of_fp_arguments, type); 1408 if (retaddr) 1409 bind(*retaddr); 1410 1411 ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize))); 1412 maybe_isb(); 1413 } 1414 1415 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) { 1416 call_VM_leaf_base(entry_point, number_of_arguments); 1417 } 1418 1419 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) { 1420 pass_arg0(this, arg_0); 1421 call_VM_leaf_base(entry_point, 1); 1422 } 1423 1424 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) { 1425 pass_arg0(this, arg_0); 1426 pass_arg1(this, arg_1); 1427 call_VM_leaf_base(entry_point, 2); 1428 } 1429 1430 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, 1431 Register arg_1, Register arg_2) { 1432 pass_arg0(this, arg_0); 1433 pass_arg1(this, arg_1); 1434 pass_arg2(this, arg_2); 1435 call_VM_leaf_base(entry_point, 3); 1436 } 1437 1438 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) { 1439 pass_arg0(this, arg_0); 1440 MacroAssembler::call_VM_leaf_base(entry_point, 1); 1441 } 1442 1443 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) { 1444 1445 assert(arg_0 != c_rarg1, "smashed arg"); 1446 pass_arg1(this, arg_1); 1447 pass_arg0(this, arg_0); 1448 MacroAssembler::call_VM_leaf_base(entry_point, 2); 1449 } 1450 1451 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) { 1452 assert(arg_0 != c_rarg2, "smashed arg"); 1453 assert(arg_1 != c_rarg2, "smashed arg"); 1454 pass_arg2(this, arg_2); 1455 assert(arg_0 != c_rarg1, "smashed arg"); 1456 pass_arg1(this, arg_1); 1457 pass_arg0(this, arg_0); 1458 MacroAssembler::call_VM_leaf_base(entry_point, 3); 1459 } 1460 1461 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) { 1462 assert(arg_0 != c_rarg3, "smashed arg"); 1463 
assert(arg_1 != c_rarg3, "smashed arg"); 1464 assert(arg_2 != c_rarg3, "smashed arg"); 1465 pass_arg3(this, arg_3); 1466 assert(arg_0 != c_rarg2, "smashed arg"); 1467 assert(arg_1 != c_rarg2, "smashed arg"); 1468 pass_arg2(this, arg_2); 1469 assert(arg_0 != c_rarg1, "smashed arg"); 1470 pass_arg1(this, arg_1); 1471 pass_arg0(this, arg_0); 1472 MacroAssembler::call_VM_leaf_base(entry_point, 4); 1473 } 1474 1475 void MacroAssembler::null_check(Register reg, int offset) { 1476 if (needs_explicit_null_check(offset)) { 1477 // provoke OS NULL exception if reg = NULL by 1478 // accessing M[reg] w/o changing any registers 1479 // NOTE: this is plenty to provoke a segv 1480 ldr(zr, Address(reg)); 1481 } else { 1482 // nothing to do, (later) access of M[reg + offset] 1483 // will provoke OS NULL exception if reg = NULL 1484 } 1485 } 1486 1487 // MacroAssembler protected routines needed to implement 1488 // public methods 1489 1490 void MacroAssembler::mov(Register r, Address dest) { 1491 code_section()->relocate(pc(), dest.rspec()); 1492 u_int64_t imm64 = (u_int64_t)dest.target(); 1493 movptr(r, imm64); 1494 } 1495 1496 // Move a constant pointer into r. In AArch64 mode the virtual 1497 // address space is 48 bits in size, so we only need three 1498 // instructions to create a patchable instruction sequence that can 1499 // reach anywhere. 1500 void MacroAssembler::movptr(Register r, uintptr_t imm64) { 1501 #ifndef PRODUCT 1502 { 1503 char buffer[64]; 1504 snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64); 1505 block_comment(buffer); 1506 } 1507 #endif 1508 assert(imm64 < (1ul << 48), "48-bit overflow in address constant"); 1509 movz(r, imm64 & 0xffff); 1510 imm64 >>= 16; 1511 movk(r, imm64 & 0xffff, 16); 1512 imm64 >>= 16; 1513 movk(r, imm64 & 0xffff, 32); 1514 } 1515 1516 // Macro to mov replicated immediate to vector register. 
1517 // Vd will get the following values for different arrangements in T 1518 // imm32 == hex 000000gh T8B: Vd = ghghghghghghghgh 1519 // imm32 == hex 000000gh T16B: Vd = ghghghghghghghghghghghghghghghgh 1520 // imm32 == hex 0000efgh T4H: Vd = efghefghefghefgh 1521 // imm32 == hex 0000efgh T8H: Vd = efghefghefghefghefghefghefghefgh 1522 // imm32 == hex abcdefgh T2S: Vd = abcdefghabcdefgh 1523 // imm32 == hex abcdefgh T4S: Vd = abcdefghabcdefghabcdefghabcdefgh 1524 // T1D/T2D: invalid 1525 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) { 1526 assert(T != T1D && T != T2D, "invalid arrangement"); 1527 if (T == T8B || T == T16B) { 1528 assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)"); 1529 movi(Vd, T, imm32 & 0xff, 0); 1530 return; 1531 } 1532 u_int32_t nimm32 = ~imm32; 1533 if (T == T4H || T == T8H) { 1534 assert((imm32 & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)"); 1535 imm32 &= 0xffff; 1536 nimm32 &= 0xffff; 1537 } 1538 u_int32_t x = imm32; 1539 int movi_cnt = 0; 1540 int movn_cnt = 0; 1541 while (x) { if (x & 0xff) movi_cnt++; x >>= 8; } 1542 x = nimm32; 1543 while (x) { if (x & 0xff) movn_cnt++; x >>= 8; } 1544 if (movn_cnt < movi_cnt) imm32 = nimm32; 1545 unsigned lsl = 0; 1546 while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; } 1547 if (movn_cnt < movi_cnt) 1548 mvni(Vd, T, imm32 & 0xff, lsl); 1549 else 1550 movi(Vd, T, imm32 & 0xff, lsl); 1551 imm32 >>= 8; lsl += 8; 1552 while (imm32) { 1553 while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; } 1554 if (movn_cnt < movi_cnt) 1555 bici(Vd, T, imm32 & 0xff, lsl); 1556 else 1557 orri(Vd, T, imm32 & 0xff, lsl); 1558 lsl += 8; imm32 >>= 8; 1559 } 1560 } 1561 1562 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64) 1563 { 1564 #ifndef PRODUCT 1565 { 1566 char buffer[64]; 1567 snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64); 1568 block_comment(buffer); 1569 } 1570 #endif 1571 if 
(operand_valid_for_logical_immediate(false, imm64)) { 1572 orr(dst, zr, imm64); 1573 } else { 1574 // we can use a combination of MOVZ or MOVN with 1575 // MOVK to build up the constant 1576 u_int64_t imm_h[4]; 1577 int zero_count = 0; 1578 int neg_count = 0; 1579 int i; 1580 for (i = 0; i < 4; i++) { 1581 imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL); 1582 if (imm_h[i] == 0) { 1583 zero_count++; 1584 } else if (imm_h[i] == 0xffffL) { 1585 neg_count++; 1586 } 1587 } 1588 if (zero_count == 4) { 1589 // one MOVZ will do 1590 movz(dst, 0); 1591 } else if (neg_count == 4) { 1592 // one MOVN will do 1593 movn(dst, 0); 1594 } else if (zero_count == 3) { 1595 for (i = 0; i < 4; i++) { 1596 if (imm_h[i] != 0L) { 1597 movz(dst, (u_int32_t)imm_h[i], (i << 4)); 1598 break; 1599 } 1600 } 1601 } else if (neg_count == 3) { 1602 // one MOVN will do 1603 for (int i = 0; i < 4; i++) { 1604 if (imm_h[i] != 0xffffL) { 1605 movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4)); 1606 break; 1607 } 1608 } 1609 } else if (zero_count == 2) { 1610 // one MOVZ and one MOVK will do 1611 for (i = 0; i < 3; i++) { 1612 if (imm_h[i] != 0L) { 1613 movz(dst, (u_int32_t)imm_h[i], (i << 4)); 1614 i++; 1615 break; 1616 } 1617 } 1618 for (;i < 4; i++) { 1619 if (imm_h[i] != 0L) { 1620 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1621 } 1622 } 1623 } else if (neg_count == 2) { 1624 // one MOVN and one MOVK will do 1625 for (i = 0; i < 4; i++) { 1626 if (imm_h[i] != 0xffffL) { 1627 movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4)); 1628 i++; 1629 break; 1630 } 1631 } 1632 for (;i < 4; i++) { 1633 if (imm_h[i] != 0xffffL) { 1634 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1635 } 1636 } 1637 } else if (zero_count == 1) { 1638 // one MOVZ and two MOVKs will do 1639 for (i = 0; i < 4; i++) { 1640 if (imm_h[i] != 0L) { 1641 movz(dst, (u_int32_t)imm_h[i], (i << 4)); 1642 i++; 1643 break; 1644 } 1645 } 1646 for (;i < 4; i++) { 1647 if (imm_h[i] != 0x0L) { 1648 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1649 } 1650 } 
1651 } else if (neg_count == 1) { 1652 // one MOVN and two MOVKs will do 1653 for (i = 0; i < 4; i++) { 1654 if (imm_h[i] != 0xffffL) { 1655 movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4)); 1656 i++; 1657 break; 1658 } 1659 } 1660 for (;i < 4; i++) { 1661 if (imm_h[i] != 0xffffL) { 1662 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1663 } 1664 } 1665 } else { 1666 // use a MOVZ and 3 MOVKs (makes it easier to debug) 1667 movz(dst, (u_int32_t)imm_h[0], 0); 1668 for (i = 1; i < 4; i++) { 1669 movk(dst, (u_int32_t)imm_h[i], (i << 4)); 1670 } 1671 } 1672 } 1673 } 1674 1675 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32) 1676 { 1677 #ifndef PRODUCT 1678 { 1679 char buffer[64]; 1680 snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32); 1681 block_comment(buffer); 1682 } 1683 #endif 1684 if (operand_valid_for_logical_immediate(true, imm32)) { 1685 orrw(dst, zr, imm32); 1686 } else { 1687 // we can use MOVZ, MOVN or two calls to MOVK to build up the 1688 // constant 1689 u_int32_t imm_h[2]; 1690 imm_h[0] = imm32 & 0xffff; 1691 imm_h[1] = ((imm32 >> 16) & 0xffff); 1692 if (imm_h[0] == 0) { 1693 movzw(dst, imm_h[1], 16); 1694 } else if (imm_h[0] == 0xffff) { 1695 movnw(dst, imm_h[1] ^ 0xffff, 16); 1696 } else if (imm_h[1] == 0) { 1697 movzw(dst, imm_h[0], 0); 1698 } else if (imm_h[1] == 0xffff) { 1699 movnw(dst, imm_h[0] ^ 0xffff, 0); 1700 } else { 1701 // use a MOVZ and MOVK (makes it easier to debug) 1702 movzw(dst, imm_h[0], 0); 1703 movkw(dst, imm_h[1], 16); 1704 } 1705 } 1706 } 1707 1708 // Form an address from base + offset in Rd. Rd may or may 1709 // not actually be used: you must use the Address that is returned. 1710 // It is up to you to ensure that the shift provided matches the size 1711 // of your data. 
Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
  if (Address::offset_ok_for_immed(byte_offset, shift))
    // It fits; no need for any heroics
    return Address(base, byte_offset);

  // Don't do anything clever with negative or misaligned offsets
  unsigned mask = (1 << shift) - 1;
  if (byte_offset < 0 || byte_offset & mask) {
    mov(Rd, byte_offset);
    add(Rd, base, Rd);
    return Address(Rd);
  }

  // See if we can do this with two 12-bit offsets
  {
    unsigned long word_offset = byte_offset >> shift;
    unsigned long masked_offset = word_offset & 0xfff000;
    if (Address::offset_ok_for_immed(word_offset - masked_offset)
        && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
      // Fold the high part into the base with an ADD; the low part
      // becomes an immediate offset on the returned Address.
      add(Rd, base, masked_offset << shift);
      word_offset -= masked_offset;
      return Address(Rd, word_offset << shift);
    }
  }

  // Do it the hard way
  mov(Rd, byte_offset);
  add(Rd, base, Rd);
  return Address(Rd);
}

// Atomically add 1 to the 32-bit counter at [counter_addr].  Uses LSE
// LDADD when available, otherwise an LDXR/STXR retry loop.  tmp and
// tmp2 are scratch; the old counter value is discarded.
void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
  if (UseLSE) {
    mov(tmp, 1);
    ldadd(Assembler::word, tmp, zr, counter_addr);
    return;
  }
  Label retry_load;
  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
    prfm(Address(counter_addr), PSTL1STRM);
  bind(retry_load);
  // flush and load exclusive from the memory location
  ldxrw(tmp, counter_addr);
  addw(tmp, tmp, 1);
  // if we store+flush with no intervening write tmp will be zero
  stxrw(tmp2, tmp, counter_addr);
  cbnzw(tmp2, retry_load);
}


int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
                                    bool want_remainder, Register scratch)
{
  // Full implementation of Java idiv and irem.  The function
  // returns the (pc) offset of the div instruction - may be needed
  // for implicit exceptions.
  //
  // constraint : ra/rb =/= scratch
  //         normal case
  //
  // input : ra: dividend
  //         rb: divisor
  //
  // result: either
  //         quotient  (= ra idiv rb)
  //         remainder (= ra irem rb)

  assert(ra != scratch && rb != scratch, "reg cannot be scratch");

  int idivl_offset = offset();
  if (! want_remainder) {
    sdivw(result, ra, rb);
  } else {
    // remainder = dividend - (quotient * divisor)
    sdivw(scratch, ra, rb);
    Assembler::msubw(result, scratch, rb, ra);
  }

  return idivl_offset;
}

int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
                                    bool want_remainder, Register scratch)
{
  // Full implementation of Java ldiv and lrem.  The function
  // returns the (pc) offset of the div instruction - may be needed
  // for implicit exceptions.
  //
  // constraint : ra/rb =/= scratch
  //         normal case
  //
  // input : ra: dividend
  //         rb: divisor
  //
  // result: either
  //         quotient  (= ra idiv rb)
  //         remainder (= ra irem rb)

  assert(ra != scratch && rb != scratch, "reg cannot be scratch");

  int idivq_offset = offset();
  if (! want_remainder) {
    sdiv(result, ra, rb);
  } else {
    // remainder = dividend - (quotient * divisor)
    sdiv(scratch, ra, rb);
    Assembler::msub(result, scratch, rb, ra);
  }

  return idivq_offset;
}

// Emit a memory barrier, merging it with an immediately preceding
// barrier when possible (tracked through code()->last_insn()).
void MacroAssembler::membar(Membar_mask_bits order_constraint) {
  address prev = pc() - NativeMembar::instruction_size;
  address last = code()->last_insn();
  if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
    NativeMembar *bar = NativeMembar_at(prev);
    // We are merging two memory barrier instructions.  On AArch64 we
    // can do this simply by ORing them together.
    bar->set_kind(bar->get_kind() | order_constraint);
    BLOCK_COMMENT("merged membar");
  } else {
    code()->set_last_insn(pc());
    dmb(Assembler::barrier(order_constraint));
  }
}

// Attempt to merge the current load/store with the previous one into a
// single ldp/stp.  Returns true (and emits the pair, consuming the
// last-insn marker) on success; on failure, records this instruction as
// a merge candidate for the next call when its addressing mode allows.
bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
  if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
    merge_ldst(rt, adr, size_in_bytes, is_store);
    code()->clear_last_insn();
    return true;
  } else {
    assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported.");
    const unsigned mask = size_in_bytes - 1;
    if (adr.getMode() == Address::base_plus_offset &&
        (adr.offset() & mask) == 0) { // only supports base_plus_offset.
      code()->set_last_insn(pc());
    }
    return false;
  }
}

void MacroAssembler::ldr(Register Rx, const Address &adr) {
  // We always try to merge two adjacent loads into one ldp.
  if (!try_merge_ldst(Rx, adr, 8, false)) {
    Assembler::ldr(Rx, adr);
  }
}

void MacroAssembler::ldrw(Register Rw, const Address &adr) {
  // We always try to merge two adjacent loads into one ldp.
  if (!try_merge_ldst(Rw, adr, 4, false)) {
    Assembler::ldrw(Rw, adr);
  }
}

void MacroAssembler::str(Register Rx, const Address &adr) {
  // We always try to merge two adjacent stores into one stp.
  if (!try_merge_ldst(Rx, adr, 8, true)) {
    Assembler::str(Rx, adr);
  }
}

void MacroAssembler::strw(Register Rw, const Address &adr) {
  // We always try to merge two adjacent stores into one stp.
  if (!try_merge_ldst(Rw, adr, 4, true)) {
    Assembler::strw(Rw, adr);
  }
}

// MacroAssembler routines found actually to be needed

// Push one register onto the expression stack (esp), pre-decrementing.
void MacroAssembler::push(Register src)
{
  str(src, Address(pre(esp, -1 * wordSize)));
}

// Pop one register from the expression stack (esp), post-incrementing.
void MacroAssembler::pop(Register dst)
{
  ldr(dst, Address(post(esp, 1 * wordSize)));
}

// Note: load_unsigned_short used to be called load_unsigned_word.
// Each loader returns the code offset of the load instruction so the
// caller can register it for implicit null-check handling.
int MacroAssembler::load_unsigned_short(Register dst, Address src) {
  int off = offset();
  ldrh(dst, src);
  return off;
}

int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
  int off = offset();
  ldrb(dst, src);
  return off;
}

int MacroAssembler::load_signed_short(Register dst, Address src) {
  int off = offset();
  ldrsh(dst, src);
  return off;
}

int MacroAssembler::load_signed_byte(Register dst, Address src) {
  int off = offset();
  ldrsb(dst, src);
  return off;
}

// 32-bit-destination variants: sign-extend only to 32 bits.
int MacroAssembler::load_signed_short32(Register dst, Address src) {
  int off = offset();
  ldrshw(dst, src);
  return off;
}

int MacroAssembler::load_signed_byte32(Register dst, Address src) {
  int off = offset();
  ldrsbw(dst, src);
  return off;
}

// Load a value of the given byte size, sign- or zero-extending.
// dst2 is unused on AArch64 (present for cross-platform signature parity).
void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
  switch (size_in_bytes) {
  case  8:  ldr(dst, src); break;
  case  4:  ldrw(dst, src); break;
  case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
  case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
  default:  ShouldNotReachHere();
  }
}

// Store a value of the given byte size.  src2 is unused on AArch64.
void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
  switch (size_in_bytes) {
  case  8:  str(src, dst); break;
  case  4:  strw(src, dst); break;
  case  2:  strh(src, dst); break;
  case  1:  strb(src, dst); break;
  default:  ShouldNotReachHere();
  }
}

// reg -= value (32-bit).  Values too big for an immediate go through
// rscratch2, so reg must not be rscratch2 in that case.
void MacroAssembler::decrementw(Register reg, int value)
{
  if (value < 0)  { incrementw(reg, -value);      return; }
  if (value == 0) {                               return; }
  if (value < (1 << 12)) { subw(reg, reg, value); return; }
  /* else */ {
    guarantee(reg != rscratch2, "invalid dst for register decrement");
    movw(rscratch2, (unsigned)value);
    subw(reg, reg, rscratch2);
  }
}

// reg -= value (64-bit).
void MacroAssembler::decrement(Register reg, int value)
{
  if (value < 0)  { increment(reg, -value);      return; }
  if (value == 0) {                              return; }
  if (value < (1 << 12)) { sub(reg, reg, value); return; }
  /* else */ {
    assert(reg != rscratch2, "invalid dst for register decrement");
    mov(rscratch2, (unsigned long)value);
    sub(reg, reg, rscratch2);
  }
}

// In-memory 32-bit decrement: load, adjust, store.  Clobbers rscratch1
// (and rscratch2 for literal addresses).
void MacroAssembler::decrementw(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid dst for address decrement");
  if (dst.getMode() == Address::literal) {
    assert(abs(value) < (1 << 12), "invalid value and address mode combination");
    lea(rscratch2, dst);
    dst = Address(rscratch2);
  }
  ldrw(rscratch1, dst);
  decrementw(rscratch1, value);
  strw(rscratch1, dst);
}

// In-memory 64-bit decrement: load, adjust, store.  Clobbers rscratch1
// (and rscratch2 for literal addresses).
void MacroAssembler::decrement(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid address for decrement");
  if (dst.getMode() == Address::literal) {
    assert(abs(value) < (1 << 12), "invalid value and address mode combination");
    lea(rscratch2, dst);
    dst = Address(rscratch2);
  }
  ldr(rscratch1, dst);
  decrement(rscratch1, value);
  str(rscratch1, dst);
}

// reg += value (32-bit).
void MacroAssembler::incrementw(Register reg, int value)
{
  if (value < 0)  { decrementw(reg, -value);      return; }
  if (value == 0) {                               return; }
  if (value < (1 << 12)) { addw(reg, reg, value); return; }
  /* else */ {
    assert(reg != rscratch2, "invalid dst for register increment");
    movw(rscratch2, (unsigned)value);
    addw(reg, reg, rscratch2);
  }
}

// reg += value (64-bit).  value is a non-negative int here, so the
// 32-bit movw (which zeroes the upper bits) is sufficient.
void MacroAssembler::increment(Register reg, int value)
{
  if (value < 0)  { decrement(reg, -value);      return; }
  if (value == 0) {                              return; }
  if (value < (1 << 12)) { add(reg, reg, value); return; }
  /* else */ {
    assert(reg != rscratch2, "invalid dst for register increment");
    movw(rscratch2, (unsigned)value);
    add(reg, reg, rscratch2);
  }
}

// In-memory 32-bit increment: load, adjust, store.  Clobbers rscratch1
// (and rscratch2 for literal addresses).
void MacroAssembler::incrementw(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid dst for address increment");
  if (dst.getMode() == Address::literal) {
    assert(abs(value) < (1 << 12), "invalid value and address mode combination");
    lea(rscratch2, dst);
    dst = Address(rscratch2);
  }
  ldrw(rscratch1, dst);
  incrementw(rscratch1, value);
  strw(rscratch1, dst);
}

// In-memory 64-bit increment: load, adjust, store.  Clobbers rscratch1
// (and rscratch2 for literal addresses).
void MacroAssembler::increment(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid dst for address increment");
  if (dst.getMode() == Address::literal) {
    assert(abs(value) < (1 << 12), "invalid value and address mode combination");
    lea(rscratch2, dst);
    dst = Address(rscratch2);
  }
  ldr(rscratch1, dst);
  increment(rscratch1, value);
  str(rscratch1, dst);
}


// Push/pop all general-purpose registers r0-r30 (bit mask 0x7fffffff).
void MacroAssembler::pusha() {
  push(0x7fffffff, sp);
}

void MacroAssembler::popa() {
  pop(0x7fffffff, sp);
}

// Push lots of registers in the bit set supplied.
Don't push sp. 2060 // Return the number of words pushed 2061 int MacroAssembler::push(unsigned int bitset, Register stack) { 2062 int words_pushed = 0; 2063 2064 // Scan bitset to accumulate register pairs 2065 unsigned char regs[32]; 2066 int count = 0; 2067 for (int reg = 0; reg <= 30; reg++) { 2068 if (1 & bitset) 2069 regs[count++] = reg; 2070 bitset >>= 1; 2071 } 2072 regs[count++] = zr->encoding_nocheck(); 2073 count &= ~1; // Only push an even nuber of regs 2074 2075 if (count) { 2076 stp(as_Register(regs[0]), as_Register(regs[1]), 2077 Address(pre(stack, -count * wordSize))); 2078 words_pushed += 2; 2079 } 2080 for (int i = 2; i < count; i += 2) { 2081 stp(as_Register(regs[i]), as_Register(regs[i+1]), 2082 Address(stack, i * wordSize)); 2083 words_pushed += 2; 2084 } 2085 2086 assert(words_pushed == count, "oops, pushed != count"); 2087 2088 return count; 2089 } 2090 2091 int MacroAssembler::pop(unsigned int bitset, Register stack) { 2092 int words_pushed = 0; 2093 2094 // Scan bitset to accumulate register pairs 2095 unsigned char regs[32]; 2096 int count = 0; 2097 for (int reg = 0; reg <= 30; reg++) { 2098 if (1 & bitset) 2099 regs[count++] = reg; 2100 bitset >>= 1; 2101 } 2102 regs[count++] = zr->encoding_nocheck(); 2103 count &= ~1; 2104 2105 for (int i = 2; i < count; i += 2) { 2106 ldp(as_Register(regs[i]), as_Register(regs[i+1]), 2107 Address(stack, i * wordSize)); 2108 words_pushed += 2; 2109 } 2110 if (count) { 2111 ldp(as_Register(regs[0]), as_Register(regs[1]), 2112 Address(post(stack, count * wordSize))); 2113 words_pushed += 2; 2114 } 2115 2116 assert(words_pushed == count, "oops, pushed != count"); 2117 2118 return count; 2119 } 2120 #ifdef ASSERT 2121 void MacroAssembler::verify_heapbase(const char* msg) { 2122 #if 0 2123 assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed"); 2124 assert (Universe::heap() != NULL, "java heap should be initialized"); 2125 if (CheckCompressedOops) { 2126 Label ok; 2127 push(1 << 
rscratch1->encoding(), sp); // cmpptr trashes rscratch1 2128 cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr())); 2129 br(Assembler::EQ, ok); 2130 stop(msg); 2131 bind(ok); 2132 pop(1 << rscratch1->encoding(), sp); 2133 } 2134 #endif 2135 } 2136 #endif 2137 2138 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) { 2139 Label done, not_weak; 2140 cbz(value, done); // Use NULL as-is. 2141 2142 STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u); 2143 tbz(r0, 0, not_weak); // Test for jweak tag. 2144 2145 // Resolve jweak. 2146 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value, 2147 Address(value, -JNIHandles::weak_tag_value), tmp, thread); 2148 verify_oop(value); 2149 b(done); 2150 2151 bind(not_weak); 2152 // Resolve (untagged) jobject. 2153 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread); 2154 verify_oop(value); 2155 bind(done); 2156 } 2157 2158 void MacroAssembler::stop(const char* msg) { 2159 address ip = pc(); 2160 pusha(); 2161 mov(c_rarg0, (address)msg); 2162 mov(c_rarg1, (address)ip); 2163 mov(c_rarg2, sp); 2164 mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 2165 // call(c_rarg3); 2166 blrt(c_rarg3, 3, 0, 1); 2167 hlt(0); 2168 } 2169 2170 void MacroAssembler::warn(const char* msg) { 2171 pusha(); 2172 mov(c_rarg0, (address)msg); 2173 mov(lr, CAST_FROM_FN_PTR(address, warning)); 2174 blrt(lr, 1, 0, MacroAssembler::ret_type_void); 2175 popa(); 2176 } 2177 2178 void MacroAssembler::unimplemented(const char* what) { 2179 const char* buf = NULL; 2180 { 2181 ResourceMark rm; 2182 stringStream ss; 2183 ss.print("unimplemented: %s", what); 2184 buf = code_string(ss.as_string()); 2185 } 2186 stop(buf); 2187 } 2188 2189 // If a constant does not fit in an immediate field, generate some 2190 // number of MOV instructions and then perform the operation. 
// Perform Rd = Rn OP imm where insn1 is the immediate form of the
// operation and insn2 the register form, splitting or materializing
// the immediate as needed.  Does not set flags.
void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
                                           add_sub_imm_insn insn1,
                                           add_sub_reg_insn insn2) {
  assert(Rd != zr, "Rd = zr and not setting flags?");
  if (operand_valid_for_add_sub_immediate((int)imm)) {
    (this->*insn1)(Rd, Rn, imm);
  } else {
    if (uabs(imm) < (1 << 24)) {
      // Split into two 12-bit immediates: high part then low part.
      (this->*insn1)(Rd, Rn, imm & -(1 << 12));
      (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
    } else {
      // Materialize the constant in Rd and use the register form.
      assert_different_registers(Rd, Rn);
      mov(Rd, (uint64_t)imm);
      (this->*insn2)(Rd, Rn, Rd, LSL, 0);
    }
  }
}

// Separate version which sets the flags.  Optimisations are more restricted
// because we must set the flags correctly (so no two-step split).
void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
                                             add_sub_imm_insn insn1,
                                             add_sub_reg_insn insn2) {
  if (operand_valid_for_add_sub_immediate((int)imm)) {
    (this->*insn1)(Rd, Rn, imm);
  } else {
    assert_different_registers(Rd, Rn);
    assert(Rd != zr, "overflow in immediate operand");
    mov(Rd, (uint64_t)imm);
    (this->*insn2)(Rd, Rn, Rd, LSL, 0);
  }
}

// RegisterOrConstant convenience wrappers: dispatch to the register or
// immediate form of add/sub as appropriate.

void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
  if (increment.is_register()) {
    add(Rd, Rn, increment.as_register());
  } else {
    add(Rd, Rn, increment.as_constant());
  }
}

void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
  if (increment.is_register()) {
    addw(Rd, Rn, increment.as_register());
  } else {
    addw(Rd, Rn, increment.as_constant());
  }
}

void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
  if (decrement.is_register()) {
    sub(Rd, Rn, decrement.as_register());
  } else {
    sub(Rd, Rn, decrement.as_constant());
  }
}

void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
  if (decrement.is_register()) {
    subw(Rd, Rn, decrement.as_register());
  } else {
    subw(Rd, Rn, decrement.as_constant());
  }
}

// Reload rheapbase with the compressed-oops base: directly once the
// universe is fully initialized, otherwise via its external address.
void MacroAssembler::reinit_heapbase()
{
  if (UseCompressedOops) {
    if (Universe::is_fully_initialized()) {
      mov(rheapbase, Universe::narrow_ptrs_base());
    } else {
      lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
      ldr(rheapbase, Address(rheapbase));
    }
  }
}

// this simulates the behaviour of the x86 cmpxchg instruction using a
// load linked/store conditional pair. we use the acquire/release
// versions of these instructions so that we flush pending writes as
// per Java semantics.

// n.b the x86 version assumes the old value to be compared against is
// in rax and updates rax with the value located in memory if the
// cmpxchg fails. we supply a register for the old value explicitly

// the aarch64 load linked/store conditional instructions do not
// accept an offset. so, unlike x86, we must provide a plain register
// to identify the memory word to be compared/exchanged rather than a
// register+offset Address.

// 64-bit compare-and-exchange: if [addr] == oldv, store newv and branch
// to succeed; otherwise leave the observed value in oldv and either
// branch to *fail or fall through.  tmp is clobbered.
void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
                                Label &succeed, Label *fail) {
  // oldv holds comparison value
  // newv holds value to write in exchange
  // addr identifies memory word to compare against/update
  if (UseLSE) {
    mov(tmp, oldv);
    casal(Assembler::xword, oldv, newv, addr);
    cmp(tmp, oldv);
    br(Assembler::EQ, succeed);
    membar(AnyAny);
  } else {
    Label retry_load, nope;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    // flush and load exclusive from the memory location
    // and fail if it is not what we expect
    ldaxr(tmp, addr);
    cmp(tmp, oldv);
    br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
    stlxr(tmp, newv, addr);
    cbzw(tmp, succeed);
    // retry so we only ever return after a load fails to compare
    // ensures we don't return a stale value after a failed write.
    b(retry_load);
    // if the memory word differs we return it in oldv and signal a fail
    bind(nope);
    membar(AnyAny);
    mov(oldv, tmp);
  }
  if (fail)
    b(*fail);
}

// CAS on an object's mark word (header at offset 0).
void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
                                        Label &succeed, Label *fail) {
  assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
  cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
}

// 32-bit variant of cmpxchgptr above.
void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
                              Label &succeed, Label *fail) {
  // oldv holds comparison value
  // newv holds value to write in exchange
  // addr identifies memory word to compare against/update
  // tmp returns 0/1 for success/failure
  if (UseLSE) {
    mov(tmp, oldv);
    casal(Assembler::word, oldv, newv, addr);
    cmp(tmp, oldv);
    br(Assembler::EQ, succeed);
    membar(AnyAny);
  } else {
    Label retry_load, nope;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    // flush and load exclusive from the memory location
    // and fail if it is not what we expect
    ldaxrw(tmp, addr);
    cmp(tmp, oldv);
    br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
    stlxrw(tmp, newv, addr);
    cbzw(tmp, succeed);
    // retry so we only ever return after a load fails to compare
    // ensures we don't return a stale value after a failed write.
    b(retry_load);
    // if the memory word differs we return it in oldv and signal a fail
    bind(nope);
    membar(AnyAny);
    mov(oldv, tmp);
  }
  if (fail)
    b(*fail);
}

// A generic CAS; success or failure is in the EQ flag.  A weak CAS
// doesn't retry and may fail spuriously.  If the oldval is wanted,
// Pass a register for the result, otherwise pass noreg.

// Clobbers rscratch1
void MacroAssembler::cmpxchg(Register addr, Register expected,
                             Register new_val,
                             enum operand_size size,
                             bool acquire, bool release,
                             bool weak,
                             Register result) {
  if (result == noreg)  result = rscratch1;
  BLOCK_COMMENT("cmpxchg {");
  if (UseLSE) {
    mov(result, expected);
    lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
    compare_eq(result, expected, size);
  } else {
    Label retry_load, done;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    load_exclusive(result, addr, size, acquire);
    compare_eq(result, expected, size);
    br(Assembler::NE, done);
    store_exclusive(rscratch1, new_val, addr, size, release);
    if (weak) {
      cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
    } else {
      cbnzw(rscratch1, retry_load);
    }
    bind(done);
  }
  BLOCK_COMMENT("} cmpxchg");
}

// A generic comparison. Only compares for equality, clobbers rscratch1.
// Sub-word sizes are compared by XOR-ing and masking the low bits,
// since there are no halfword/byte compare instructions.
void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) {
  if (size == xword) {
    cmp(rm, rn);
  } else if (size == word) {
    cmpw(rm, rn);
  } else if (size == halfword) {
    eorw(rscratch1, rm, rn);
    ands(zr, rscratch1, 0xffff);
  } else if (size == byte) {
    eorw(rscratch1, rm, rn);
    ands(zr, rscratch1, 0xff);
  } else {
    ShouldNotReachHere();
  }
}


// Returns true if register a differs from both b (when b is a register)
// and c — used below to decide whether `prev` can double as the LL/SC
// result register.
static bool different(Register a, RegisterOrConstant b, Register c) {
  if (b.is_constant())
    return a != c;
  else
    return a != b.as_register() && a != c && b.as_register() != c;
}

// Atomic fetch-and-OP.  prev (if valid) receives the old value; the LSE
// form uses AOP directly, the LL/SC form recomputes the old value with
// the inverse op IOP when prev had to alias the loop's result register.
#define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
  if (UseLSE) {                                                         \
    prev = prev->is_valid() ? prev : zr;                                \
    if (incr.is_register()) {                                           \
      AOP(sz, incr.as_register(), prev, addr);                          \
    } else {                                                            \
      mov(rscratch2, incr.as_constant());                               \
      AOP(sz, rscratch2, prev, addr);                                   \
    }                                                                   \
    return;                                                             \
  }                                                                     \
  Register result = rscratch2;                                          \
  if (prev->is_valid())                                                 \
    result = different(prev, incr, addr) ? prev : rscratch2;            \
                                                                        \
  Label retry_load;                                                     \
  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
    prfm(Address(addr), PSTL1STRM);                                     \
  bind(retry_load);                                                     \
  LDXR(result, addr);                                                   \
  OP(rscratch1, result, incr);                                          \
  STXR(rscratch2, rscratch1, addr);                                     \
  cbnzw(rscratch2, retry_load);                                         \
  if (prev->is_valid() && prev != result) {                             \
    IOP(prev, rscratch1, incr);                                         \
  }                                                                     \
}

ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)

#undef ATOMIC_OP

// Atomic exchange.  prev (if valid) receives the old value.
#define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
  if (UseLSE) {                                                         \
    prev = prev->is_valid() ? prev : zr;                                \
    AOP(sz, newv, prev, addr);                                          \
    return;                                                             \
  }                                                                     \
  Register result = rscratch2;                                          \
  if (prev->is_valid())                                                 \
    result = different(prev, newv, addr) ? prev : rscratch2;            \
                                                                        \
  Label retry_load;                                                     \
  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
    prfm(Address(addr), PSTL1STRM);                                     \
  bind(retry_load);                                                     \
  LDXR(result, addr);                                                   \
  STXR(rscratch1, newv, addr);                                          \
  cbnzw(rscratch1, retry_load);                                         \
  if (prev->is_valid() && prev != result)                               \
    mov(prev, result);                                                  \
}

ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)

#undef ATOMIC_XCHG

#ifndef PRODUCT
extern "C" void findpc(intptr_t x);
#endif

// Runtime target of MacroAssembler::stop(): dump registers (saved by
// pusha at the faulting site) and either pop up a debugger message box
// or assert.
void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
{
  // In order to get locks to work, we need to fake a in_VM state
  if (ShowMessageBoxOnError ) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
#ifndef PRODUCT
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
#endif
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      ttyLocker ttyl;
      tty->print_cr(" pc = 0x%016lx", pc);
#ifndef PRODUCT
      tty->cr();
      findpc(pc);
      tty->cr();
#endif
      tty->print_cr(" r0 = 0x%016lx", regs[0]);
      tty->print_cr(" r1 = 0x%016lx", regs[1]);
      tty->print_cr(" r2 = 0x%016lx", regs[2]);
      tty->print_cr(" r3 = 0x%016lx", regs[3]);
      tty->print_cr(" r4 = 0x%016lx", regs[4]);
      tty->print_cr(" r5 = 0x%016lx", regs[5]);
      tty->print_cr(" r6 = 0x%016lx", regs[6]);
      tty->print_cr(" r7 = 0x%016lx", regs[7]);
      tty->print_cr(" r8 = 0x%016lx", regs[8]);
      tty->print_cr(" r9 = 0x%016lx", regs[9]);
      tty->print_cr("r10 = 0x%016lx", regs[10]);
      tty->print_cr("r11 = 0x%016lx", regs[11]);
      tty->print_cr("r12 = 0x%016lx", regs[12]);
      tty->print_cr("r13 = 0x%016lx", regs[13]);
      tty->print_cr("r14 = 0x%016lx", regs[14]);
      tty->print_cr("r15 = 0x%016lx", regs[15]);
      tty->print_cr("r16 = 0x%016lx", regs[16]);
      tty->print_cr("r17 = 0x%016lx", regs[17]);
      tty->print_cr("r18 = 0x%016lx", regs[18]);
      tty->print_cr("r19 = 0x%016lx", regs[19]);
      tty->print_cr("r20 = 0x%016lx", regs[20]);
      tty->print_cr("r21 = 0x%016lx", regs[21]);
      tty->print_cr("r22 = 0x%016lx", regs[22]);
      tty->print_cr("r23 = 0x%016lx", regs[23]);
      tty->print_cr("r24 = 0x%016lx", regs[24]);
      tty->print_cr("r25 = 0x%016lx", regs[25]);
      tty->print_cr("r26 = 0x%016lx", regs[26]);
      tty->print_cr("r27 = 0x%016lx", regs[27]);
      tty->print_cr("r28 = 0x%016lx", regs[28]);
      // NOTE(review): r29 (regs[29]) is not printed here — confirm
      // whether that is intentional or an omission.
      tty->print_cr("r30 = 0x%016lx", regs[30]);
      tty->print_cr("r31 = 0x%016lx", regs[31]);
      BREAKPOINT;
    }
    ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
  } else {
    ttyLocker ttyl;
    ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
                    msg);
    assert(false, "DEBUG MESSAGE: %s", msg);
  }
}

#ifdef BUILTIN_SIM
// routine to generate an x86 prolog for a stub function which
// bootstraps into the generated ARM code which directly follows the
// stub
//
// the argument encodes the number of general and fp registers
// passed by the caller and the calling convention (currently just
// the number of general registers and assumes C argument passing)

extern "C" {
int aarch64_stub_prolog_size();
void aarch64_stub_prolog();
void aarch64_prolog();
}

void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type,
                                   address *prolog_ptr)
{
  int calltype = (((ret_type & 0x3) << 8) |
                  ((fp_arg_count & 0xf) << 4) |
                  (gp_arg_count & 0xf));

  // the addresses for the x86 to ARM entry code we need to use
  address start = pc();
  // printf("start = %lx\n", start);
  int byteCount = aarch64_stub_prolog_size();
  // printf("byteCount = %x\n", byteCount);
  int instructionCount = (byteCount + 3)/ 4;
  // printf("instructionCount = %x\n", instructionCount);
  // Reserve space in the code buffer, then overwrite it with the
  // prolog copied in below.
  for (int i = 0; i < instructionCount; i++) {
    nop();
  }

  memcpy(start, (void*)aarch64_stub_prolog, byteCount);

  // write the address of the setup routine and the call format at the
  // end of into the copied code
  u_int64_t *patch_end = (u_int64_t *)(start + byteCount);
  if (prolog_ptr)
    patch_end[-2] = (u_int64_t)prolog_ptr;
  patch_end[-1] = calltype;
}
#endif

// Save all caller-saved (call-clobbered) registers except the scratch
// registers: integer r0-r18 minus rscratch1/2, plus FP regs v0-v7 and
// v16-v31.
void MacroAssembler::push_call_clobbered_registers() {
  int step = 4 * wordSize;
  push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
  sub(sp, sp, step);
  mov(rscratch1, -step);
  // Push v0-v7, v16-v31.
  for (int i = 31; i>= 4; i -= 4) {
    if (i <= v7->encoding() || i >= v16->encoding())
      st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
          as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
  }
  st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
      as_FloatRegister(3), T1D, Address(sp));
}

// Inverse of push_call_clobbered_registers().
void MacroAssembler::pop_call_clobbered_registers() {
  for (int i = 0; i < 32; i += 4) {
    if (i <= v7->encoding() || i >= v16->encoding())
      ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
          as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
  }

  pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
}

// Save the full CPU state: r0-r29 plus v0-v31 (full 128-bit vectors
// when save_vectors, otherwise just the low 64 bits).
void MacroAssembler::push_CPU_state(bool save_vectors) {
  int step = (save_vectors ? 8 : 4) * wordSize;
  push(0x3fffffff, sp);         // integer registers except lr & sp
  mov(rscratch1, -step);
  sub(sp, sp, step);
  for (int i = 28; i >= 4; i -= 4) {
    st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
        as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
  }
  st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
}

// Inverse of push_CPU_state().
void MacroAssembler::pop_CPU_state(bool restore_vectors) {
  int step = (restore_vectors ? 8 : 4) * wordSize;
  for (int i = 0; i <= 28; i += 4)
    ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
        as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
  pop(0x3fffffff, sp);          // integer registers except lr & sp
}

/**
 * Helpers for multiply_to_len().
 */
// (final_dest_hi:dest_lo) = (dest_hi:dest_lo) + src1 + src2,
// propagating carries into the high word.
void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
                                     Register src1, Register src2) {
  adds(dest_lo, dest_lo, src1);
  adc(dest_hi, dest_hi, zr);
  adds(dest_lo, dest_lo, src2);
  adc(final_dest_hi, dest_hi, zr);
}

// Generate an address from (r + r1 extend offset).  "size" is the
// size of the operand.  The result may be in rscratch2.
Address MacroAssembler::offsetted_address(Register r, Register r1,
                                          Address::extend ext, int offset, int size) {
  if (offset || (ext.shift() % size != 0)) {
    lea(rscratch2, Address(r, r1, ext));
    return Address(rscratch2, offset);
  } else {
    return Address(r, r1, ext);
  }
}

// Form an sp-relative address for a spill slot, materializing any part
// of the offset that does not fit the addressing mode into tmp.
Address MacroAssembler::spill_address(int size, int offset, Register tmp)
{
  assert(offset >= 0, "spill to negative address?");
  // Offset reachable ?
  //   Not aligned - 9 bits signed offset
  //   Aligned - 12 bits unsigned offset shifted
  Register base = sp;
  if ((offset & (size-1)) && offset >= (1<<8)) {
    // Misaligned and out of 9-bit range: fold the low 12 bits into tmp.
    add(tmp, base, offset & ((1<<12)-1));
    base = tmp;
    offset &= -1<<12;
  }

  if (offset >= (1<<12) * size) {
    // Out of scaled 12-bit range: fold the next 12 bits into tmp too.
    add(tmp, base, offset & (((1<<12)-1)<<12));
    base = tmp;
    offset &= ~(((1<<12)-1)<<12);
  }

  return Address(base, offset);
}

// Checks whether offset is aligned.
// Returns true if it is, else false.
bool MacroAssembler::merge_alignment_check(Register base,
                                           size_t size,
                                           long cur_offset,
                                           long prev_offset) const {
  if (AvoidUnalignedAccesses) {
    if (base == sp) {
      // Checks whether low offset if aligned to pair of registers.
      long pair_mask = size * 2 - 1;
      long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
      return (offset & pair_mask) == 0;
    } else { // If base is not sp, we can't guarantee the access is aligned.
      return false;
    }
  } else {
    long mask = size - 1;
    // Load/store pair instruction only supports element size aligned offset.
    return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
  }
}

// Checks whether current and previous loads/stores can be merged.
// Returns true if it can be merged, else false.
bool MacroAssembler::ldst_can_merge(Register rt,
                                    const Address &adr,
                                    size_t cur_size_in_bytes,
                                    bool is_store) const {
  address prev = pc() - NativeInstruction::instruction_size;
  address last = code()->last_insn();

  // The previous instruction must be a merge candidate recorded by
  // try_merge_ldst and must immediately precede the current pc.
  if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
    return false;
  }

  if (adr.getMode() != Address::base_plus_offset || prev != last) {
    return false;
  }

  NativeLdSt* prev_ldst = NativeLdSt_at(prev);
  size_t prev_size_in_bytes = prev_ldst->size_in_bytes();

  assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
  assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");

  // Both accesses must have the same size and direction.
  if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
    return false;
  }

  // ldp/stp take a 7-bit signed, element-size-scaled immediate.
  long max_offset = 63 * prev_size_in_bytes;
  long min_offset = -64 * prev_size_in_bytes;

  assert(prev_ldst->is_not_pre_post_index(), "pre-index or post-index is not supported to be merged.");

  // Only same base can be merged.
  if (adr.base() != prev_ldst->base()) {
    return false;
  }

  // The two offsets must be exactly adjacent.
  long cur_offset = adr.offset();
  long prev_offset = prev_ldst->offset();
  size_t diff = abs(cur_offset - prev_offset);
  if (diff != prev_size_in_bytes) {
    return false;
  }

  // Following cases can not be merged:
  //   ldr x2, [x2, #8]
  //   ldr x3, [x2, #16]
  // or:
  //   ldr x2, [x3, #8]
  //   ldr x2, [x3, #16]
  // If t1 and t2 is the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
  if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
    return false;
  }

  long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
  // Offset range must be in ldp/stp instruction's range.
  if (low_offset > max_offset || low_offset < min_offset) {
    return false;
  }

  if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
    return true;
  }

  return false;
}

// Merge current load/store with previous load/store into ldp/stp.
// Rewinds the code section over the previous instruction and emits the
// pair in its place; only call when ldst_can_merge() returned true.
void MacroAssembler::merge_ldst(Register rt,
                                const Address &adr,
                                size_t cur_size_in_bytes,
                                bool is_store) {

  assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store) == true, "cur and prev must be able to be merged.");

  Register rt_low, rt_high;
  address prev = pc() - NativeInstruction::instruction_size;
  NativeLdSt* prev_ldst = NativeLdSt_at(prev);

  long offset;

  // Order the two target registers by ascending memory offset.
  if (adr.offset() < prev_ldst->offset()) {
    offset = adr.offset();
    rt_low = rt;
    rt_high = prev_ldst->target();
  } else {
    offset = prev_ldst->offset();
    rt_low = prev_ldst->target();
    rt_high = rt;
  }

  Address adr_p = Address(prev_ldst->base(), offset);
  // Overwrite previous generated binary.
  code_section()->set_end(prev);

  const int sz = prev_ldst->size_in_bytes();
  assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
  if (!is_store) {
    BLOCK_COMMENT("merged ldr pair");
    if (sz == 8) {
      ldp(rt_low, rt_high, adr_p);
    } else {
      ldpw(rt_low, rt_high, adr_p);
    }
  } else {
    BLOCK_COMMENT("merged str pair");
    if (sz == 8) {
      stp(rt_low, rt_high, adr_p);
    } else {
      stpw(rt_low, rt_high, adr_p);
    }
  }
}

/**
 * Multiply 64 bit by 64 bit first loop.
 */
// First loop of BigInteger.multiplyToLen: multiplies the most-significant
// 64-bit word of x by each 64-bit word of y, storing the low half of each
// product into z and carrying the high half forward. Words in the Java int
// arrays are big-endian int pairs, hence the ror(.., 32) conversions around
// each 64-bit load/store.
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
                                           Register y, Register y_idx, Register z,
                                           Register carry, Register product,
                                           Register idx, Register kdx) {
  //
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;
  //

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  // If xlen was 1 there is only a single 32-bit word of x to use.
  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_one_x);

  lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
  ldr(x_xstart, Address(rscratch1));
  ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian

  bind(L_first_loop);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_first_loop_exit);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_one_y); // only one 32-bit word of y remains
  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(y_idx, Address(rscratch1));
  ror(y_idx, y_idx, 32); // convert big-endian to little-endian
  bind(L_multiply);

  // AArch64 has a multiply-accumulate instruction that we can't use
  // here because it has no way to process carries, so we have to use
  // separate add and adc instructions.  Bah.
  umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
  mul(product, x_xstart, y_idx);
  adds(product, product, carry);
  adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product

  subw(kdx, kdx, 2);
  ror(product, product, 32); // back to big-endian
  str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));

  b(L_first_loop);

  bind(L_one_y);
  ldrw(y_idx, Address(y,  0));
  b(L_multiply);

  bind(L_one_x);
  ldrw(x_xstart, Address(x,  0));
  b(L_first_loop);

  bind(L_first_loop_exit);
}

/**
 * Multiply 128 bit by 128. Unrolled inner loop.
 *
 */
// Third (inner) loop of multiplyToLen, unrolled to consume two 64-bit words
// of y per iteration: z[..] += y[..] * product_hi with full 128-bit partial
// products and carry propagation. The scalar tail below the loop handles the
// remaining 0-3 32-bit words.
void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
                                             Register carry, Register carry2,
                                             Register idx, Register jdx,
                                             Register yz_idx1, Register yz_idx2,
                                             Register tmp, Register tmp3, Register tmp4,
                                             Register tmp6, Register product_hi) {

  //   jlong carry, x[], y[], z[];
  //   int kdx = ystart+1;
  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
  //     jlong carry2  = (jlong)(tmp3 >>> 64);
  //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
  //     carry  = (jlong)(tmp4 >>> 64);
  //     z[kdx+idx+1] = (jlong)tmp3;
  //     z[kdx+idx] = (jlong)tmp4;
  //   }
  //   idx += 2;
  //   if (idx > 0) {
  //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
  //     z[kdx+idx] = (jlong)yz_idx1;
  //     carry  = (jlong)(yz_idx1 >>> 64);
  //   }
  //

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;

  lsrw(jdx, idx, 2); // jdx = number of 4-int (two 64-bit word) groups

  bind(L_third_loop);

  subsw(jdx, jdx, 1);
  br(Assembler::MI, L_third_loop_exit);
  subw(idx, idx, 4);

  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));

  ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));

  lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
  ror(yz_idx2, yz_idx2, 32);

  ldp(rscratch2, rscratch1, Address(tmp6, 0));

  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
  umulh(tmp4, product_hi, yz_idx1);

  ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
  ror(rscratch2, rscratch2, 32);

  mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
  umulh(carry2, product_hi, yz_idx2);

  // propagate sum of both multiplications into carry:tmp4:tmp3
  adds(tmp3, tmp3, carry);
  adc(tmp4, tmp4, zr);
  adds(tmp3, tmp3, rscratch1);
  adcs(tmp4, tmp4, tmp);
  adc(carry, carry2, zr);
  adds(tmp4, tmp4, rscratch2);
  adc(carry, carry, zr);

  ror(tmp3, tmp3, 32); // convert little-endian to big-endian
  ror(tmp4, tmp4, 32);
  stp(tmp4, tmp3, Address(tmp6, 0));

  b(L_third_loop);
  bind (L_third_loop_exit);

  // Tail: idx & 3 leftover 32-bit words.
  andw (idx, idx, 0x3);
  cbz(idx, L_post_third_loop_done);

  Label L_check_1;
  subsw(idx, idx, 2);
  br(Assembler::MI, L_check_1);

  // One full 64-bit word remains.
  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(yz_idx1, Address(rscratch1, 0));
  ror(yz_idx1, yz_idx1, 32);
  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
  umulh(tmp4, product_hi, yz_idx1);
  lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
  ldr(yz_idx2, Address(rscratch1, 0));
  ror(yz_idx2, yz_idx2, 32);

  add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);

  ror(tmp3, tmp3, 32);
  str(tmp3, Address(rscratch1, 0));

  bind (L_check_1);

  // At most one 32-bit word remains.
  andw (idx, idx, 0x1);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_post_third_loop_done);
  ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
  umulh(carry2, tmp4, product_hi);
  ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  add2_with_carry(carry2, tmp3, tmp4, carry);

  strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
  extr(carry, carry2, tmp3, 32);

  bind(L_post_third_loop_done);
}

/**
 * Code for BigInteger::multiplyToLen() intrinsic.
 *
 * r0: x
 * r1: xlen
 * r2: y
 * r3: ylen
 * r4:  z
 * r5: zlen
 * r10: tmp1
 * r11: tmp2
 * r12: tmp3
 * r13: tmp4
 * r14: tmp5
 * r15: tmp6
 * r16: tmp7
 *
 */
void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6, Register product_hi) {

  assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);

  // Register roles for the first loop; some argument registers are reused
  // as scratch once their original value is no longer needed.
  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product  = xlen;
  const Register x_xstart = zlen;  // reuse register

  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;
  //

  movw(idx, ylen);      // idx = ylen;
  movw(kdx, zlen);      // kdx = xlen+ylen;
  mov(carry, zr);       // carry = 0;

  Label L_done;

  movw(xstart, xlen);
  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_done);

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);

  Label L_second_loop;
  cbzw(kdx, L_second_loop);

  // Store the final carry from the first loop into z (one or two int words
  // depending on whether kdx is odd).
  Label L_carry;
  subw(kdx, kdx, 1);
  cbzw(kdx, L_carry);

  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
  lsr(carry, carry, 32);
  subw(kdx, kdx, 1);

  bind(L_carry);
  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));

  // Second and third (nested) loops.
  //
  // for (int i = xstart-1; i >= 0; i--) { // Second loop
  //   carry = 0;
  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                    (z[k] & LONG_MASK) + carry;
  //     z[k] = (int)product;
  //     carry = product >>> 32;
  //   }
  //   z[i] = (int)carry;
  // }
  //
  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi

  const Register jdx = tmp1;

  bind(L_second_loop);
  mov(carry, zr); // carry = 0;
  movw(jdx, ylen); // j = ystart+1

  subsw(xstart, xstart, 1); // i = xstart-1;
  br(Assembler::MI, L_done);

  // Spill z (and below x, xstart, ylen) to the stack around the inner loop,
  // because the inner loop needs all the temp registers.
  str(z, Address(pre(sp, -4 * wordSize)));

  Label L_last_x;
  lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
  subsw(xstart, xstart, 1); // i = xstart-1;
  br(Assembler::MI, L_last_x);

  lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
  ldr(product_hi, Address(rscratch1));
  ror(product_hi, product_hi, 32);  // convert big-endian to little-endian

  Label L_third_loop_prologue;
  bind(L_third_loop_prologue);

  str(ylen, Address(sp, wordSize));
  stp(x, xstart, Address(sp, 2 * wordSize));
  multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
                          tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
  ldp(z, ylen, Address(post(sp, 2 * wordSize)));
  ldp(x, xlen, Address(post(sp, 2 * wordSize))); // copy old xstart -> xlen

  addw(tmp3, xlen, 1);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  subsw(tmp3, tmp3, 1);
  br(Assembler::MI, L_done);

  lsr(carry, carry, 32);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  b(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  ldrw(product_hi, Address(x,  0));
  b(L_third_loop_prologue);

  bind(L_done);
}

// Code for BigInteger::mulAdd intrinsic
// out     = r0
// in      = r1
// offset  = r2  (already out.length-offset)
// len     = r3
// k       = r4
//
// pseudo code from java implementation:
// carry = 0;
// offset = out.length-offset - 1;
// for (int j=len-1; j >= 0; j--) {
//     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
//     out[offset--] = (int)product;
//     carry = product >>> 32;
// }
// return (int)carry;
void MacroAssembler::mul_add(Register out, Register in, Register offset,
                             Register len, Register k) {
    Label LOOP, END;
    // pre-loop
    cmp(len, zr); // cmp, not cbz/cbnz: to use condition twice => less branches
    csel(out, zr, out, Assembler::EQ); // len == 0 => return 0 (carry)
    br(Assembler::EQ, END);
    add(in, in, len, LSL, 2); // in[j+1] address
    add(offset, out, offset, LSL, 2); // out[offset + 1] address
    mov(out, zr); // used to keep carry now
    BIND(LOOP);
    // product = in[j]*k + out[offset] + carry; walk both arrays downwards
    // with pre-decrement addressing.
    ldrw(rscratch1, Address(pre(in, -4)));
    madd(rscratch1, rscratch1, k, out);
    ldrw(rscratch2, Address(pre(offset, -4)));
    add(rscratch1, rscratch1, rscratch2);
    strw(rscratch1, Address(offset));
    lsr(out, rscratch1, 32); // carry = product >>> 32
    subs(len, len, 1);
    br(Assembler::NE, LOOP);
    BIND(END);
}

/**
 * Emits code to update CRC-32 with a byte value according to constants in table
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]val       Register containing the byte to fold into the CRC.
 * @param [in]table     Register containing the table of crc constants.
 *
 * uint32_t crc;
 *   val = crc_table[(val ^ crc) & 0xFF];
 *   crc = val ^ (crc >> 8);
 *
 */
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  eor(val, val, crc);
  andr(val, val, 0xff);
  ldrw(val, Address(table, val, Address::lsl(2)));
  eor(crc, val, crc, Assembler::LSR, 8);
}

/**
 * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]v         Register containing the 32-bit to fold into the CRC.
 * @param [in]table0    Register containing table 0 of crc constants.
 * @param [in]table1    Register containing table 1 of crc constants.
 * @param [in]table2    Register containing table 2 of crc constants.
 * @param [in]table3    Register containing table 3 of crc constants.
 *
 * uint32_t crc;
 *   v = crc ^ v
 *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
 *
 */
void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
        Register table0, Register table1, Register table2, Register table3,
        bool upper) {
  // When processing the upper word of a 64-bit load, shift it down first;
  // LSL #0 is a no-op for the lower word.
  eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
  uxtb(tmp, v);
  ldrw(crc, Address(table3, tmp, Address::lsl(2)));
  ubfx(tmp, v, 8, 8);
  ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
  ubfx(tmp, v, 16, 8);
  ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
  ubfx(tmp, v, 24, 8);
  ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
}

// CRC-32 using the hardware CRC32B/W/X instructions. Processes 64 bytes per
// iteration of the main loop (loads are scheduled ahead of the crc32x that
// consumes them), then 32-, 4- and 1-byte tail loops. The CRC is bitwise
// inverted on entry and exit per the zlib convention.
void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
        Register len, Register tmp0, Register tmp1, Register tmp2,
        Register tmp3) {
    Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
    assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);

    mvnw(crc, crc);

    subs(len, len, 128);
    br(Assembler::GE, CRC_by64_pre);
  BIND(CRC_less64);
    adds(len, len, 128-32);
    br(Assembler::GE, CRC_by32_loop);
  BIND(CRC_less32);
    adds(len, len, 32-4);
    br(Assembler::GE, CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, CRC_by1_loop);
    b(L_exit);

  BIND(CRC_by32_loop);
    ldp(tmp0, tmp1, Address(post(buf, 16)));
    subs(len, len, 32);
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(post(buf, 8)));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(post(buf, 8)));
    crc32x(crc, crc, tmp2);
    crc32x(crc, crc, tmp3);
    br(Assembler::GE, CRC_by32_loop);
    cmn(len, 32);
    br(Assembler::NE, CRC_less32);
    b(L_exit);

  BIND(CRC_by4_loop);
    ldrw(tmp0, Address(post(buf, 4)));
    subs(len, len, 4);
    crc32w(crc, crc, tmp0);
    br(Assembler::GE, CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::LE, L_exit);
  BIND(CRC_by1_loop);
    ldrb(tmp0, Address(post(buf, 1)));
    subs(len, len, 1);
    crc32b(crc, crc, tmp0);
    br(Assembler::GT, CRC_by1_loop);
    b(L_exit);

  BIND(CRC_by64_pre);
    // Prologue: load and consume the first 64-byte chunk so the main loop
    // can overlap loads with the crc32x of the previous chunk.
    sub(buf, buf, 8);
    ldp(tmp0, tmp1, Address(buf, 8));
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 24));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(buf, 32));
    crc32x(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 40));
    crc32x(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 48));
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 56));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(pre(buf, 64)));

    b(CRC_by64_loop);

    align(CodeEntryAlignment);
  BIND(CRC_by64_loop);
    subs(len, len, 64);
    crc32x(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 8));
    crc32x(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 16));
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 24));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(buf, 32));
    crc32x(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 40));
    crc32x(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 48));
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 56));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(pre(buf, 64)));
    br(Assembler::GE, CRC_by64_loop);

    // post-loop
    crc32x(crc, crc, tmp2);
    crc32x(crc, crc, tmp3);

    sub(len, len, 64);
    add(buf, buf, 8);
    cmn(len, 128);
    br(Assembler::NE, CRC_less64);
  BIND(L_exit);
    mvnw(crc, crc);
}

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register that will contain address of CRC table
 * @param tmp   scratch register
 */
void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
        Register table0, Register table1, Register table2, Register table3,
        Register tmp, Register tmp2, Register tmp3) {
  Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
  unsigned long offset;

  // Prefer the dedicated CRC32 instructions when available.
  if (UseCRC32) {
      kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2,
        table3);
      return;
  }

  mvnw(crc, crc);

  adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
  if (offset) add(table0, table0, offset);
  add(table1, table0, 1*256*sizeof(juint));
  add(table2, table0, 2*256*sizeof(juint));
  add(table3, table0, 3*256*sizeof(juint));

  if (UseNeon) {
      cmp(len, (u1)64);
      br(Assembler::LT, L_by16);
      eor(v16, T16B, v16, v16);

    Label L_fold;

      add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants

      // Fold 32 bytes per iteration using polynomial multiplication
      // (pmull/pmull2) against the precomputed Neon fold constants.
      ld1(v0, v1, T2D, post(buf, 32));
      ld1r(v4, T2D, post(tmp, 8));
      ld1r(v5, T2D, post(tmp, 8));
      ld1r(v6, T2D, post(tmp, 8));
      ld1r(v7, T2D, post(tmp, 8));
      mov(v16, T4S, 0, crc);

      eor(v0, T16B, v0, v16);
      sub(len, len, 64);

    BIND(L_fold);
      pmull(v22, T8H, v0, v5, T8B);
      pmull(v20, T8H, v0, v7, T8B);
      pmull(v23, T8H, v0, v4, T8B);
      pmull(v21, T8H, v0, v6, T8B);

      pmull2(v18, T8H, v0, v5, T16B);
      pmull2(v16, T8H, v0, v7, T16B);
      pmull2(v19, T8H, v0, v4, T16B);
      pmull2(v17, T8H, v0, v6, T16B);

      uzp1(v24, T8H, v20, v22);
      uzp2(v25, T8H, v20, v22);
      eor(v20, T16B, v24, v25);

      uzp1(v26, T8H, v16, v18);
      uzp2(v27, T8H, v16, v18);
      eor(v16, T16B, v26, v27);

      ushll2(v22, T4S, v20, T8H, 8);
      ushll(v20, T4S, v20, T4H, 8);

      ushll2(v18, T4S, v16, T8H, 8);
      ushll(v16, T4S, v16, T4H, 8);

      eor(v22, T16B, v23, v22);
      eor(v18, T16B, v19, v18);
      eor(v20, T16B, v21, v20);
      eor(v16, T16B, v17, v16);

      uzp1(v17, T2D, v16, v20);
      uzp2(v21, T2D, v16, v20);
      eor(v17, T16B, v17, v21);

      ushll2(v20, T2D, v17, T4S, 16);
      ushll(v16, T2D, v17, T2S, 16);

      eor(v20, T16B, v20, v22);
      eor(v16, T16B, v16, v18);

      uzp1(v17, T2D, v20, v16);
      uzp2(v21, T2D, v20, v16);
      eor(v28, T16B, v17, v21);

      pmull(v22, T8H, v1, v5, T8B);
      pmull(v20, T8H, v1, v7, T8B);
      pmull(v23, T8H, v1, v4, T8B);
      pmull(v21, T8H, v1, v6, T8B);

      pmull2(v18, T8H, v1, v5, T16B);
      pmull2(v16, T8H, v1, v7, T16B);
      pmull2(v19, T8H, v1, v4, T16B);
      pmull2(v17, T8H, v1, v6, T16B);

      ld1(v0, v1, T2D, post(buf, 32));

      uzp1(v24, T8H, v20, v22);
      uzp2(v25, T8H, v20, v22);
      eor(v20, T16B, v24, v25);

      uzp1(v26, T8H, v16, v18);
      uzp2(v27, T8H, v16, v18);
      eor(v16, T16B, v26, v27);

      ushll2(v22, T4S, v20, T8H, 8);
      ushll(v20, T4S, v20, T4H, 8);

      ushll2(v18, T4S, v16, T8H, 8);
      ushll(v16, T4S, v16, T4H, 8);

      eor(v22, T16B, v23, v22);
      eor(v18, T16B, v19, v18);
      eor(v20, T16B, v21, v20);
      eor(v16, T16B, v17, v16);

      uzp1(v17, T2D, v16, v20);
      uzp2(v21, T2D, v16, v20);
      eor(v16, T16B, v17, v21);

      ushll2(v20, T2D, v16, T4S, 16);
      ushll(v16, T2D, v16, T2S, 16);

      eor(v20, T16B, v22, v20);
      eor(v16, T16B, v16, v18);

      uzp1(v17, T2D, v20, v16);
      uzp2(v21, T2D, v20, v16);
      eor(v20, T16B, v17, v21);

      shl(v16, T2D, v28, 1);
      shl(v17, T2D, v20, 1);

      eor(v0, T16B, v0, v16);
      eor(v1, T16B, v1, v17);

      subs(len, len, 32);
      br(Assembler::GE, L_fold);

      // Reduce the two 128-bit accumulators word-by-word through the
      // table-driven scalar routine.
      mov(crc, 0);
      mov(tmp, v0, T1D, 0);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
      mov(tmp, v0, T1D, 1);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
      mov(tmp, v1, T1D, 0);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
      mov(tmp, v1, T1D, 1);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);

      add(len, len, 32);
  }

  BIND(L_by16);
    subs(len, len, 16);
    br(Assembler::GE, L_by16_loop);
    adds(len, len, 16-4);
    br(Assembler::GE, L_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, L_by1_loop);
    b(L_exit);

  BIND(L_by4_loop);
    ldrw(tmp, Address(post(buf, 4)));
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
    subs(len, len, 4);
    br(Assembler::GE, L_by4_loop);
    adds(len, len, 4);
    br(Assembler::LE, L_exit);
  BIND(L_by1_loop);
    subs(len, len, 1);
    ldrb(tmp, Address(post(buf, 1)));
    update_byte_crc32(crc, tmp, table0);
    br(Assembler::GT, L_by1_loop);
    b(L_exit);

    align(CodeEntryAlignment);
  BIND(L_by16_loop);
    subs(len, len, 16);
    ldp(tmp, tmp3, Address(post(buf, 16)));
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
    update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
    br(Assembler::GE, L_by16_loop);
    adds(len, len, 16-4);
    br(Assembler::GE, L_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, L_by1_loop);
  BIND(L_exit);
    mvnw(crc, crc);
}

// CRC-32C using the hardware CRC32CB/W/X instructions. Same structure as
// kernel_crc32_using_crc32 but with the Castagnoli polynomial and, per the
// CRC-32C convention used here, no bit inversion on entry/exit.
void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
        Register len, Register tmp0, Register tmp1, Register tmp2,
        Register tmp3) {
    Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
    assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);

    subs(len, len, 128);
    br(Assembler::GE, CRC_by64_pre);
  BIND(CRC_less64);
    adds(len, len, 128-32);
    br(Assembler::GE, CRC_by32_loop);
  BIND(CRC_less32);
    adds(len, len, 32-4);
    br(Assembler::GE,
       CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, CRC_by1_loop);
    b(L_exit);

  BIND(CRC_by32_loop);
    ldp(tmp0, tmp1, Address(post(buf, 16)));
    subs(len, len, 32);
    crc32cx(crc, crc, tmp0);
    ldr(tmp2, Address(post(buf, 8)));
    crc32cx(crc, crc, tmp1);
    ldr(tmp3, Address(post(buf, 8)));
    crc32cx(crc, crc, tmp2);
    crc32cx(crc, crc, tmp3);
    br(Assembler::GE, CRC_by32_loop);
    cmn(len, 32);
    br(Assembler::NE, CRC_less32);
    b(L_exit);

  BIND(CRC_by4_loop);
    ldrw(tmp0, Address(post(buf, 4)));
    subs(len, len, 4);
    crc32cw(crc, crc, tmp0);
    br(Assembler::GE, CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::LE, L_exit);
  BIND(CRC_by1_loop);
    ldrb(tmp0, Address(post(buf, 1)));
    subs(len, len, 1);
    crc32cb(crc, crc, tmp0);
    br(Assembler::GT, CRC_by1_loop);
    b(L_exit);

  BIND(CRC_by64_pre);
    // Prologue: consume the first 64-byte chunk while scheduling loads
    // ahead of the crc32cx instructions that use them.
    sub(buf, buf, 8);
    ldp(tmp0, tmp1, Address(buf, 8));
    crc32cx(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 24));
    crc32cx(crc, crc, tmp1);
    ldr(tmp3, Address(buf, 32));
    crc32cx(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 40));
    crc32cx(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 48));
    crc32cx(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 56));
    crc32cx(crc, crc, tmp1);
    ldr(tmp3, Address(pre(buf, 64)));

    b(CRC_by64_loop);

    align(CodeEntryAlignment);
  BIND(CRC_by64_loop);
    subs(len, len, 64);
    crc32cx(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 8));
    crc32cx(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 16));
    crc32cx(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 24));
    crc32cx(crc, crc, tmp1);
    ldr(tmp3, Address(buf, 32));
    crc32cx(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 40));
    crc32cx(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 48));
    crc32cx(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 56));
    crc32cx(crc, crc, tmp1);
    ldr(tmp3, Address(pre(buf, 64)));
    br(Assembler::GE, CRC_by64_loop);

    // post-loop
    crc32cx(crc, crc, tmp2);
    crc32cx(crc, crc, tmp3);

    sub(len, len, 64);
    add(buf, buf, 8);
    cmn(len, 128);
    br(Assembler::NE, CRC_less64);
  BIND(L_exit);
}

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register that will contain address of CRC table
 * @param tmp   scratch register
 */
// CRC-32C entry point: on AArch64 this always uses the hardware
// instructions (the table registers are accepted for interface parity with
// kernel_crc32 but are only used as scratch by the callee).
void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
        Register table0, Register table1, Register table2, Register table3,
        Register tmp, Register tmp2, Register tmp3) {
  kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
}


// RAII guard: emits a test of *flag_addr at construction and a branch that
// skips everything emitted until the destructor binds the label.
// NOTE(review): 'value' is not used here -- the emitted cbzw skips the
// guarded code exactly when the flag byte is zero; confirm that callers
// rely on the flag-is-true sense only.
SkipIfEqual::SkipIfEqual(
    MacroAssembler* masm, const bool* flag_addr, bool value) {
  _masm = masm;
  unsigned long offset;
  _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
  _masm->ldrb(rscratch1, Address(rscratch1, offset));
  _masm->cbzw(rscratch1, _label);
}

// Bind the skip target so the guarded code ends here.
SkipIfEqual::~SkipIfEqual() {
  _masm->bind(_label);
}

// Add a 32-bit immediate to the word at dst (read-modify-write through
// rscratch1/rscratch2).
void MacroAssembler::addptr(const Address &dst, int32_t src) {
  Address adr;
  switch(dst.getMode()) {
  case Address::base_plus_offset:
    // This is the expected mode, although we allow all the other
    // forms below.
    adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
    break;
  default:
    lea(rscratch2, dst);
    adr = Address(rscratch2);
    break;
  }
  ldr(rscratch1, adr);
  add(rscratch1, rscratch1, src);
  str(rscratch1, adr);
}

// Compare src1 against the word stored at the (pc-relative) address src2.
void MacroAssembler::cmpptr(Register src1, Address src2) {
  unsigned long offset;
  adrp(rscratch1, src2, offset);
  ldr(rscratch1, Address(rscratch1, offset));
  cmp(src1, rscratch1);
}

// Oop comparison is delegated to the GC barrier set, which may need to
// resolve forwarded objects before comparing.
void MacroAssembler::cmpoop(Register obj1, Register obj2) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->obj_equals(this, obj1, obj2);
}

// Load the Klass* of the object in src into dst, decoding the narrow klass
// pointer when compressed class pointers are in use.
void MacroAssembler::load_klass(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
    decode_klass_not_null(dst);
  } else {
    ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
  }
}

// ((OopHandle)result).resolve();
void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
  // OopHandle::resolve is an indirection like jobject.
  access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
}

// Load the java mirror of the method's holder class:
// method -> ConstMethod -> ConstantPool -> pool holder Klass -> mirror.
// NOTE(review): the 'method' parameter is unused; the chain starts from the
// fixed rmethod register -- confirm all callers pass rmethod here.
void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
  const int mirror_offset = in_bytes(Klass::java_mirror_offset());
  ldr(dst, Address(rmethod, Method::const_offset()));
  ldr(dst, Address(dst, ConstMethod::constants_offset()));
  ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
  ldr(dst, Address(dst, mirror_offset));
  resolve_oop_handle(dst, tmp);
}

// Compare the klass of oop against trial_klass, using the cheapest
// comparison the compressed-class-pointer configuration allows (comparing
// encoded or raw values directly where possible, decoding otherwise).
void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
  if (UseCompressedClassPointers) {
    ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
    if (Universe::narrow_klass_base() == NULL) {
      // Zero-based encoding: compare trial against the shifted narrow value.
      cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift());
      return;
    } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
               && Universe::narrow_klass_shift() == 0) {
      // Only the bottom 32 bits matter
      cmpw(trial_klass, tmp);
      return;
    }
    decode_klass_not_null(tmp);
  } else {
    ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
  }
  cmp(trial_klass, tmp);
}

// Load the biased-locking prototype header of src's klass into dst.
void MacroAssembler::load_prototype_header(Register dst, Register src) {
  load_klass(dst, src);
  ldr(dst, Address(dst, Klass::prototype_header_offset()));
}

// Store the Klass* src into the object dst, encoding it first when
// compressed class pointers are in use. Note: src is clobbered by the
// in-place encode in the compressed case.
void MacroAssembler::store_klass(Register dst, Register src) {
  // FIXME: Should this be a store release?  concurrent gcs assumes
  // klass length is valid if klass field is not null.
  if (UseCompressedClassPointers) {
    encode_klass_not_null(src);
    strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
  } else {
    str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
  }
}

// Zero/initialize the 32-bit gap that follows a narrow klass field.
void MacroAssembler::store_klass_gap(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    // Store to klass gap in destination
    strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
  }
}

// Algorithm must match CompressedOops::encode.
// Encode the oop in s into its narrow form in d; a NULL oop encodes to 0
// (the csel keeps zero when s < rheapbase, i.e. when s is NULL).
void MacroAssembler::encode_heap_oop(Register d, Register s) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
#endif
  verify_oop(s, "broken oop in encode_heap_oop");
  if (Universe::narrow_oop_base() == NULL) {
    if (Universe::narrow_oop_shift() != 0) {
      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
      lsr(d, s, LogMinObjAlignmentInBytes);
    } else {
      mov(d, s);
    }
  } else {
    subs(d, s, rheapbase);
    csel(d, d, zr, Assembler::HS);
    lsr(d, d, LogMinObjAlignmentInBytes);

    /*  Old algorithm: is this any worse?
3764 Label nonnull; 3765 cbnz(r, nonnull); 3766 sub(r, r, rheapbase); 3767 bind(nonnull); 3768 lsr(r, r, LogMinObjAlignmentInBytes); 3769 */ 3770 } 3771 } 3772 3773 void MacroAssembler::encode_heap_oop_not_null(Register r) { 3774 #ifdef ASSERT 3775 verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?"); 3776 if (CheckCompressedOops) { 3777 Label ok; 3778 cbnz(r, ok); 3779 stop("null oop passed to encode_heap_oop_not_null"); 3780 bind(ok); 3781 } 3782 #endif 3783 verify_oop(r, "broken oop in encode_heap_oop_not_null"); 3784 if (Universe::narrow_oop_base() != NULL) { 3785 sub(r, r, rheapbase); 3786 } 3787 if (Universe::narrow_oop_shift() != 0) { 3788 assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3789 lsr(r, r, LogMinObjAlignmentInBytes); 3790 } 3791 } 3792 3793 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) { 3794 #ifdef ASSERT 3795 verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?"); 3796 if (CheckCompressedOops) { 3797 Label ok; 3798 cbnz(src, ok); 3799 stop("null oop passed to encode_heap_oop_not_null2"); 3800 bind(ok); 3801 } 3802 #endif 3803 verify_oop(src, "broken oop in encode_heap_oop_not_null2"); 3804 3805 Register data = src; 3806 if (Universe::narrow_oop_base() != NULL) { 3807 sub(dst, src, rheapbase); 3808 data = dst; 3809 } 3810 if (Universe::narrow_oop_shift() != 0) { 3811 assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3812 lsr(dst, data, LogMinObjAlignmentInBytes); 3813 data = dst; 3814 } 3815 if (data == src) 3816 mov(dst, src); 3817 } 3818 3819 void MacroAssembler::decode_heap_oop(Register d, Register s) { 3820 #ifdef ASSERT 3821 verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?"); 3822 #endif 3823 if (Universe::narrow_oop_base() == NULL) { 3824 if (Universe::narrow_oop_shift() != 0 || d != s) { 3825 lsl(d, s, Universe::narrow_oop_shift()); 3826 } 3827 } else { 
3828 Label done; 3829 if (d != s) 3830 mov(d, s); 3831 cbz(s, done); 3832 add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes); 3833 bind(done); 3834 } 3835 verify_oop(d, "broken oop in decode_heap_oop"); 3836 } 3837 3838 void MacroAssembler::decode_heap_oop_not_null(Register r) { 3839 assert (UseCompressedOops, "should only be used for compressed headers"); 3840 assert (Universe::heap() != NULL, "java heap should be initialized"); 3841 // Cannot assert, unverified entry point counts instructions (see .ad file) 3842 // vtableStubs also counts instructions in pd_code_size_limit. 3843 // Also do not verify_oop as this is called by verify_oop. 3844 if (Universe::narrow_oop_shift() != 0) { 3845 assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3846 if (Universe::narrow_oop_base() != NULL) { 3847 add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes); 3848 } else { 3849 add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes); 3850 } 3851 } else { 3852 assert (Universe::narrow_oop_base() == NULL, "sanity"); 3853 } 3854 } 3855 3856 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) { 3857 assert (UseCompressedOops, "should only be used for compressed headers"); 3858 assert (Universe::heap() != NULL, "java heap should be initialized"); 3859 // Cannot assert, unverified entry point counts instructions (see .ad file) 3860 // vtableStubs also counts instructions in pd_code_size_limit. 3861 // Also do not verify_oop as this is called by verify_oop. 
  if (Universe::narrow_oop_shift() != 0) {
    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    if (Universe::narrow_oop_base() != NULL) {
      add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
    } else {
      add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
    }
  } else {
    assert (Universe::narrow_oop_base() == NULL, "sanity");
    if (dst != src) {
      mov(dst, src);
    }
  }
}

// Compress the Klass* in src into dst.  Tries several strategies, cheapest
// first: zero base (shift/move only), an XOR-able base, a base whose low 32
// bits are clear with zero shift (a plain 32-bit move suffices), and finally
// the generic subtract-base path, which borrows rheapbase as scratch when
// dst == src and restores it afterwards.
void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
  if (Universe::narrow_klass_base() == NULL) {
    if (Universe::narrow_klass_shift() != 0) {
      assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
      lsr(dst, src, LogKlassAlignmentInBytes);
    } else {
      if (dst != src) mov(dst, src);
    }
    return;
  }

  if (use_XOR_for_compressed_class_base) {
    if (Universe::narrow_klass_shift() != 0) {
      eor(dst, src, (uint64_t)Universe::narrow_klass_base());
      lsr(dst, dst, LogKlassAlignmentInBytes);
    } else {
      eor(dst, src, (uint64_t)Universe::narrow_klass_base());
    }
    return;
  }

  if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
      && Universe::narrow_klass_shift() == 0) {
    movw(dst, src);
    return;
  }

#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
#endif

  Register rbase = dst;
  if (dst == src) rbase = rheapbase;
  mov(rbase, (uint64_t)Universe::narrow_klass_base());
  sub(dst, src, rbase);
  if (Universe::narrow_klass_shift() != 0) {
    assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
    lsr(dst, dst, LogKlassAlignmentInBytes);
  }
  // rheapbase was clobbered as a scratch register above; restore it.
  if (dst == src) reinit_heapbase();
}

void MacroAssembler::encode_klass_not_null(Register r) {
  encode_klass_not_null(r, r);
}

// Decompress the narrow klass in src into dst; mirror image of
// encode_klass_not_null above, with the same base/shift strategy cases.
void
MacroAssembler::decode_klass_not_null(Register dst, Register src) {
  Register rbase = dst;
  assert (UseCompressedClassPointers, "should only be used for compressed headers");

  if (Universe::narrow_klass_base() == NULL) {
    if (Universe::narrow_klass_shift() != 0) {
      assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
      lsl(dst, src, LogKlassAlignmentInBytes);
    } else {
      if (dst != src) mov(dst, src);
    }
    return;
  }

  if (use_XOR_for_compressed_class_base) {
    if (Universe::narrow_klass_shift() != 0) {
      lsl(dst, src, LogKlassAlignmentInBytes);
      eor(dst, dst, (uint64_t)Universe::narrow_klass_base());
    } else {
      eor(dst, src, (uint64_t)Universe::narrow_klass_base());
    }
    return;
  }

  if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
      && Universe::narrow_klass_shift() == 0) {
    if (dst != src)
      movw(dst, src);
    // Patch the base's high 32 bits on top of the (zero-extended) low word.
    movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32);
    return;
  }

  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  // Generic path: materialize the base, add (with optional shift).  rheapbase
  // is borrowed as scratch when dst == src and restored afterwards.
  if (dst == src) rbase = rheapbase;
  mov(rbase, (uint64_t)Universe::narrow_klass_base());
  if (Universe::narrow_klass_shift() != 0) {
    assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
    add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
  } else {
    add(dst, rbase, src);
  }
  if (dst == src) reinit_heapbase();
}

void MacroAssembler::decode_klass_not_null(Register r) {
  decode_klass_not_null(r, r);
}

// Load a narrow-oop constant into dst.  The 0xDEAD/0xBEEF immediates are
// placeholders; the recorded oop relocation patches in the real value.
void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
#ifdef ASSERT
  {
    ThreadInVMfromUnknown tiv;
    assert (UseCompressedOops, "should only be used for compressed oops");
    assert (Universe::heap() != NULL, "java heap should be initialized");
    assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
    assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
  }
#endif
  int oop_index = oop_recorder()->find_index(obj);
  InstructionMark im(this);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  code_section()->relocate(inst_mark(), rspec);
  movz(dst, 0xDEAD, 16);
  movk(dst, 0xBEEF);
}

// Load the narrow (compressed) encoding of Klass* k into dst as a movz/movk
// pair, recording a metadata relocation for it.
void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int index = oop_recorder()->find_index(k);
  assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");

  InstructionMark im(this);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  code_section()->relocate(inst_mark(), rspec);
  narrowKlass nk = Klass::encode_klass(k);
  movz(dst, (nk >> 16), 16);
  movk(dst, nk & 0xffff);
}

// Dispatch a decorated load through the active GC's barrier-set assembler.
// AS_RAW forces the base (barrier-free) implementation via a qualified call.
void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
                                    Register dst, Address src,
                                    Register tmp1, Register thread_tmp) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  decorators = AccessInternal::decorator_fixup(decorators);
  bool as_raw = (decorators & AS_RAW) != 0;
  if (as_raw) {
    bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  } else {
    bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  }
}

// Store counterpart of access_load_at; AS_RAW likewise forces the base
// (barrier-free) implementation.
void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
                                     Address dst, Register src,
                                     Register tmp1, Register thread_tmp) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  decorators = AccessInternal::decorator_fixup(decorators);
  bool as_raw = (decorators & AS_RAW) != 0;
  if (as_raw) {
    bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  } else {
    bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  }
}

// Resolve obj through the GC's barrier-set assembler.
void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
  // Use stronger ACCESS_WRITE|ACCESS_READ by default.
4033 if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) { 4034 decorators |= ACCESS_READ | ACCESS_WRITE; 4035 } 4036 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 4037 return bs->resolve(this, decorators, obj); 4038 } 4039 4040 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1, 4041 Register thread_tmp, DecoratorSet decorators) { 4042 access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp); 4043 } 4044 4045 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1, 4046 Register thread_tmp, DecoratorSet decorators) { 4047 access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp); 4048 } 4049 4050 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1, 4051 Register thread_tmp, DecoratorSet decorators) { 4052 access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp); 4053 } 4054 4055 // Used for storing NULLs. 4056 void MacroAssembler::store_heap_oop_null(Address dst) { 4057 access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg); 4058 } 4059 4060 Address MacroAssembler::allocate_metadata_address(Metadata* obj) { 4061 assert(oop_recorder() != NULL, "this assembler needs a Recorder"); 4062 int index = oop_recorder()->allocate_metadata_index(obj); 4063 RelocationHolder rspec = metadata_Relocation::spec(index); 4064 return Address((address)obj, rspec); 4065 } 4066 4067 // Move an oop into a register. immediate is true if we want 4068 // immediate instrcutions, i.e. we are not going to patch this 4069 // instruction while the code is being executed by another thread. In 4070 // that case we can use move immediates rather than the constant pool. 
4071 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) { 4072 int oop_index; 4073 if (obj == NULL) { 4074 oop_index = oop_recorder()->allocate_oop_index(obj); 4075 } else { 4076 #ifdef ASSERT 4077 { 4078 ThreadInVMfromUnknown tiv; 4079 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop"); 4080 } 4081 #endif 4082 oop_index = oop_recorder()->find_index(obj); 4083 } 4084 RelocationHolder rspec = oop_Relocation::spec(oop_index); 4085 if (! immediate) { 4086 address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address 4087 ldr_constant(dst, Address(dummy, rspec)); 4088 } else 4089 mov(dst, Address((address)obj, rspec)); 4090 } 4091 4092 // Move a metadata address into a register. 4093 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) { 4094 int oop_index; 4095 if (obj == NULL) { 4096 oop_index = oop_recorder()->allocate_metadata_index(obj); 4097 } else { 4098 oop_index = oop_recorder()->find_index(obj); 4099 } 4100 RelocationHolder rspec = metadata_Relocation::spec(oop_index); 4101 mov(dst, Address((address)obj, rspec)); 4102 } 4103 4104 Address MacroAssembler::constant_oop_address(jobject obj) { 4105 #ifdef ASSERT 4106 { 4107 ThreadInVMfromUnknown tiv; 4108 assert(oop_recorder() != NULL, "this assembler needs an OopRecorder"); 4109 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop"); 4110 } 4111 #endif 4112 int oop_index = oop_recorder()->find_index(obj); 4113 return Address((address)obj, oop_Relocation::spec(oop_index)); 4114 } 4115 4116 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes. 
void MacroAssembler::tlab_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Register t2,
                                   Label& slow_case) {
  // Delegate TLAB allocation to the active GC's barrier-set assembler.
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
}

// Defines obj, preserves var_size_in_bytes
void MacroAssembler::eden_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Label& slow_case) {
  // Delegate eden allocation to the active GC's barrier-set assembler.
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
}

// Zero words; len is in bytes
// Destroys all registers except addr
// len must be a nonzero multiple of wordSize
void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
  assert_different_registers(addr, len, t1, rscratch1, rscratch2);

#ifdef ASSERT
  { Label L;
    tst(len, BytesPerWord - 1);
    br(Assembler::EQ, L);
    stop("len is not a multiple of BytesPerWord");
    bind(L);
  }
#endif

#ifndef PRODUCT
  block_comment("zero memory");
#endif

  Label loop;
  Label entry;

  // Algorithm (Duff's-device style unrolled loop):
  //
  //    scratch1 = cnt & 7;
  //    cnt -= scratch1;
  //    p += scratch1;
  //    switch (scratch1) {
  //      do {
  //        cnt -= 8;
  //          p[-8] = 0;
  //        case 7:
  //          p[-7] = 0;
  //        case 6:
  //          p[-6] = 0;
  //          // ...
  //        case 1:
  //          p[-1] = 0;
  //        case 0:
  //          p += 8;
  //      } while (cnt);
  //    }

  const int unroll = 8; // Number of str(zr) instructions we'll unroll

  lsr(len, len, LogBytesPerWord);           // convert byte count to word count
  andr(rscratch1, len, unroll - 1);         // tmp1 = cnt % unroll
  sub(len, len, rscratch1);                 // cnt -= unroll
  // t1 always points to the end of the region we're about to zero
  add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
  // Computed jump into the unrolled store sequence: branch back from 'entry'
  // by one 4-byte instruction per leftover word.
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
  br(rscratch2);
  bind(loop);
  sub(len, len, unroll);
  for (int i = -unroll; i < 0; i++)
    Assembler::str(zr, Address(t1, i * wordSize));
  bind(entry);
  add(t1, t1, unroll * wordSize);
  cbnz(len, loop);
}

// Debug-only sanity check of the current thread's TLAB:
// tlab_start <= tlab_top <= tlab_end.
void MacroAssembler::verify_tlab() {
#ifdef ASSERT
  if (UseTLAB && VerifyOops) {
    Label next, ok;

    // Preserve the scratch registers used below.
    stp(rscratch2, rscratch1, Address(pre(sp, -16)));

    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
    cmp(rscratch2, rscratch1);
    br(Assembler::HS, next);
    STOP("assert(top >= start)");
    should_not_reach_here();

    bind(next);
    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
    cmp(rscratch2, rscratch1);
    br(Assembler::HS, ok);
    STOP("assert(top <= end)");
    should_not_reach_here();

    bind(ok);
    ldp(rscratch2, rscratch1, Address(post(sp, 16)));
  }
#endif
}

// Writes to stack successive pages until offset reached to check for
// stack overflow + shadow pages.  This clobbers tmp.
void MacroAssembler::bang_stack_size(Register size, Register tmp) {
  assert_different_registers(tmp, size, rscratch1);
  mov(tmp, sp);
  // Bang stack for total size given plus shadow page size.
  // Bang one page at a time because large size can bang beyond yellow and
  // red zones.
  Label loop;
  mov(rscratch1, os::vm_page_size());
  bind(loop);
  lea(tmp, Address(tmp, -os::vm_page_size()));
  subsw(size, size, rscratch1);
  str(size, Address(tmp));
  br(Assembler::GT, loop);

  // Bang down shadow pages too.
  // At this point, (tmp-0) is the last address touched, so don't
  // touch it again.  (It was touched as (tmp-pagesize) but then tmp
  // was post-decremented.)  Skip this address by starting at i=1, and
  // touch a few more pages below.  N.B.  It is important to touch all
  // the way down to and including i=StackShadowPages.
  for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
    // this could be any sized move but this is can be a debugging crumb
    // so the bigger the better.
    lea(tmp, Address(tmp, -os::vm_page_size()));
    str(size, Address(tmp));
  }
}

// Move the address of the polling page into dest.
void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    // Thread-local polling: the page address lives in the thread structure.
    ldr(dest, Address(rthread, Thread::polling_page_offset()));
  } else {
    unsigned long off;
    adrp(dest, Address(page, rtype), off);
    assert(off == 0, "polling page must be page aligned");
  }
}

// Move the address of the polling page into r, then read the polling
// page.
address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
  get_polling_page(r, page, rtype);
  return read_polling_page(r, rtype);
}

// Read the polling page.  The address of the polling page must
// already be in r.  Returns the address of the emitted load.
address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
  InstructionMark im(this);
  code_section()->relocate(inst_mark(), rtype);
  // A load that is discarded (target zr); its only purpose is to touch the page.
  ldrw(zr, Address(r, 0));
  return inst_mark();
}

// Emit an ADRP (page address) to dest into reg1 and return the remaining
// low 12 bits of the target in byte_offset.
void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
  relocInfo::relocType rtype = dest.rspec().reloc()->type();
  unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
  unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
  unsigned long dest_page = (unsigned long)dest.target() >> 12;
  long offset_low = dest_page - low_page;
  long offset_high = dest_page - high_page;

  assert(is_valid_AArch64_address(dest.target()), "bad address");
  assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");

  InstructionMark im(this);
  code_section()->relocate(inst_mark(), dest.rspec());
  // 8143067: Ensure that the adrp can reach the dest from anywhere within
  // the code cache so that if it is relocated we know it will still reach
  if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
    _adrp(reg1, dest.target());
  } else {
    // Target may be out of ADRP range from some point in the code cache:
    // materialize the low 32 bits via adrp and patch the high 32 bits in
    // with a movk.
    unsigned long target = (unsigned long)dest.target();
    unsigned long adrp_target
      = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);

    _adrp(reg1, (address)adrp_target);
    movk(reg1, target >> 32, 32);
  }
  byte_offset = (unsigned long)dest.target() & 0xfff;
}

// Load the card table's byte_map_base into reg.
void MacroAssembler::load_byte_map_base(Register reg) {
  jbyte *byte_map_base =
    ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();

  if (is_valid_AArch64_address((address)byte_map_base)) {
    // Strictly speaking the byte_map_base isn't an address at all,
    // and it might even be negative.
4320 unsigned long offset; 4321 adrp(reg, ExternalAddress((address)byte_map_base), offset); 4322 // We expect offset to be zero with most collectors. 4323 if (offset != 0) { 4324 add(reg, reg, offset); 4325 } 4326 } else { 4327 mov(reg, (uint64_t)byte_map_base); 4328 } 4329 } 4330 4331 void MacroAssembler::build_frame(int framesize) { 4332 assert(framesize > 0, "framesize must be > 0"); 4333 if (framesize < ((1 << 9) + 2 * wordSize)) { 4334 sub(sp, sp, framesize); 4335 stp(rfp, lr, Address(sp, framesize - 2 * wordSize)); 4336 if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize); 4337 } else { 4338 stp(rfp, lr, Address(pre(sp, -2 * wordSize))); 4339 if (PreserveFramePointer) mov(rfp, sp); 4340 if (framesize < ((1 << 12) + 2 * wordSize)) 4341 sub(sp, sp, framesize - 2 * wordSize); 4342 else { 4343 mov(rscratch1, framesize - 2 * wordSize); 4344 sub(sp, sp, rscratch1); 4345 } 4346 } 4347 } 4348 4349 void MacroAssembler::remove_frame(int framesize) { 4350 assert(framesize > 0, "framesize must be > 0"); 4351 if (framesize < ((1 << 9) + 2 * wordSize)) { 4352 ldp(rfp, lr, Address(sp, framesize - 2 * wordSize)); 4353 add(sp, sp, framesize); 4354 } else { 4355 if (framesize < ((1 << 12) + 2 * wordSize)) 4356 add(sp, sp, framesize - 2 * wordSize); 4357 else { 4358 mov(rscratch1, framesize - 2 * wordSize); 4359 add(sp, sp, rscratch1); 4360 } 4361 ldp(rfp, lr, Address(post(sp, 2 * wordSize))); 4362 } 4363 } 4364 4365 #ifdef COMPILER2 4366 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 4367 4368 // Search for str1 in str2 and return index or -1 4369 void MacroAssembler::string_indexof(Register str2, Register str1, 4370 Register cnt2, Register cnt1, 4371 Register tmp1, Register tmp2, 4372 Register tmp3, Register tmp4, 4373 Register tmp5, Register tmp6, 4374 int icnt1, Register result, int ae) { 4375 // NOTE: tmp5, tmp6 can be zr depending on specific method version 4376 Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH; 
4377 4378 Register ch1 = rscratch1; 4379 Register ch2 = rscratch2; 4380 Register cnt1tmp = tmp1; 4381 Register cnt2tmp = tmp2; 4382 Register cnt1_neg = cnt1; 4383 Register cnt2_neg = cnt2; 4384 Register result_tmp = tmp4; 4385 4386 bool isL = ae == StrIntrinsicNode::LL; 4387 4388 bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL; 4389 bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU; 4390 int str1_chr_shift = str1_isL ? 0:1; 4391 int str2_chr_shift = str2_isL ? 0:1; 4392 int str1_chr_size = str1_isL ? 1:2; 4393 int str2_chr_size = str2_isL ? 1:2; 4394 chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb : 4395 (chr_insn)&MacroAssembler::ldrh; 4396 chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb : 4397 (chr_insn)&MacroAssembler::ldrh; 4398 chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw; 4399 chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr; 4400 4401 // Note, inline_string_indexOf() generates checks: 4402 // if (substr.count > string.count) return -1; 4403 // if (substr.count == 0) return 0; 4404 4405 // We have two strings, a source string in str2, cnt2 and a pattern string 4406 // in str1, cnt1. Find the 1st occurence of pattern in source or return -1. 4407 4408 // For larger pattern and source we use a simplified Boyer Moore algorithm. 4409 // With a small pattern and source we use linear scan. 4410 4411 if (icnt1 == -1) { 4412 sub(result_tmp, cnt2, cnt1); 4413 cmp(cnt1, (u1)8); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256 4414 br(LT, LINEARSEARCH); 4415 dup(v0, T16B, cnt1); // done in separate FPU pipeline. 
Almost no penalty 4416 subs(zr, cnt1, 256); 4417 lsr(tmp1, cnt2, 2); 4418 ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM 4419 br(GE, LINEARSTUB); 4420 } 4421 4422 // The Boyer Moore alogorithm is based on the description here:- 4423 // 4424 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm 4425 // 4426 // This describes and algorithm with 2 shift rules. The 'Bad Character' rule 4427 // and the 'Good Suffix' rule. 4428 // 4429 // These rules are essentially heuristics for how far we can shift the 4430 // pattern along the search string. 4431 // 4432 // The implementation here uses the 'Bad Character' rule only because of the 4433 // complexity of initialisation for the 'Good Suffix' rule. 4434 // 4435 // This is also known as the Boyer-Moore-Horspool algorithm:- 4436 // 4437 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm 4438 // 4439 // This particular implementation has few java-specific optimizations. 4440 // 4441 // #define ASIZE 256 4442 // 4443 // int bm(unsigned char *x, int m, unsigned char *y, int n) { 4444 // int i, j; 4445 // unsigned c; 4446 // unsigned char bc[ASIZE]; 4447 // 4448 // /* Preprocessing */ 4449 // for (i = 0; i < ASIZE; ++i) 4450 // bc[i] = m; 4451 // for (i = 0; i < m - 1; ) { 4452 // c = x[i]; 4453 // ++i; 4454 // // c < 256 for Latin1 string, so, no need for branch 4455 // #ifdef PATTERN_STRING_IS_LATIN1 4456 // bc[c] = m - i; 4457 // #else 4458 // if (c < ASIZE) bc[c] = m - i; 4459 // #endif 4460 // } 4461 // 4462 // /* Searching */ 4463 // j = 0; 4464 // while (j <= n - m) { 4465 // c = y[i+j]; 4466 // if (x[m-1] == c) 4467 // for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i); 4468 // if (i < 0) return j; 4469 // // c < 256 for Latin1 string, so, no need for branch 4470 // #ifdef SOURCE_STRING_IS_LATIN1 4471 // // LL case: (c< 256) always true. 
Remove branch 4472 // j += bc[y[j+m-1]]; 4473 // #endif 4474 // #ifndef PATTERN_STRING_IS_UTF 4475 // // UU case: need if (c<ASIZE) check. Skip 1 character if not. 4476 // if (c < ASIZE) 4477 // j += bc[y[j+m-1]]; 4478 // else 4479 // j += 1 4480 // #endif 4481 // #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF 4482 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not. 4483 // if (c < ASIZE) 4484 // j += bc[y[j+m-1]]; 4485 // else 4486 // j += m 4487 // #endif 4488 // } 4489 // } 4490 4491 if (icnt1 == -1) { 4492 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH, 4493 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP; 4494 Register cnt1end = tmp2; 4495 Register str2end = cnt2; 4496 Register skipch = tmp2; 4497 4498 // str1 length is >=8, so, we can read at least 1 register for cases when 4499 // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for 4500 // UL case. We'll re-read last character in inner pre-loop code to have 4501 // single outer pre-loop load 4502 const int firstStep = isL ? 
7 : 3; 4503 4504 const int ASIZE = 256; 4505 const int STORED_BYTES = 32; // amount of bytes stored per instruction 4506 sub(sp, sp, ASIZE); 4507 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations 4508 mov(ch1, sp); 4509 BIND(BM_INIT_LOOP); 4510 stpq(v0, v0, Address(post(ch1, STORED_BYTES))); 4511 subs(tmp5, tmp5, 1); 4512 br(GT, BM_INIT_LOOP); 4513 4514 sub(cnt1tmp, cnt1, 1); 4515 mov(tmp5, str2); 4516 add(str2end, str2, result_tmp, LSL, str2_chr_shift); 4517 sub(ch2, cnt1, 1); 4518 mov(tmp3, str1); 4519 BIND(BCLOOP); 4520 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size))); 4521 if (!str1_isL) { 4522 subs(zr, ch1, ASIZE); 4523 br(HS, BCSKIP); 4524 } 4525 strb(ch2, Address(sp, ch1)); 4526 BIND(BCSKIP); 4527 subs(ch2, ch2, 1); 4528 br(GT, BCLOOP); 4529 4530 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1 4531 if (str1_isL == str2_isL) { 4532 // load last 8 bytes (8LL/4UU symbols) 4533 ldr(tmp6, Address(tmp6, -wordSize)); 4534 } else { 4535 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols) 4536 // convert Latin1 to UTF. We'll have to wait until load completed, but 4537 // it's still faster than per-character loads+checks 4538 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1] 4539 ubfx(ch1, tmp6, 8, 8); // str1[N-2] 4540 ubfx(ch2, tmp6, 16, 8); // str1[N-3] 4541 andr(tmp6, tmp6, 0xFF); // str1[N-4] 4542 orr(ch2, ch1, ch2, LSL, 16); 4543 orr(tmp6, tmp6, tmp3, LSL, 48); 4544 orr(tmp6, tmp6, ch2, LSL, 16); 4545 } 4546 BIND(BMLOOPSTR2); 4547 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 4548 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8 4549 if (str1_isL == str2_isL) { 4550 // re-init tmp3. It's for free because it's executed in parallel with 4551 // load above. 
Alternative is to initialize it before loop, but it'll 4552 // affect performance on in-order systems with 2 or more ld/st pipelines 4553 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size)); 4554 } 4555 if (!isL) { // UU/UL case 4556 lsl(ch2, cnt1tmp, 1); // offset in bytes 4557 } 4558 cmp(tmp3, skipch); 4559 br(NE, BMSKIP); 4560 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2)); 4561 mov(ch1, tmp6); 4562 if (isL) { 4563 b(BMLOOPSTR1_AFTER_LOAD); 4564 } else { 4565 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8 4566 b(BMLOOPSTR1_CMP); 4567 } 4568 BIND(BMLOOPSTR1); 4569 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 4570 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 4571 BIND(BMLOOPSTR1_AFTER_LOAD); 4572 subs(cnt1tmp, cnt1tmp, 1); 4573 br(LT, BMLOOPSTR1_LASTCMP); 4574 BIND(BMLOOPSTR1_CMP); 4575 cmp(ch1, ch2); 4576 br(EQ, BMLOOPSTR1); 4577 BIND(BMSKIP); 4578 if (!isL) { 4579 // if we've met UTF symbol while searching Latin1 pattern, then we can 4580 // skip cnt1 symbols 4581 if (str1_isL != str2_isL) { 4582 mov(result_tmp, cnt1); 4583 } else { 4584 mov(result_tmp, 1); 4585 } 4586 subs(zr, skipch, ASIZE); 4587 br(HS, BMADV); 4588 } 4589 ldrb(result_tmp, Address(sp, skipch)); // load skip distance 4590 BIND(BMADV); 4591 sub(cnt1tmp, cnt1, 1); 4592 add(str2, str2, result_tmp, LSL, str2_chr_shift); 4593 cmp(str2, str2end); 4594 br(LE, BMLOOPSTR2); 4595 add(sp, sp, ASIZE); 4596 b(NOMATCH); 4597 BIND(BMLOOPSTR1_LASTCMP); 4598 cmp(ch1, ch2); 4599 br(NE, BMSKIP); 4600 BIND(BMMATCH); 4601 sub(result, str2, tmp5); 4602 if (!str2_isL) lsr(result, result, 1); 4603 add(sp, sp, ASIZE); 4604 b(DONE); 4605 4606 BIND(LINEARSTUB); 4607 cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm 4608 br(LT, LINEAR_MEDIUM); 4609 mov(result, zr); 4610 RuntimeAddress stub = NULL; 4611 if (isL) { 4612 stub = 
RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll()); 4613 assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated"); 4614 } else if (str1_isL) { 4615 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul()); 4616 assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated"); 4617 } else { 4618 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu()); 4619 assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated"); 4620 } 4621 trampoline_call(stub); 4622 b(DONE); 4623 } 4624 4625 BIND(LINEARSEARCH); 4626 { 4627 Label DO1, DO2, DO3; 4628 4629 Register str2tmp = tmp2; 4630 Register first = tmp3; 4631 4632 if (icnt1 == -1) 4633 { 4634 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT; 4635 4636 cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2)); 4637 br(LT, DOSHORT); 4638 BIND(LINEAR_MEDIUM); 4639 (this->*str1_load_1chr)(first, Address(str1)); 4640 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift))); 4641 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift); 4642 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4643 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4644 4645 BIND(FIRST_LOOP); 4646 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 4647 cmp(first, ch2); 4648 br(EQ, STR1_LOOP); 4649 BIND(STR2_NEXT); 4650 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4651 br(LE, FIRST_LOOP); 4652 b(NOMATCH); 4653 4654 BIND(STR1_LOOP); 4655 adds(cnt1tmp, cnt1_neg, str1_chr_size); 4656 add(cnt2tmp, cnt2_neg, str2_chr_size); 4657 br(GE, MATCH); 4658 4659 BIND(STR1_NEXT); 4660 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp)); 4661 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 4662 cmp(ch1, ch2); 4663 br(NE, STR2_NEXT); 4664 adds(cnt1tmp, cnt1tmp, str1_chr_size); 4665 add(cnt2tmp, cnt2tmp, str2_chr_size); 4666 br(LT, STR1_NEXT); 4667 b(MATCH); 4668 4669 BIND(DOSHORT); 4670 if (str1_isL == str2_isL) { 4671 cmp(cnt1, 
(u1)2); 4672 br(LT, DO1); 4673 br(GT, DO3); 4674 } 4675 } 4676 4677 if (icnt1 == 4) { 4678 Label CH1_LOOP; 4679 4680 (this->*load_4chr)(ch1, str1); 4681 sub(result_tmp, cnt2, 4); 4682 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4683 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4684 4685 BIND(CH1_LOOP); 4686 (this->*load_4chr)(ch2, Address(str2, cnt2_neg)); 4687 cmp(ch1, ch2); 4688 br(EQ, MATCH); 4689 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4690 br(LE, CH1_LOOP); 4691 b(NOMATCH); 4692 } 4693 4694 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) { 4695 Label CH1_LOOP; 4696 4697 BIND(DO2); 4698 (this->*load_2chr)(ch1, str1); 4699 if (icnt1 == 2) { 4700 sub(result_tmp, cnt2, 2); 4701 } 4702 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4703 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4704 BIND(CH1_LOOP); 4705 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 4706 cmp(ch1, ch2); 4707 br(EQ, MATCH); 4708 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4709 br(LE, CH1_LOOP); 4710 b(NOMATCH); 4711 } 4712 4713 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) { 4714 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP; 4715 4716 BIND(DO3); 4717 (this->*load_2chr)(first, str1); 4718 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size)); 4719 if (icnt1 == 3) { 4720 sub(result_tmp, cnt2, 3); 4721 } 4722 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4723 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4724 BIND(FIRST_LOOP); 4725 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 4726 cmpw(first, ch2); 4727 br(EQ, STR1_LOOP); 4728 BIND(STR2_NEXT); 4729 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4730 br(LE, FIRST_LOOP); 4731 b(NOMATCH); 4732 4733 BIND(STR1_LOOP); 4734 add(cnt2tmp, cnt2_neg, 2*str2_chr_size); 4735 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 4736 cmp(ch1, ch2); 4737 br(NE, STR2_NEXT); 4738 b(MATCH); 4739 } 4740 4741 if (icnt1 == -1 || icnt1 == 1) { 4742 Label CH1_LOOP, 
          HAS_ZERO, DO1_SHORT, DO1_LOOP;

    BIND(DO1);
      // Single-character needle, 8-or-more-character haystack: SWAR scan.
      // Replicate the needle character into every lane of a 64-bit register,
      // then test 8 bytes of the haystack per iteration.
      (this->*str1_load_1chr)(ch1, str1);
      cmp(cnt2, (u1)8);
      br(LT, DO1_SHORT);

      // str2 is advanced to the address of the last full-word load; cnt2_neg
      // becomes a negative byte offset that counts up towards zero.
      sub(result_tmp, cnt2, 8/str2_chr_size);
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

      // Replicate ch1 across all 8-bit (Latin1) or 16-bit (UTF-16) lanes.
      if (str2_isL) {
        orr(ch1, ch1, ch1, LSL, 8);
      }
      orr(ch1, ch1, ch1, LSL, 16);
      orr(ch1, ch1, ch1, LSL, 32);
    BIND(CH1_LOOP);
      // After XOR with the replicated needle a matching lane is zero.
      // (x - 0x01…01) & ~(x | 0x7f…7f) sets a lane's top bit iff that
      // lane of x is zero, so bics leaves NE set iff some lane matched.
      ldr(ch2, Address(str2, cnt2_neg));
      eor(ch2, ch1, ch2);
      sub(tmp1, ch2, tmp3);
      orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
      bics(tmp1, tmp1, tmp2);
      br(NE, HAS_ZERO);
      adds(cnt2_neg, cnt2_neg, 8);
      br(LT, CH1_LOOP);

      // Loop done. If the last load did not end exactly at the string end
      // (exit offset < 8), redo one final, possibly overlapping load at
      // offset 0; otherwise the whole haystack was covered: no match.
      cmp(cnt2_neg, (u1)8);
      mov(cnt2_neg, 0);
      br(LT, CH1_LOOP);
      b(NOMATCH);

    BIND(HAS_ZERO);
      // Byte-reverse so clz yields the bit position of the first
      // (lowest-addressed) matching lane; LSR 3 converts bits to bytes.
      rev(tmp1, tmp1);
      clz(tmp1, tmp1);
      add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
      b(MATCH);

    BIND(DO1_SHORT);
      // Fewer than 8 characters left: simple per-character compare loop.
      mov(result_tmp, cnt2);
      lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
    BIND(DO1_LOOP);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
      cmpw(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // Index = base character count + (negative byte offset converted back
    // to a character count).
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

// Pointer-to-member shorthands used to select the per-encoding load
// instruction (ldrb for Latin1, ldrh for UTF-16) and the matching
// zero-extend instruction.
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

// Generates code that finds the first occurrence of the 16-bit character
// 'ch' in a UTF-16 string: all loads below are halfword accesses and
// addresses are scaled by 2.
//
// Input:  str1   - address of the string data (clobbered)
//         cnt1   - number of characters (clobbered: aliased as cnt1_neg,
//                  the negative byte offset used as loop counter)
//         ch     - character to search for (clobbered: replicated across
//                  all four 16-bit lanes of the register)
// Output: result - index of the first match, or -1 if not found
// Temps:  tmp1, tmp2, tmp3, rscratch1, rscratch2 are clobbered.
void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                         Register ch, Register result,
                                         Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  // Replicate ch across all four 16-bit lanes for the SWAR test below.
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  // Point str1 at the last 4-character load; cnt1_neg counts bytes up to 0.
  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
    // SWAR zero-lane test: after XOR with the replicated needle, a lane is
    // zero iff it matched; (x - 0x0001…) & ~(x | 0x7fff…) flags zero lanes.
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // If the last load stopped short of the string end (exit offset < 8),
    // redo one final, possibly overlapping load at offset 0.
    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    // Byte-reverse then count leading zeroes to locate the first
    // (lowest-addressed) matching lane; LSR 3 converts bits to bytes.
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    // Fewer than 4 characters: plain per-character compare loop.
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // Convert the negative byte offset back to a character index.
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}

// Summary: Compare strings intrinsic implementation. All combinations of UTF-16
//          and Latin1 encodings for both strings are considered. Comparison
//          is performed in lexical order.
//
// Input: str1: pointer to 1st string
//        str2: pointer to 2nd string
//        cnt1: number of bytes in 1st string
//        cnt2: number of bytes in 2nd string
//
// Algorithm parameter:
//   ae: encodings used in 1st and 2nd strings
//
// Temporary registers:
//   tmp1, tmp2, rscratch1, rscratch2: always used
//   vtmp1, vtmp2, vtmp3: used in case encodings are different
//
// Output: result - return 0 if strings are equal. Returns positive value
//         if 1st string > 2nd string in lexical order. Returns
//         negative value if 1st string < 2nd string.
//
// Side effects: str1, str2, cnt1, cnt2, tmp1, tmp2, rscratch1, rscratch2: clobbered.
//               vtmp1, vtmp2, vtmp3: clobbered if encodings are different
//
// Additional data: boolean values: isLL, isLU, isUL, str1_isL, str2_isL and
// int minCharInWords are derived from ae parameter based on encodings used
// in strings. Different code is generated depending on these values:
//
// isLL = both strings are Latin1
// isLU = 1st string is Latin1, 2nd string is UTF-16
// isUL = 1st string is UTF-16, 2nd string is Latin1
// str1_isL = 1st string is Latin1
// str2_isL = 2nd string is Latin1
// str1_chr_shift = shift value to convert between characters counter to byte counter for 1st string
// str2_chr_shift = shift value to convert between characters counter to byte counter for 2nd string
// minCharInWords = minimum number of characters that fit in register (8 for LL case, 4 otherwise)
//
//
// PSEUDO CODE:
//
// // N.B.: this pseudo-code doesn't strictly follow implementation details.
// // It is here to help understand the basics. Detailed implementation
// // description is listed after this code.
//
// <convert byte counters cnt1, cnt2 into character counters if UTF-16 encoding is used>;
// result = cnt1 - cnt2; // length difference. Used if all min(cnt1, cnt2) characters are same
// cnt2 = min(cnt1, cnt2); // amount of characters to check
// if (cnt2 <= minCharInWords) { // <= wordSize bytes should be loaded for comparison
//   if (cnt2 == 0) return result;
//   while (cnt2 != 0) {
//     char str1char = str1[0];
//     char str2char = str2[0];
//     str1 += 1 << str1_chr_shift; // advance pointer by size of str1 character
//     str2 += 1 << str2_chr_shift; // advance pointer by size of str2 character
//     if (str1[0] != str2[0]) return str1[0] - str2[0];
//     cnt2--;
//   }
// } else { // > wordSize bytes should be loaded for comparison
//   // This code checks string in 8-byte blocks. If encodings are
//   // different, Latin1 string will be loaded via 4-byte blocks and then
//   // each block will be converted to 8-byte UTF-16 equivalent. Then 8 byte
//   // blocks are compared. Each load is 8 characters for LL case and 4
//   // characters for LU/UL/UU.
//   // This set of instructions (load 8 Latin1 character OR load 4 Latin1
//   // characters and convert it to 4 UTF-16 character OR load 4 UTF-16
//   // character) is referred to as <load-and-probably-convert ...> below.
//
//   // First iteration in the loop is unrolled to add initialization.
//
//   // The code below calculates addresses of each string last load: addresses
//   // of last 8 characters for LL case and last 4 characters otherwise.
//   // Then offsets from the addresses to the beginning of the strings are
//   // calculated. Offset is then used as loop counter. When offset is >= 0, then
//   // only last loads (possibly overlapped) are left to be checked.
//   // N.B.: in case of same encodings, offsets are the same for both strings.
//   // Then offset for 2nd string is used for both strings.
4939 // 4940 // tmp1 = <load-and-probably-convert str1>; 4941 // if (str1 == str2) return result; 4942 // tmp2 = <load-and-probably-convert str2>; 4943 // 4944 // // use special implementation optimized for large strings. See detailed code and stub comments. 4945 // if (cnt2 >= 72) return compare_long_string_implementation(<args>); 4946 // 4947 // cnt2 -= <amount of loaded characters>; // 8 for isLL case. 4 otherwise. 4948 // 4949 // if (str1_isL == str2_isL) { 4950 // // Optional optimization for same encoding cases. Can be applied for all 4951 // // cases, but is faster in same encoding cases only. Without this branch 4952 // // smallest string (8 character for LL and 4 characters for others) would 4953 // // be checked twice. 4954 // if (cnt2 == 0) goto TAIL_CHECK; // no more characters to be loaded. Just check already loaded data. 4955 // } 4956 // 4957 // // calculate addresses of last loads. use str1 and str2 pointers for that 4958 // str1 = str1 + cnt2 << str1_chr_shift; 4959 // str2 = str2 + cnt2 << str2_chr_shift; 4960 // 4961 // // calculate offsets for both strings. cnt1 and cnt2 can be reused 4962 // if (str1_isL != str2_isL) cnt1 = - (cnt2 << str1_chr_shift); 4963 // cnt2 = - (cnt2 << str2_chr_shift); 4964 // 4965 // // increment calculated offsets by the number of already loaded bytes 4966 // if (isLU) cnt1 += 4; 4967 // if (isUL) cnt1 += 8; 4968 // cnt2 += isUL ? 4 : 8; 4969 // 4970 // if (cnt2 >= 0) goto TAIL; // only last loads remains. Still need to check currently loaded data. 4971 // 4972 // rscratch2 = tmp1 BIT_XOR tmp2; 4973 // if (rscratch2 != 0) goto DIFFERENCE; 4974 // 4975 // // main loop. Label = NEXT_WORD 4976 // do { 4977 // tmp1 = <load-and-probably-convert str1 at offset of (str1_isL == str2_isL ? cnt2 : cnt1)>; 4978 // tmp2 = <load-and-probably-convert str2 at offset of cnt2>; 4979 // 4980 // // update offsets by the number of loaded bytes 4981 // cnt2 += isUL ? 
4 : 8; 4982 // if (isLU) cnt1 += 4; 4983 // if (isUL) cnt1 += 8; 4984 // 4985 // if (cnt2 >= 0) goto TAIL; // last block left to be loaded. Still need to check currently loaded block. 4986 // rscratch2 = tmp1 BIT_XOR tmp2; 4987 // } while (rscratch2 == 0); 4988 // goto DIFFERENCE: 4989 // 4990 // TAIL: // last block left to be loaded. Still need to check currently loaded block. 4991 // rscratch2 = tmp1 BIT_XOR tmp2; 4992 // if (rscratch2 != 0) goto DIFFERENCE; 4993 // tmp1 = <load-and-probably-convert str1>; 4994 // tmp2 = <load-and-probably-convert str2>; 4995 // // fallthrough to TAIL_CHECK 4996 // TAIL_CHECK: 4997 // rscratch2 = tmp1 BIT_XOR tmp2; 4998 // if (rscratch2 == 0) return result; 4999 // DIFFERENCE: // different character found. Find it and compute difference 5000 // // tmp1 and tmp2 have current data with at least 1 different character. 5001 // // Find index of first such character. 5002 // rscratch2 = REVERSE_BITS(rscratch2); 5003 // rscratch2 = COUNT_LEADING_ZEROES(rscratch2); // position of different bit in current 8 bytes 5004 // rscratch2 = rscratch2 & (isLL ? -8 : -16); // number of bits until (possibly converted) different characters in tmp1 and tmp2 5005 // tmp1 = tmp1 >> rscratch2; // now first character in tmp1 is the one sought for 5006 // tmp1 = tmp1 & (isLL ? 0xFF : 0xFFFF); // only first different character left 5007 // tmp2 = tmp2 >> rscratch2; // now first character in tmp2 is the one sought for 5008 // tmp2 = tmp2 & (isLL ? 
0xFF : 0xFFFF); // only first different character left 5009 // result = tmp1 - tmp2; 5010 // } 5011 // return result; 5012 // 5013 // 5014 // 5015 // DETAILED CODE: 5016 // 5017 // if (!str1_isL) cnt1 = cnt1 >> 1; // counter for 1st string (in characters) 5018 // if (!str2_isL) cnt2 = cnt2 >> 1; // counter for 2nd string (in characters) 5019 // result = cnt1 - cnt2; // keep in flags the result of operation 5020 // cnt2 = min(cnt1, cnt2); // implemented as csel instruction using stored flag value above 5021 // bool shortStringsCase = cnt2 <= minCharInWords; // kept in flag 5022 // if (shortStringsCase) goto SHORT_STRING; // separate code for short strings 5023 // if (str1_isL == str2_isL) { // same encoding case 5024 // tmp1 = LOAD8BYTES(str1); 5025 // bool sameString = str1 == str2; // kept in flags 5026 // if (sameString) goto DONE; // the string is the same, return 5027 // tmp2 = LOAD8BYTES(str2); 5028 // bool largeStrings = cnt2 >= 72; // kept in flags 5029 // if (largeStrings) goto STUB; // handled in separate stub implementation for large strings 5030 // cnt2 = cnt2 - minCharsInWord; // decrement counter by the number of loaded characters 5031 // bool noMoreLoadsAvailable = cnt2 == 0; // kept in flags 5032 // if (noMoreLoadsAvailable) goto TAIL_CHECK; 5033 // str2 = str2 + cnt2 << str2_chr_shift; // address of str2 last load 5034 // str1 = str1 + cnt2 << str1_chr_shift; // address of str1 last load 5035 // cnt2 = -(cnt2 << str2_chr_shift); // byte offset to 1st character in each string 5036 // } else if (isLU) { 5037 // vtmp = LOAD4BYTES(str1); 5038 // bool sameString = str1 == str2; // kept in flags 5039 // if (sameString) goto DONE; // return 5040 // tmp2 = LOAD8BYTES(str2); 5041 // bool largeStrings = cnt2 >= 72; // kept in flags 5042 // if (largeStrings) goto STUB; // handled in separate stub implementation for large strings 5043 // cnt2 = cnt2 - 4; // decrement counter by the number of loaded characters 5044 // vtmpz = 0; // implemented as eor 5045 // 
str1 = str1 + cnt2 << str1_chr_shift; // address of str1 last load 5046 // str2 = str2 + cnt2 << str2_chr_shift; // address of str2 last load 5047 // vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpz); // convert Latin1 to UTF16 because it'll be compared with UTF16. Implemented as zip instruction 5048 // cnt1 = -(cnt2 << str1_chr_shift); // byte offset to 1st character in 1st string 5049 // cnt2 = -(cnt2 << str2_chr_shift); // byte offset to 1st character in 2nd string 5050 // cnt1 = cnt1 + 4; // advance 1st string offset by the number of loaded bytes 5051 // tmp1 = vtmp; // move converted characters from FPU register to GPR 5052 // } else { // UL 5053 // tmp1 = LOAD8BYTES(str1); 5054 // bool sameString = str1 == str2; // kept in flags 5055 // if (sameString) goto DONE; // return 5056 // vtmp = LOAD4BYTES(str2); 5057 // bool largeStrings = cnt2 >= 72; // kept in flags 5058 // if (largeStrings) goto STUB; // separate stub implementation for large strings 5059 // cnt2 = cnt2 - 4; // update counter by the number of loaded characters 5060 // str1 = str1 + cnt2 << str1_chr_shift; // address of str1 last load 5061 // vtmpz = 0; // implemented as eor 5062 // str2 = str2 + cnt2 << str2_chr_shift; // address of str2 last load 5063 // cnt1 = -(cnt2 << str1_chr_shift); // byte offset to 1st character in 1st string 5064 // vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpz); // convert Latin1 to UTF16 because it'll be compared with UTF16. implemented as zip instruction 5065 // cnt2 = -(cnt2 << str2_chr_shift); // byte offset to 1st character in 2nd string 5066 // cnt1 = cnt1 + 8; // advance 1st string offset by the number of loaded bytes 5067 // tmp2 = vtmp; // move converted characters from FPU register to GPR 5068 // } 5069 // cnt2 = cnt2 + (isUL ? 
4 : 8); // update offset by the number of loaded bytes 5070 // bool onlyLastLoadRemains = cnt2 >= 0; // kept in flags 5071 // if (onlyLastLoadRemains) goto TAIL; 5072 // rscratch2 = BIT_XOR(tmp1, tmp2); // current block comparison result 5073 // if (rscratch2 != 0) goto DIFFERENCE; // found different characters in current block 5074 // NEXT_WORD: // main loop 5075 // // implementation for each encoding loads 4 or 8 characters at calculated 5076 // // offsets from each string and convert encodings if necessary. Then offsets 5077 // // are updated. 5078 // if (str1_isL == str2_isL) { 5079 // tmp1 = LOAD8BYTES(str1, cnt2); 5080 // tmp2 = LOAD8BYTES(str2, cnt2); 5081 // cnt2 = cnt2 + 8; // update counter by the number of loaded bytes 5082 // onlyLastLoadRemains = cnt2 >= 0; // kept in flags 5083 // } else if (isLU) { 5084 // vtmp = LOAD4BYTES(str1, cnt1); 5085 // tmp2 = LOAD8BYTES(str2, cnt2); 5086 // cnt1 = cnt1 + 4; 5087 // vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpz); 5088 // tmp1 = vtmp; 5089 // cnt2 = cnt2 + 8; 5090 // onlyLastLoadRemains = cnt2 >= 0; // kept in flags 5091 // } else { // UL 5092 // vtmp = LOAD4BYTES(str2, cnt2); 5093 // tmp1 = LOAD8BYTES(str1, cnt1); 5094 // vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpz); 5095 // cnt1 = cnt1 + 8; 5096 // tmp2 = vtmp; 5097 // cnt2 = cnt2 + 4; 5098 // onlyLastLoadRemains = cnt2 >= 0; // kept in flags 5099 // } 5100 // if (onlyLastLoadRemains) goto TAIL; 5101 // rscratch2 = BIT_XOR(tmp1, tmp2); // current block comparison result 5102 // if (rscratch2 == 0) goto NEXT_WORD; 5103 // goto DIFFERENCE; 5104 // TAIL: // check already loaded data and last load 5105 // rscratch2 = BIT_XOR(tmp1, tmp2); // current block comparison result 5106 // if (rscratch2 != 0) goto DIFFERENCE; 5107 // 5108 // // last load (and convert if needed) from each string 5109 // if (str1_isL == str2_isL) { 5110 // tmp1 = LOAD8BYTES(str1); 5111 // tmp2 = LOAD8BYTES(str2); 5112 // } else if (isLU) { 5113 // vtmp = LOAD4BYTES(str1); 5114 // tmp2 = 
//     LOAD8BYTES(str2);
//     vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpz);
//     tmp1 = vtmp;
//   } else { // UL
//     vtmp = LOAD4BYTES(str2);
//     tmp1 = LOAD8BYTES(str1);
//     vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpz);
//     tmp2 = vtmp;
//   }
// TAIL_CHECK: // last check
//   rscratch2 = BIT_XOR(tmp1, tmp2); // current block comparison result
//   if (rscratch2 == 0) goto DONE; // return
// DIFFERENCE:
//   rscratch2 = REVERSE_BITS(rscratch2); // It's not possible to count trailing zeroes. Reverse bits and then count leading zeroes instead.
//   rscratch2 = COUNT_LEADING_ZEROES(rscratch2); // position of different bit in current 8 bytes
//   rscratch2 = rscratch2 & (isLL ? -8 : -16); // number of bits until (possibly converted) different characters in tmp1 and tmp2
//   tmp1 = tmp1 >> rscratch2; // first character in tmp1 is the one sought for
//   tmp1 = isLL ? UNSIGNED_EXTEND_BYTE2INT(tmp1) : UNSIGNED_EXTEND_SHORT2INT(tmp1); // only first different character left
//   tmp2 = tmp2 >> rscratch2; // first character in tmp2 is the one sought for
//   tmp2 = isLL ? UNSIGNED_EXTEND_BYTE2INT(tmp2) : UNSIGNED_EXTEND_SHORT2INT(tmp2); // only first different character left
//   result = tmp1 - tmp2;
//   goto DONE;
// }
//
// STUB:
//   <get address of compare_long_string_[LL|UU|LU|UL] stub routine and call it>
//   goto DONE;
//
// // Short strings comparison code. Instead of simple per-character loop with
// // load-and-compare code it uses a loop that issues 2 per-character loads from
// // each string per iteration. Different registers are used for that to
// // remove dependencies: (tmp1, cnt1) and (tmp2, rscratch1) pairs.
// // First characters loads are issued in pre-loop.
// SHORT_STRING:
//   if (cnt2 == 0) goto DONE; // no characters to compare. Length difference (already calculated) should be used as result
//   tmp1 = LOAD_STR1_CHAR(str1);
//   str1 = str1 + str1_chr_size; // merged with load above as post-increment
//   cnt2 = cnt2 - 1; // calculate remaining length after first character is loaded
//   bool endReached = cnt2 == 0; // kept in flags
//   if (endReached) goto SHORT_LAST_INIT; // load 1 character from 2nd string to complete init and compare it with 1st string character
//   cnt1 = LOAD_STR2_CHAR(str2);
//   str2 = str2 + str2_chr_size; // merged with load above as post-increment
//   goto SHORT_LOOP_START; // per-character loop entry point
// SHORT_LOOP: // per-character loop
//   cnt2 = cnt2 - 1; // calculate remaining length
//   endReached = cnt2 == 0;
//   if (endReached) goto SHORT_LAST;
// SHORT_LOOP_START: // per-character loop entry point
//   tmp2 = LOAD_STR1_CHAR(str1);
//   rscratch1 = LOAD_STR2_CHAR(str2);
//   bool differentResult = tmp1 != cnt1; // check difference of previously loaded pair of registers while last pair is still loading. Kept in flags
//   if (differentResult) goto SHORT_LOOP_TAIL; // calculate character difference and return
//   cnt2 = cnt2 - 1; // calculate remaining length
//   endReached = cnt2 == 0;
//   if (endReached) goto SHORT_LAST2; // last comparison of second pair of registers (tmp2, rscratch1) is left
//   tmp1 = LOAD_STR1_CHAR(str1);
//   cnt1 = LOAD_STR2_CHAR(str2);
//   bool sameResult = tmp2 == rscratch1; // check difference of previously loaded pair of registers while last pair is still loading.
// Kept in flags
//   if (sameResult) goto SHORT_LOOP;
//   result = tmp2 - rscratch1;
//   goto DONE;
// SHORT_LAST2: // last comparison is left: (tmp2, rscratch1)
//   sameResult = tmp2 == rscratch1;
//   if (sameResult) goto DONE;
//   result = tmp2 - rscratch1;
//   goto DONE;
// SHORT_LAST_INIT:
//   cnt1 = LOAD_STR2_CHAR(str2);
// SHORT_LAST: // last comparison of second pair of registers (tmp1, cnt1) is left
//   sameResult = tmp1 == cnt1;
//   if (sameResult) goto DONE;
//   result = tmp1 - cnt1;
// DONE:
//   return; // result

void MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  // Strings of STUB_THRESHOLD characters or more go to the out-of-line stub.
  const u1 STUB_THRESHOLD = 64 + 8;
  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  // Per-encoding character load (ldrb for Latin1, ldrh for UTF-16) and the
  // matching zero-extend used when extracting the first differing character.
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      // Identical string objects compare equal: return the length difference.
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, STUB_THRESHOLD);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      // Point both strings at their last load; cnt2 becomes a negative
      // byte offset counting up towards zero.
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      // Latin1 side is loaded 4 bytes at a time and widened to UTF-16 by
      // zipping with a zero register (vtmpZ).
      ldrs(vtmp, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, STUB_THRESHOLD);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldrs(vtmp, Address(str2));
      cmp(cnt2, STUB_THRESHOLD);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    // Advance the offset past the bytes already loaded; when it reaches
    // zero only the final (possibly overlapping) load remains.
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFFERENCE);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFFERENCE);
    bind(TAIL);
    // Check the block loaded before exiting the loop, then do the final load.
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFFERENCE);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFFERENCE);
    // rev + clz locates the first differing bit (counting from the low
    // address); masking with -8/-16 rounds down to a character boundary.
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
  RuntimeAddress stub = NULL;
  switch(ae) {
    case StrIntrinsicNode::LL:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
      break;
    case StrIntrinsicNode::UU:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
      break;
    case StrIntrinsicNode::LU:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
      break;
    case StrIntrinsicNode::UL:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
      break;
    default:
      ShouldNotReachHere();
  }
  assert(stub.target() != NULL, "compare_long_string stub has not been generated");
  trampoline_call(stub);
  b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // arrange code to do most branches while loading and loading next characters
  // while comparing previous.  Two register pairs alternate:
  // (tmp1, cnt1) and (tmp2, rscratch1).
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  // Compare the previously loaded pair while the new pair is in flight.
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}
#endif // COMPILER2

// This method checks if provided byte array contains byte with highest bit set.
void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
  // Simple and most common case of aligned small array which is not at the
  // end of memory page is placed here. All other cases are in stub.
  // NOTE(review): this is the tail of a generator whose signature lies above
  // this chunk; the has_negatives/has_negatives_long stub calls below suggest
  // it emits the byte[] "has negatives" intrinsic — confirm against the full file.
  Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
  const uint64_t UPPER_BIT_MASK=0x8080808080808080;
  assert_different_registers(ary1, len, result);

  // len <= 0 => no negative bytes; fall through to SET_RESULT with flags clear.
  cmpw(len, 0);
  br(LE, SET_RESULT);
  cmpw(len, 4 * wordSize);
  br(GE, STUB_LONG); // size > 32 then go to stub

  // Check whether reading a full aligned word could run off the end of the
  // page containing the array; if so, take the (careful) stub path.
  int shift = 64 - exact_log2(os::vm_page_size());
  lsl(rscratch1, ary1, shift);
  mov(rscratch2, (size_t)(4 * wordSize) << shift);
  adds(rscratch2, rscratch1, rscratch2); // At end of page?
  br(CS, STUB); // at the end of page then go to stub
  subs(len, len, wordSize);
  br(LT, END);

  // Main loop: test 8 bytes at a time for any set sign bit.
  BIND(LOOP);
  ldr(rscratch1, Address(post(ary1, wordSize)));
  tst(rscratch1, UPPER_BIT_MASK);
  br(NE, SET_RESULT);
  subs(len, len, wordSize);
  br(GE, LOOP);
  cmpw(len, -wordSize);
  br(EQ, SET_RESULT);

  // Tail: shift out the bytes beyond the logical end before testing.
  BIND(END);
  ldr(result, Address(ary1));
  sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
  lslv(result, result, len);
  tst(result, UPPER_BIT_MASK);
  b(SET_RESULT);

  BIND(STUB);
  RuntimeAddress has_neg = RuntimeAddress(StubRoutines::aarch64::has_negatives());
  assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
  trampoline_call(has_neg);
  b(DONE);

  BIND(STUB_LONG);
  RuntimeAddress has_neg_long = RuntimeAddress(
      StubRoutines::aarch64::has_negatives_long());
  assert(has_neg_long.target() != NULL, "has_negatives stub has not been generated");
  trampoline_call(has_neg_long);
  b(DONE);

  BIND(SET_RESULT);
  cset(result, NE); // set true or false

  BIND(DONE);
}

// Emit code comparing two (possibly null) array oops for equality.
// a1, a2:   the two arrays to compare (clobbered; may carry read barriers).
// tmp3..tmp5, cnt1: scratch registers (clobbered).
// result:   on exit, non-zero iff the arrays are equal.
// elem_size: element size in bytes — 1 (byte[]) or 2 (char[]).
void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
                                   Register tmp4, Register tmp5, Register result,
                                   Register cnt1, int elem_size) {
  Label DONE, SAME;
  Register tmp1 = rscratch1;
  Register tmp2 = rscratch2;
  Register cnt2 = tmp2; // cnt2 only used in array length compare

  int elem_per_word = wordSize/elem_size;
  int log_elem_size = exact_log2(elem_size);
  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset
    = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
  // Byte count above which the large_array_equals stub is used instead of
  // the inline loop below.
  int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);

  assert(elem_size == 1 || elem_size == 2, "must be char or byte");
  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);

#ifndef PRODUCT
  {
    const char kind = (elem_size == 2) ? 'U' : 'L';
    char comment[64];
    snprintf(comment, sizeof comment, "array_equals%c{", kind);
    BLOCK_COMMENT(comment);
  }
#endif

  // if (a1 == a2)
  //     return true;
  cmpoop(a1, a2); // May have read barriers for a1 and a2.
  br(EQ, SAME);

  if (UseSimpleArrayEquals) {
    Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
    // if (a1 == null || a2 == null)
    //     return false;
    // a1 & a2 == 0 means (some-pointer is null) or
    // (very-rare-or-even-probably-impossible-pointer-values)
    // so, we can save one branch in most cases
    tst(a1, a2);
    mov(result, false);
    br(EQ, A_MIGHT_BE_NULL);
    // if (a1.length != a2.length)
    //      return false;
    bind(A_IS_NOT_NULL);
    ldrw(cnt1, Address(a1, length_offset));
    ldrw(cnt2, Address(a2, length_offset));
    eorw(tmp5, cnt1, cnt2);
    cbnzw(tmp5, DONE);
    lea(a1, Address(a1, base_offset));
    lea(a2, Address(a2, base_offset));
    // Check for short strings, i.e. smaller than wordSize.
    subs(cnt1, cnt1, elem_per_word);
    br(Assembler::LT, SHORT);
    // Main 8 byte comparison loop.
    bind(NEXT_WORD); {
      ldr(tmp1, Address(post(a1, wordSize)));
      ldr(tmp2, Address(post(a2, wordSize)));
      subs(cnt1, cnt1, elem_per_word);
      eor(tmp5, tmp1, tmp2);
      cbnz(tmp5, DONE);
    } br(GT, NEXT_WORD);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
    // length == 4.
    if (log_elem_size > 0)
      lsl(cnt1, cnt1, log_elem_size);
    ldr(tmp3, Address(a1, cnt1));
    ldr(tmp4, Address(a2, cnt1));
    eor(tmp5, tmp3, tmp4);
    cbnz(tmp5, DONE);
    b(SAME);
    bind(A_MIGHT_BE_NULL);
    // in case both a1 and a2 are not-null, proceed with loads
    cbz(a1, DONE);
    cbz(a2, DONE);
    b(A_IS_NOT_NULL);
    bind(SHORT);

    tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
    {
      ldrw(tmp1, Address(post(a1, 4)));
      ldrw(tmp2, Address(post(a2, 4)));
      eorw(tmp5, tmp1, tmp2);
      cbnzw(tmp5, DONE);
    }
    bind(TAIL03);
    tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
    {
      ldrh(tmp3, Address(post(a1, 2)));
      ldrh(tmp4, Address(post(a2, 2)));
      eorw(tmp5, tmp3, tmp4);
      cbnzw(tmp5, DONE);
    }
    bind(TAIL01);
    if (elem_size == 1) { // Only needed when comparing byte arrays.
      tbz(cnt1, 0, SAME); // 0-1 bytes left.
      {
        ldrb(tmp1, a1);
        ldrb(tmp2, a2);
        eorw(tmp5, tmp1, tmp2);
        cbnzw(tmp5, DONE);
      }
    }
  } else {
    Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
        CSET_EQ, LAST_CHECK;
    mov(result, false);
    cbz(a1, DONE);
    ldrw(cnt1, Address(a1, length_offset));
    cbz(a2, DONE);
    ldrw(cnt2, Address(a2, length_offset));
    // on most CPUs a2 is still "locked"(surprisingly) in ldrw and it's
    // faster to perform another branch before comparing a1 and a2
    cmp(cnt1, (u1)elem_per_word);
    br(LE, SHORT); // short or same
    ldr(tmp3, Address(pre(a1, base_offset)));
    subs(zr, cnt1, stubBytesThreshold);
    br(GE, STUB);
    ldr(tmp4, Address(pre(a2, base_offset)));
    // tmp5 = negated bit count of the valid tail; used by lslv in TAIL/LAST_CHECK
    // to shift out bytes beyond the logical end before comparing.
    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
    cmp(cnt2, cnt1);
    br(NE, DONE);

    // Main 16 byte comparison loop with 2 exits
    bind(NEXT_DWORD); {
      ldr(tmp1, Address(pre(a1, wordSize)));
      ldr(tmp2, Address(pre(a2, wordSize)));
      subs(cnt1, cnt1, 2 * elem_per_word);
      br(LE, TAIL);
      eor(tmp4, tmp3, tmp4);
      cbnz(tmp4, DONE);
      ldr(tmp3, Address(pre(a1, wordSize)));
      ldr(tmp4, Address(pre(a2, wordSize)));
      cmp(cnt1, (u1)elem_per_word);
      br(LE, TAIL2);
      cmp(tmp1, tmp2);
    } br(EQ, NEXT_DWORD);
    b(DONE);

    bind(TAIL);
    eor(tmp4, tmp3, tmp4);
    eor(tmp2, tmp1, tmp2);
    lslv(tmp2, tmp2, tmp5);
    orr(tmp5, tmp4, tmp2);
    cmp(tmp5, zr);
    b(CSET_EQ);

    bind(TAIL2);
    eor(tmp2, tmp1, tmp2);
    cbnz(tmp2, DONE);
    b(LAST_CHECK);

    bind(STUB);
    ldr(tmp4, Address(pre(a2, base_offset)));
    cmp(cnt2, cnt1);
    br(NE, DONE);
    if (elem_size == 2) { // convert to byte counter
      lsl(cnt1, cnt1, 1);
    }
    eor(tmp5, tmp3, tmp4);
    cbnz(tmp5, DONE);
    RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
    assert(stub.target() != NULL, "array_equals_long stub has not been generated");
    trampoline_call(stub);
    b(DONE);

    // NOTE(review): no visible branch in this chunk targets EARLY_OUT —
    // possibly dead code; verify against the full file / history.
    bind(EARLY_OUT);
    // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
    // so, if a2 == null => return false(0), else return true, so we can return a2
    mov(result, a2);
    b(DONE);
    bind(SHORT);
    cmp(cnt2, cnt1);
    br(NE, DONE);
    cbz(cnt1, SAME);
    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
    ldr(tmp3, Address(a1, base_offset));
    ldr(tmp4, Address(a2, base_offset));
    bind(LAST_CHECK);
    eor(tmp4, tmp3, tmp4);
    lslv(tmp5, tmp4, tmp5);
    cmp(tmp5, zr);
    bind(CSET_EQ);
    cset(result, EQ);
    b(DONE);
  }

  bind(SAME);
  mov(result, true);
  // That's it.
  bind(DONE);

  BLOCK_COMMENT("} array_equals");
}

// Compare Strings

// For Strings we're passed the address of the first characters in a1
// and a2 and the length in cnt1.
// elem_size is the element size in bytes: either 1 or 2.
// There are two implementations.  For arrays >= 8 bytes, all
// comparisons (including the final one, which may overlap) are
// performed 8 bytes at a time.  For strings < 8 bytes, we compare a
// halfword, then a short, and then a byte.

// result: non-zero iff the two byte ranges are identical.
// a1, a2, cnt1 are clobbered; cnt1 is the length in BYTES (see callers).
void MacroAssembler::string_equals(Register a1, Register a2,
                                   Register result, Register cnt1, int elem_size)
{
  Label SAME, DONE, SHORT, NEXT_WORD;
  Register tmp1 = rscratch1;
  Register tmp2 = rscratch2;
  Register cnt2 = tmp2; // cnt2 only used in array length compare

  assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);

#ifndef PRODUCT
  {
    const char kind = (elem_size == 2) ? 'U' : 'L';
    char comment[64];
    snprintf(comment, sizeof comment, "{string_equals%c", kind);
    BLOCK_COMMENT(comment);
  }
#endif

  mov(result, false);

  // Check for short strings, i.e. smaller than wordSize.
  subs(cnt1, cnt1, wordSize);
  br(Assembler::LT, SHORT);
  // Main 8 byte comparison loop.
  bind(NEXT_WORD); {
    ldr(tmp1, Address(post(a1, wordSize)));
    ldr(tmp2, Address(post(a2, wordSize)));
    subs(cnt1, cnt1, wordSize);
    eor(tmp1, tmp1, tmp2);
    cbnz(tmp1, DONE);
  } br(GT, NEXT_WORD);
  // Last longword.  In the case where length == 4 we compare the
  // same longword twice, but that's still faster than another
  // conditional branch.
  // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
  // length == 4.
  ldr(tmp1, Address(a1, cnt1));
  ldr(tmp2, Address(a2, cnt1));
  eor(tmp2, tmp1, tmp2);
  cbnz(tmp2, DONE);
  b(SAME);

  bind(SHORT);
  Label TAIL03, TAIL01;

  tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
  {
    ldrw(tmp1, Address(post(a1, 4)));
    ldrw(tmp2, Address(post(a2, 4)));
    eorw(tmp1, tmp1, tmp2);
    cbnzw(tmp1, DONE);
  }
  bind(TAIL03);
  tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
  {
    ldrh(tmp1, Address(post(a1, 2)));
    ldrh(tmp2, Address(post(a2, 2)));
    eorw(tmp1, tmp1, tmp2);
    cbnzw(tmp1, DONE);
  }
  bind(TAIL01);
  if (elem_size == 1) { // Only needed when comparing 1-byte elements
    tbz(cnt1, 0, SAME); // 0-1 bytes left.
    {
      ldrb(tmp1, a1);
      ldrb(tmp2, a2);
      eorw(tmp1, tmp1, tmp2);
      cbnzw(tmp1, DONE);
    }
  }
  // Arrays are equal.
  bind(SAME);
  mov(result, true);

  // That's it.
  bind(DONE);
  BLOCK_COMMENT("} string_equals");
}


// The size of the blocks erased by the zero_blocks stub.  We must
// handle anything smaller than this ourselves in zero_words().
const int MacroAssembler::zero_words_block_size = 8;

// zero_words() is used by C2 ClearArray patterns.  It is as small as
// possible, handling small word counts locally and delegating
// anything larger to the zero_blocks stub.  It is expanded many times
// in compiled code, so it is important to keep it short.

// ptr: Address of a buffer to be zeroed.
// cnt: Count in HeapWords.
//
// ptr, cnt, rscratch1, and rscratch2 are clobbered.
void MacroAssembler::zero_words(Register ptr, Register cnt)
{
  assert(is_power_of_2(zero_words_block_size), "adjust this");
  // Fixed register assignment: the zero_blocks stub expects its
  // arguments in r10/r11.
  assert(ptr == r10 && cnt == r11, "mismatch in register usage");

  BLOCK_COMMENT("zero_words {");
  // Counts below zero_words_block_size are handled inline; larger ones
  // go to the stub.
  cmp(cnt, (u1)zero_words_block_size);
  Label around;
  br(LO, around);
  {
    RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks());
    assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
    // Before the stub set is complete (during stub generation itself) a
    // direct bl is used instead of a trampoline call.
    if (StubRoutines::aarch64::complete()) {
      trampoline_call(zero_blocks);
    } else {
      bl(zero_blocks);
    }
  }
  bind(around);
  // Emit a binary-weighted tail: for each power-of-two bit of cnt below
  // the block size, conditionally store that many words.
  for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
    Label l;
    tbz(cnt, exact_log2(i), l);
    for (int j = 0; j < i; j += 2) {
      stp(zr, zr, post(ptr, 16));
    }
    bind(l);
  }
  {
    // Final odd word, if any.
    Label l;
    tbz(cnt, 0, l);
    str(zr, Address(ptr));
    bind(l);
  }
  BLOCK_COMMENT("} zero_words");
}

// base: Address of a buffer to be zeroed, 8 bytes aligned.
// cnt: Immediate count in HeapWords.
#define SmallArraySize (18 * BytesPerLong)
// Zero a buffer whose size is known at compile time.  Small buffers are
// zeroed with straight-line stp pairs; larger ones with an unrolled loop.
// base: Address of a buffer to be zeroed, 8 bytes aligned.
// cnt: Immediate count in HeapWords.
// Clobbers rscratch1 and rscratch2 on the large-array path.
void MacroAssembler::zero_words(Register base, u_int64_t cnt)
{
  BLOCK_COMMENT("zero_words {");
  int i = cnt & 1; // store any odd word to start
  if (i) str(zr, Address(base));

  if (cnt <= SmallArraySize / BytesPerLong) {
    // Straight-line code: one stp per pair of words.
    for (; i < (int)cnt; i += 2)
      stp(zr, zr, Address(base, i * wordSize));
  } else {
    const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
    int remainder = cnt % (2 * unroll);
    // Peel the remainder so the loop handles an exact multiple of
    // 2*unroll words.
    for (; i < remainder; i += 2)
      stp(zr, zr, Address(base, i * wordSize));

    Label loop;
    Register cnt_reg = rscratch1;
    Register loop_base = rscratch2;
    cnt = cnt - remainder;
    mov(cnt_reg, cnt);
    // adjust base and prebias by -2 * wordSize so we can pre-increment
    add(loop_base, base, (remainder - 2) * wordSize);
    bind(loop);
    sub(cnt_reg, cnt_reg, 2 * unroll);
    for (i = 1; i < unroll; i++)
      stp(zr, zr, Address(loop_base, 2 * i * wordSize));
    // The pre-indexed store both writes the first pair and advances
    // loop_base for the next iteration.
    stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
    cbnz(cnt_reg, loop);
  }
  BLOCK_COMMENT("} zero_words");
}

// Zero blocks of memory by using DC ZVA.
//
// Aligns the base address first sufficiently for DC ZVA, then uses
// DC ZVA repeatedly for every full block.  cnt is the size to be
// zeroed in HeapWords.  Returns the count of words left to be zeroed
// in cnt.
//
// NOTE: This is intended to be used in the zero_blocks() stub.  If
// you want to use it elsewhere, note that cnt must be >= 2*zva_length.
void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
  Register tmp = rscratch1;
  Register tmp2 = rscratch2;
  int zva_length = VM_Version::zva_length();
  Label initial_table_end, loop_zva;
  Label fini;

  // Base must be 16 byte aligned. If not just return and let caller handle it
  tst(base, 0x0f);
  br(Assembler::NE, fini);
  // Align base with ZVA length.
  neg(tmp, base);
  andr(tmp, tmp, zva_length - 1);

  // tmp: the number of bytes to be filled to align the base with ZVA length.
  add(base, base, tmp);
  sub(cnt, cnt, tmp, Assembler::ASR, 3);
  // Computed jump into the stp table below: skip the stores that are not
  // needed for the alignment prefix (each stp is 4 bytes, hence LSR 2).
  adr(tmp2, initial_table_end);
  sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
  br(tmp2);

  for (int i = -zva_length + 16; i < 0; i += 16)
    stp(zr, zr, Address(base, i));
  bind(initial_table_end);

  // Zero one ZVA block per iteration; loop while at least one full
  // block remains (zva_length >> 3 converts bytes to HeapWords).
  sub(cnt, cnt, zva_length >> 3);
  bind(loop_zva);
  dc(Assembler::ZVA, base);
  subs(cnt, cnt, zva_length >> 3);
  add(base, base, zva_length);
  br(Assembler::GE, loop_zva);
  add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
  bind(fini);
}

// base: Address of a buffer to be filled, 8 bytes aligned.
// cnt: Count in 8-byte unit.
// value: Value to be filled with.
// base will point to the end of the buffer after filling.
void MacroAssembler::fill_words(Register base, Register cnt, Register value)
{
  // Algorithm:
  //
  //  scratch1 = cnt & 7;
  //  cnt -= scratch1;
  //  p += scratch1;
  //  switch (scratch1) {
  //    do {
  //      cnt -= 8;
  //       p[-8] = v;
  //    case 7:
  //       p[-7] = v;
  //    case 6:
  //       p[-6] = v;
  //      // ...
  //    case 1:
  //       p[-1] = v;
  //    case 0:
  //       p += 8;
  //     } while (cnt);
  //  }

  assert_different_registers(base, cnt, value, rscratch1, rscratch2);

  Label fini, skip, entry, loop;
  const int unroll = 8; // Number of stp instructions we'll unroll

  cbz(cnt, fini);
  // If base is only 8-byte (not 16-byte) aligned, emit a single str to
  // align it for the stp pairs below.
  tbz(base, 3, skip);
  str(value, Address(post(base, 8)));
  sub(cnt, cnt, 1);
  bind(skip);

  // Duff's-device style entry: jump into the middle of the unrolled loop
  // to handle cnt % (2*unroll) words (each stp is 4 bytes, hence LSL 1
  // to convert "pairs to skip" into a byte offset into the code).
  andr(rscratch1, cnt, (unroll-1) * 2);
  sub(cnt, cnt, rscratch1);
  add(base, base, rscratch1, Assembler::LSL, 3);
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
  br(rscratch2);

  bind(loop);
  add(base, base, unroll * 16);
  for (int i = -unroll; i < 0; i++)
    stp(value, value, Address(base, i * 16));
  bind(entry);
  subs(cnt, cnt, unroll * 2);
  br(Assembler::GE, loop);

  // Final odd word, if any.
  tbz(cnt, 0, fini);
  str(value, Address(post(base, 8)));
  bind(fini);
}

// Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
// java/lang/StringUTF16.compress.
// Encode a char[] (UTF-16) range at src into ISO-8859-1 bytes at dst.
// Stops at the first char with any bit set in 0xff00 (not representable
// in Latin-1).
// len:    number of chars to encode (clobbered; 0 on exit iff all encoded).
// result: on exit, the number of chars successfully encoded.
// Vtmp1..Vtmp4: SIMD temporaries; v4/v5 and rscratch1/rscratch2 are also
// clobbered.
void MacroAssembler::encode_iso_array(Register src, Register dst,
                      Register len, Register result,
                      FloatRegister Vtmp1, FloatRegister Vtmp2,
                      FloatRegister Vtmp3, FloatRegister Vtmp4)
{
    Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
        NEXT_32_START, NEXT_32_PRFM_START;
    Register tmp1 = rscratch1, tmp2 = rscratch2;

      mov(result, len); // Save initial len

#ifndef BUILTIN_SIM
      cmp(len, (u1)8); // handle shortest strings first
      br(LT, LOOP_1);
      cmp(len, (u1)32);
      br(LT, NEXT_8);
      // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
      // to convert chars to bytes
      if (SoftwarePrefetchHintDistance >= 0) {
        ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
        br(LE, NEXT_32_START);
        b(NEXT_32_PRFM_START);
        BIND(NEXT_32_PRFM);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_PRFM_START);
          prfm(Address(src, SoftwarePrefetchHintDistance));
          // OR all halfwords together so a single scalar test detects any
          // non-Latin-1 char in the 32-char chunk.
          orr(v4, T16B, Vtmp1, Vtmp2);
          orr(v5, T16B, Vtmp3, Vtmp4);
          uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
          uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
          uzp2(v5, T16B, v4, v5); // high bytes
          umov(tmp2, v5, D, 1);
          fmovd(tmp1, v5);
          orr(tmp1, tmp1, tmp2);
          cbnz(tmp1, LOOP_8);
          stpq(Vtmp1, Vtmp3, dst);
          sub(len, len, 32);
          add(dst, dst, 32);
          add(src, src, 64);
          subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
          br(GE, NEXT_32_PRFM);
          cmp(len, (u1)32);
          br(LT, LOOP_8);
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_START);
      } else {
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
      }
      prfm(Address(src, SoftwarePrefetchHintDistance));
      uzp1(v4, T16B, Vtmp1, Vtmp2);
      uzp1(v5, T16B, Vtmp3, Vtmp4);
      orr(Vtmp1, T16B, Vtmp1, Vtmp2);
      orr(Vtmp3, T16B, Vtmp3, Vtmp4);
      uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
      umov(tmp2, Vtmp1, D, 1);
      fmovd(tmp1, Vtmp1);
      orr(tmp1, tmp1, tmp2);
      cbnz(tmp1, LOOP_8);
      stpq(v4, v5, dst);
      sub(len, len, 32);
      add(dst, dst, 32);
      add(src, src, 64);
      cmp(len, (u1)32);
      br(GE, NEXT_32);
      cbz(len, DONE);

    BIND(LOOP_8);
      cmp(len, (u1)8);
      br(LT, LOOP_1);
    BIND(NEXT_8);
      ld1(Vtmp1, T8H, src);
      uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
      uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
      fmovd(tmp1, Vtmp3);
      cbnz(tmp1, NEXT_1);
      strd(Vtmp2, dst);

      sub(len, len, 8);
      add(dst, dst, 8);
      add(src, src, 16);
      cmp(len, (u1)8);
      br(GE, NEXT_8);

    BIND(LOOP_1);
#endif
    // Scalar fallback: one char at a time; stop on the first char with
    // any high byte set.
    cbz(len, DONE);
    BIND(NEXT_1);
      ldrh(tmp1, Address(post(src, 2)));
      tst(tmp1, 0xff00);
      br(NE, SET_RESULT);
      strb(tmp1, Address(post(dst, 1)));
      subs(len, len, 1);
      br(GT, NEXT_1);

    BIND(SET_RESULT);
      sub(result, result, len); // Return index where we stopped
                                // Return len == 0 if we processed all
                                // characters
    BIND(DONE);
}


// Inflate byte[] array to char[].
// Widens len Latin-1 bytes at src into len UTF-16 chars at dst by
// interleaving with zero bytes (zip1 with a zeroed vector).
// len, tmp4, rscratch1, and vtmp1..vtmp3 are clobbered.
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
        FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
        Register tmp4) {
  Label big, done, after_init, to_stub;

  assert_different_registers(src, dst, len, tmp4, rscratch1);

  fmovd(vtmp1, zr);  // vtmp1 = 0: the zero half of every zip1 below
  lsrw(tmp4, len, 3); // tmp4 = number of full 8-byte chunks
  bind(after_init);
  cbnzw(tmp4, big);
  // Short string: less than 8 bytes.
  {
    Label loop, tiny;

    cmpw(len, 4);
    br(LT, tiny);
    // Use SIMD to do 4 bytes.
    ldrs(vtmp2, post(src, 4));
    zip1(vtmp3, T8B, vtmp2, vtmp1);
    subw(len, len, 4);
    strd(vtmp3, post(dst, 8));

    cbzw(len, done);

    // Do the remaining bytes by steam.
    bind(loop);
    ldrb(tmp4, post(src, 1));
    strh(tmp4, post(dst, 2));
    subw(len, len, 1);

    bind(tiny);
    cbnz(len, loop);

    b(done);
  }

  if (SoftwarePrefetchHintDistance >= 0) {
    bind(to_stub);
    RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
    assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
    trampoline_call(stub);
    // The stub leaves a (smaller) residue; re-enter at after_init to
    // finish it with the inline code.
    b(after_init);
  }

  // Unpack the bytes 8 at a time.
  bind(big);
  {
    Label loop, around, loop_last, loop_start;

    if (SoftwarePrefetchHintDistance >= 0) {
      const int large_loop_threshold = (64 + 16)/8;
      ldrd(vtmp2, post(src, 8));
      andw(len, len, 7);
      cmp(tmp4, (u1)large_loop_threshold);
      br(GE, to_stub);
      b(loop_start);

      // Software-pipelined loop: two 8-byte chunks in flight per iteration.
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      bind(loop_start);
      subs(tmp4, tmp4, 1);
      br(EQ, loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      ldrd(vtmp3, post(src, 8));
      st1(vtmp2, T8H, post(dst, 16));
      subs(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp3, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      br(NE, loop);
      b(around);
      bind(loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      st1(vtmp2, T8H, post(dst, 16));
      bind(around);
      cbz(len, done);
    } else {
      andw(len, len, 7);
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      sub(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp2, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      cbnz(tmp4, loop);
    }
  }

  // Do the tail of up to 8 bytes.
  // The loads/stores below deliberately overlap the already-processed
  // area: read the last 8 source bytes and write the last 16 dest bytes.
  add(src, src, len);
  ldrd(vtmp3, Address(src, -8));
  add(dst, dst, len, ext::uxtw, 1);
  zip1(vtmp3, T16B, vtmp3, vtmp1);
  strq(vtmp3, Address(dst, -16));

  bind(done);
}

// Compress char[] array to byte[].
// Compress a char[] (UTF-16) range into a byte[] (Latin-1) range.
// Thin wrapper over encode_iso_array: result is the original length if
// every char fitted in one byte (len == 0 on return from the encoder),
// and 0 otherwise.
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
                                         FloatRegister tmp1Reg, FloatRegister tmp2Reg,
                                         FloatRegister tmp3Reg, FloatRegister tmp4Reg,
                                         Register result) {
  encode_iso_array(src, dst, len, result,
                   tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
  // encode_iso_array leaves len == 0 iff all chars were compressed;
  // collapse a partial result to 0 as the intrinsic contract requires.
  cmp(len, zr);
  csel(result, result, zr, EQ);
}

// get_thread() can be called anywhere inside generated code so we
// need to save whatever non-callee save context might get clobbered
// by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
// the call setup code.
//
// aarch64_get_thread_helper() clobbers only r0, r1, and flags.
//
void MacroAssembler::get_thread(Register dst) {
  // Save r0, r1 and lr (except when one of them is the destination).
  RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
  push(saved_regs, sp);

  mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
  blrt(lr, 1, 0, 1);
  // The helper returns the thread in c_rarg0 (r0); move it if the caller
  // wanted it somewhere else.
  if (dst != c_rarg0) {
    mov(dst, c_rarg0);
  }

  pop(saved_regs, sp);
}