/*
 * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "interpreter/interpreter.hpp"

#include "compiler/disassembler.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/klass.inline.hpp"
#include "oops/oop.inline.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/node.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/thread.hpp"

#if INCLUDE_ALL_GCS
#include "gc/g1/g1CollectedHeap.inline.hpp"
#include "gc/g1/g1SATBCardTableModRefBS.hpp"
#include "gc/g1/heapRegion.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target - branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      u_int64_t dest = (u_int64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
      //
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
          Instruction_aarch64::extract(insn, 4, 0) ==
          Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}
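
// Worked example of the type 2 (adrp/add) patch above; the addresses are
// made up purely for illustration. With the adrp at 0x10000 and a new
// target of 0x25678:
//   adr_page - pc_page = (0x25678 >> 12) - (0x10000 >> 12) = 0x25 - 0x10 = 0x15
// so the adrp's split immediate (bits 23:5 for the high part, 30:29 for the
// low two bits) encodes a page delta of 0x15, while the add's 12-bit
// immediate (bits 21:10 of insn2) is rewritten to
//   offset_lo = 0x25678 & 0xfff = 0x678.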
int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = oopDesc::encode_heap_oop((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}
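
// Illustration of the narrow-OOP case (the encoded value is hypothetical):
// for n == 0xCAFE1234 the two-instruction form is patched so that the
// first instruction carries the upper half (shifted left 16) and the
// trailing movk carries the lower half, roughly
//   movz Rd, #0xcafe, lsl #16
//   movk Rd, #0x1234
// leaving the 32-bit encoded heap oop in Rd.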
address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
          Instruction_aarch64::extract(insn, 4, 0) ==
          Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
            Instruction_aarch64::extract(insn, 4, 0) ==
            Instruction_aarch64::extract(insn2, 4, 0)) {
          target_page = (target_page & 0xffffffff) |
                        ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    u_int32_t *insns = (u_int32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    return 0;
  } else {
    ShouldNotReachHere();
  }
  return address(((uint64_t)insn_addr + (offset << 2)));
}
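
// Decoding sketch for the wide-constant case above (the inverse of
// movptr(); the address is made up). Given
//   movz x8, #0x9abc
//   movk x8, #0x5678, lsl #16
//   movk x8, #0x1234, lsl #32
// each 16-bit immediate is pulled from bits 20:5 of its instruction and
// shifted back into place, reassembling 0x1234_5678_9abc.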
void MacroAssembler::serialize_memory(Register thread, Register tmp) {
  dsb(Assembler::SY);
}


void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp & sp of the last Java frame have to be
// recorded in the (thread-local) JavaThread object. When leaving C land,
// the last Java fp has to be reset to 0. This is required to allow proper
// stack traversal.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
    str(last_java_pc, Address(rthread,
                              JavaThread::frame_anchor_offset()
                              + JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  if (last_java_pc != NULL) {
    adr(scratch, last_java_pc);
  } else {
    // FIXME: This is almost never correct.  We should delete all
    // cases of set_last_Java_frame with last_java_pc=NULL and use the
    // correct return address instead.
    adr(scratch, pc());
  }

  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch);
  }
}

void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    blr(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}
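
// Emitted-code sketch for the far_branches() case of far_call above
// (register and target names are illustrative; rscratch1/x8 is the usual
// tmp on this port):
//   adrp x8, target_page          ; PC-relative page of the callee
//   add  x8, x8, #page_offset     ; low 12 bits of the target
//   blr  x8                       ; indirect call, LR = return address
// When the code cache is small enough for direct reach, a single
// "bl target" is emitted instead.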
void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far jump not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    br(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}

int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(tmp_reg, markOopDesc::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go into the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1, rscratch2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(temp_reg, markOopDesc::biased_lock_pattern);
  br(Assembler::EQ, done);
}
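
// For reference, the 64-bit mark word layout these routines assume is
// roughly (see markOop.hpp for the authoritative definition):
//   [ thread (54) | epoch (2) | unused (1) | age (4) | biased_lock (1) | lock (2) ]
// biased_lock_mask_in_place covers the low three bits (biased_lock|lock),
// and biased_lock_pattern is 0b101, which is exactly what the andr/cmp
// pair in biased_locking_exit tests for.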
static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result,   "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline.  If the code cache is small
// trampolines won't be emitted.

address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  unsigned int start_offset = offset();
  if (far_branches() && !Compile::current()->in_scratch_emit_size()) {
    address stub = emit_trampoline_stub(start_offset, entry.target());
    if (stub == NULL) {
      return NULL; // CodeCache is full
    }
  }

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
  if (!far_branches()) {
    bl(entry.target());
  } else {
    bl(pc());
  }
  // just need to return a non-null address
  return pc();
}


// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)

address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  address stub = start_a_stub(Compile::MAX_stubs_size/2);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call
  // - call
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub;
}
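
// The stub emitted above has this shape (a sketch; rscratch1 is r8 on this
// port):
//   <trampoline stub>:
//     ldr x8, 0f        ; load the destination from the pc-relative literal
//     br  x8            ; jump; LR still holds the original call's return pc
//   0:
//     .quad dest        ; 64-bit call target
// The call site's "bl" can later be repointed at this stub whenever the
// real destination falls outside bl's +-128MB direct range.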
address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}
arg"); 826 pass_arg3(this, arg_3); 827 assert(arg_1 != c_rarg2, "smashed arg"); 828 pass_arg2(this, arg_2); 829 pass_arg1(this, arg_1); 830 call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions); 831 } 832 833 834 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) { 835 ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset())); 836 str(zr, Address(java_thread, JavaThread::vm_result_offset())); 837 verify_oop(oop_result, "broken oop in call_VM_base"); 838 } 839 840 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) { 841 ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset())); 842 str(zr, Address(java_thread, JavaThread::vm_result_2_offset())); 843 } 844 845 void MacroAssembler::align(int modulus) { 846 while (offset() % modulus != 0) nop(); 847 } 848 849 // these are no-ops overridden by InterpreterMacroAssembler 850 851 void MacroAssembler::check_and_handle_earlyret(Register java_thread) { } 852 853 void MacroAssembler::check_and_handle_popframe(Register java_thread) { } 854 855 856 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr, 857 Register tmp, 858 int offset) { 859 intptr_t value = *delayed_value_addr; 860 if (value != 0) 861 return RegisterOrConstant(value + offset); 862 863 // load indirectly to solve generation ordering problem 864 ldr(tmp, ExternalAddress((address) delayed_value_addr)); 865 866 if (offset != 0) 867 add(tmp, tmp, offset); 868 869 return RegisterOrConstant(tmp); 870 } 871 872 873 void MacroAssembler:: notify(int type) { 874 if (type == bytecode_start) { 875 // set_last_Java_frame(esp, rfp, (address)NULL); 876 Assembler:: notify(type); 877 // reset_last_Java_frame(true); 878 } 879 else 880 Assembler:: notify(type); 881 } 882 883 // Look up the method for a megamorphic invokeinterface call. 884 // The target method is determined by <intf_klass, itable_index>. 885 // The receiver klass is in recv_klass. 886 // On success, the result will be in method_result, and execution falls through. 887 // On failure, execution transfers to the given label. 888 void MacroAssembler::lookup_interface_method(Register recv_klass, 889 Register intf_klass, 890 RegisterOrConstant itable_index, 891 Register method_result, 892 Register scan_temp, 893 Label& L_no_such_interface) { 894 assert_different_registers(recv_klass, intf_klass, method_result, scan_temp); 895 assert(itable_index.is_constant() || itable_index.as_register() == method_result, 896 "caller must use same register for non-constant itable index as for method"); 897 898 // Compute start of first itableOffsetEntry (which is at the end of the vtable) 899 int vtable_base = in_bytes(Klass::vtable_start_offset()); 900 int itentry_off = itableMethodEntry::method_offset_in_bytes(); 901 int scan_step = itableOffsetEntry::size() * wordSize; 902 int vte_size = vtableEntry::size_in_bytes(); 903 assert(vte_size == wordSize, "else adjust times_vte_scale"); 904 905 ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset())); 906 907 // %%% Could store the aligned, prescaled offset in the klassoop. 908 // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base)); 909 lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3))); 910 add(scan_temp, scan_temp, vtable_base); 911 912 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 
  assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
  // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
  lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
  if (itentry_off)
    add(recv_klass, recv_klass, itentry_off);

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel) break;

    bind(search);

    // Check that the previous entry is non-null. A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  ldr(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
  ldr(method_result, Address(recv_klass, scan_temp));
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result, Address(recv_klass, vtable_offset_in_bytes));
  }
}
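
// Worked example for the constant-index path above (the numeric offsets
// are illustrative; the real values come from Klass::vtable_start_offset()
// and vtableEntry::method_offset_in_bytes()): with base == 0x1c8,
// method_offset == 0 and vtable_index == 3, the dispatch collapses to a
// single load,
//   ldr method_result, [recv_klass, #0x1c8 + 3*8]
// i.e. one ldr from klass + vtable_start + index * wordSize.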
void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface. Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);
  cmp(super_klass, rscratch1); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    cmp(super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// scans count pointer sized words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

// scans count 4 byte words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
                                 Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldrw(scratch, post(addr, wordSize));
  cmpw(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}
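
// Behavioral sketch of the two scanners above, written as C. The result is
// communicated via the flags: the Z flag is left set when the value was
// found (the cmp that matched), and clear when the count ran out:
//
//   while (count != 0) {
//     if (*addr++ == value) break;   // leaves Z set
//     count--;                       // last cmp was NE, Z stays clear
//   }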
void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  BLOCK_COMMENT("check_klass_subtype_slow_path");

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
  assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)

  // Get super_klass value into r0 (even if it was in r5 or r2).
  RegSet pushed_registers;
  if (!IS_A_TEMP(r2))    pushed_registers += r2;
  if (!IS_A_TEMP(r5))    pushed_registers += r5;

  if (super_klass != r0 || UseCompressedOops) {
    if (!IS_A_TEMP(r0))   pushed_registers += r0;
  }

  push(pushed_registers, sp);

#ifndef PRODUCT
  mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
  Address pst_counter_addr(rscratch2);
  ldr(rscratch1, pst_counter_addr);
  add(rscratch1, rscratch1, 1);
  str(rscratch1, pst_counter_addr);
#endif //PRODUCT

  // We will consult the secondary-super array.
  ldr(r5, secondary_supers_addr);
  // Load the array length.
  ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  add(r5, r5, Array<Klass*>::base_offset_in_bytes());

  cmp(sp, zr); // Clear Z flag; SP is never zero
  // Scan R2 words at [R5] for an occurrence of R0.
  // Set NZ/Z based on last compare.
  repne_scan(r5, r0, r2, rscratch1);

  // Unspill the temp. registers:
  pop(pushed_registers, sp);

  br(Assembler::NE, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  str(super_klass, super_cache_addr);

  if (L_success != &L_fallthrough) {
    b(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}


void MacroAssembler::verify_oop(Register reg, const char* s) {
  if (!VerifyOops) return;

  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s", reg->name(), s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  mov(r0, reg);
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop");
}

void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops) return;

  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s", s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop_addr {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  // addr may contain sp so we will have to adjust it based on the
  // pushes that we just did.
  if (addr.uses(sp)) {
    lea(r0, addr);
    ldr(r0, Address(r0, 4 * wordSize));
  } else {
    ldr(r0, addr);
  }
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop_addr");
}
Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
                                         int extra_slot_offset) {
  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
  int stackElementSize = Interpreter::stackElementSize;
  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
#ifdef ASSERT
  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
  assert(offset1 - offset == stackElementSize, "correct arithmetic");
#endif
  if (arg_slot.is_constant()) {
    return Address(esp, arg_slot.as_constant() * stackElementSize
                   + offset);
  } else {
    add(rscratch1, esp, arg_slot.as_register(),
        ext::uxtx, exact_log2(stackElementSize));
    return Address(rscratch1, offset);
  }
}

void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments,
                                       Label *retaddr) {
  call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr);
}

void MacroAssembler::call_VM_leaf_base1(address entry_point,
                                        int number_of_gp_arguments,
                                        int number_of_fp_arguments,
                                        ret_type type,
                                        Label *retaddr) {
  Label E, L;

  stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));

  // We add 1 to number_of_arguments because the thread in arg0 is
  // not counted
  mov(rscratch1, entry_point);
  blrt(rscratch1, number_of_gp_arguments + 1, number_of_fp_arguments, type);
  if (retaddr)
    bind(*retaddr);

  ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
  maybe_isb();
}

void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  call_VM_leaf_base(entry_point, number_of_arguments);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
                                  Register arg_1, Register arg_2) {
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  pass_arg2(this, arg_2);
  call_VM_leaf_base(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
  assert(arg_0 != c_rarg3, "smashed arg");
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 4);
}
void MacroAssembler::null_check(Register reg, int offset) {
  if (needs_explicit_null_check(offset)) {
    // provoke OS NULL exception if reg = NULL by
    // accessing M[reg] w/o changing any registers
    // NOTE: this is plenty to provoke a segv
    ldr(zr, Address(reg));
  } else {
    // nothing to do, (later) access of M[reg + offset]
    // will provoke OS NULL exception if reg = NULL
  }
}

// MacroAssembler protected routines needed to implement
// public methods

void MacroAssembler::mov(Register r, Address dest) {
  code_section()->relocate(pc(), dest.rspec());
  u_int64_t imm64 = (u_int64_t)dest.target();
  movptr(r, imm64);
}

// Move a constant pointer into r.  In AArch64 mode the virtual
// address space is 48 bits in size, so we only need three
// instructions to create a patchable instruction sequence that can
// reach anywhere.
void MacroAssembler::movptr(Register r, uintptr_t imm64) {
#ifndef PRODUCT
  {
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
    block_comment(buffer);
  }
#endif
  assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
  movz(r, imm64 & 0xffff);
  imm64 >>= 16;
  movk(r, imm64 & 0xffff, 16);
  imm64 >>= 16;
  movk(r, imm64 & 0xffff, 32);
}

// Macro to mov replicated immediate to vector register.
//  Vd will get the following values for different arrangements in T
//   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
//   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
//   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
//   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
//   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
//   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
//   T1D/T2D: invalid
void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
  assert(T != T1D && T != T2D, "invalid arrangement");
  if (T == T8B || T == T16B) {
    assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
    movi(Vd, T, imm32 & 0xff, 0);
    return;
  }
  u_int32_t nimm32 = ~imm32;
  if (T == T4H || T == T8H) {
    assert((imm32 & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
    imm32 &= 0xffff;
    nimm32 &= 0xffff;
  }
  u_int32_t x = imm32;
  int movi_cnt = 0;
  int movn_cnt = 0;
  while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
  x = nimm32;
  while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
  if (movn_cnt < movi_cnt) imm32 = nimm32;
  unsigned lsl = 0;
  while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
  if (movn_cnt < movi_cnt)
    mvni(Vd, T, imm32 & 0xff, lsl);
  else
    movi(Vd, T, imm32 & 0xff, lsl);
  imm32 >>= 8; lsl += 8;
  while (imm32) {
    while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
    if (movn_cnt < movi_cnt)
      bici(Vd, T, imm32 & 0xff, lsl);
    else
      orri(Vd, T, imm32 & 0xff, lsl);
    lsl += 8; imm32 >>= 8;
  }
}
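
// Example trace of the replicated-immediate routine above (the value is
// hypothetical): for T8H and imm32 == 0x1234, both bytes of 0x1234 and
// both bytes of ~0x1234 (0xedcb) are non-zero, so movi_cnt == movn_cnt
// and the movi path is taken:
//   movi Vd.8h, #0x34
//   orri Vd.8h, #0x12, lsl #8
// which replicates 0x1234 into every halfword of Vd.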
void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
{
#ifndef PRODUCT
  {
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
    block_comment(buffer);
  }
#endif
  if (operand_valid_for_logical_immediate(false, imm64)) {
    orr(dst, zr, imm64);
  } else {
    // we can use a combination of MOVZ or MOVN with
    // MOVK to build up the constant
    u_int64_t imm_h[4];
    int zero_count = 0;
    int neg_count = 0;
    int i;
    for (i = 0; i < 4; i++) {
      imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
      if (imm_h[i] == 0) {
        zero_count++;
      } else if (imm_h[i] == 0xffffL) {
        neg_count++;
      }
    }
    if (zero_count == 4) {
      // one MOVZ will do
      movz(dst, 0);
    } else if (neg_count == 4) {
      // one MOVN will do
      movn(dst, 0);
    } else if (zero_count == 3) {
      // one MOVZ will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0L) {
          movz(dst, (u_int32_t)imm_h[i], (i << 4));
          break;
        }
      }
    } else if (neg_count == 3) {
      // one MOVN will do
      for (int i = 0; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
          break;
        }
      }
    } else if (zero_count == 2) {
      // one MOVZ and one MOVK will do
      for (i = 0; i < 3; i++) {
        if (imm_h[i] != 0L) {
          movz(dst, (u_int32_t)imm_h[i], (i << 4));
          i++;
          break;
        }
      }
      for (; i < 4; i++) {
        if (imm_h[i] != 0L) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else if (neg_count == 2) {
      // one MOVN and one MOVK will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
          i++;
          break;
        }
      }
      for (; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else if (zero_count == 1) {
      // one MOVZ and two MOVKs will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0L) {
          movz(dst, (u_int32_t)imm_h[i], (i << 4));
          i++;
          break;
        }
      }
      for (; i < 4; i++) {
        if (imm_h[i] != 0x0L) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else if (neg_count == 1) {
      // one MOVN and two MOVKs will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
          i++;
          break;
        }
      }
      for (; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else {
      // use a MOVZ and 3 MOVKs (makes it easier to debug)
      movz(dst, (u_int32_t)imm_h[0], 0);
      for (i = 1; i < 4; i++) {
        movk(dst, (u_int32_t)imm_h[i], (i << 4));
      }
    }
  }
}

void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
{
#ifndef PRODUCT
  {
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
    block_comment(buffer);
  }
#endif
  if (operand_valid_for_logical_immediate(true, imm32)) {
    orrw(dst, zr, imm32);
  } else {
    // we can use MOVZ, MOVN or two calls to MOVK to build up the
    // constant
    u_int32_t imm_h[2];
    imm_h[0] = imm32 & 0xffff;
    imm_h[1] = ((imm32 >> 16) & 0xffff);
    if (imm_h[0] == 0) {
      movzw(dst, imm_h[1], 16);
    } else if (imm_h[0] == 0xffff) {
      movnw(dst, imm_h[1] ^ 0xffff, 16);
    } else if (imm_h[1] == 0) {
      movzw(dst, imm_h[0], 0);
    } else if (imm_h[1] == 0xffff) {
      movnw(dst, imm_h[0] ^ 0xffff, 0);
    } else {
      // use a MOVZ and MOVK (makes it easier to debug)
      movzw(dst, imm_h[0], 0);
      movkw(dst, imm_h[1], 16);
    }
  }
}
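
// Path-selection examples for mov_immediate64 above (the constants are
// invented for illustration):
//   0x0000_0000_dead_0000 : halfwords {0, 0xdead, 0, 0} -> zero_count == 3,
//     emitted as a single "movz dst, #0xdead, lsl #16".
//   0xffff_ffff_1234_ffff : halfwords {0xffff, 0x1234, 0xffff, 0xffff}
//     -> neg_count == 3, emitted as "movn dst, #(0x1234 ^ 0xffff), lsl #16",
//     since MOVN writes the bitwise NOT of its shifted immediate.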

// Form an address from base + offset in Rd.  Rd may or may not
// actually be used: you must use the Address that is returned.  It is
// up to you to ensure that the shift provided matches the size of
// your data.
Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
  if (Address::offset_ok_for_immed(byte_offset, shift))
    // It fits; no need for any heroics
    return Address(base, byte_offset);

  // Don't do anything clever with negative or misaligned offsets
  unsigned mask = (1 << shift) - 1;
  if (byte_offset < 0 || byte_offset & mask) {
    mov(Rd, byte_offset);
    add(Rd, base, Rd);
    return Address(Rd);
  }

  // See if we can do this with two 12-bit offsets
  {
    unsigned long word_offset = byte_offset >> shift;
    unsigned long masked_offset = word_offset & 0xfff000;
    if (Address::offset_ok_for_immed(word_offset - masked_offset)
        && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
      add(Rd, base, masked_offset << shift);
      word_offset -= masked_offset;
      return Address(Rd, word_offset << shift);
    }
  }

  // Do it the hard way
  mov(Rd, byte_offset);
  add(Rd, base, Rd);
  return Address(Rd);
}
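
// Illustrative sketch (assumed operands, not from the original code):
// form_address(r8, r9, 0x40008, 3) cannot encode 0x40008 as one scaled
// 12-bit offset, but word_offset == 0x8001 splits cleanly into
//
//   add(r8, r9, 0x40000);     // masked_offset 0x8000, shifted back
//   return Address(r8, 8);    // residue fits the load/store form
//
// so the access costs a single extra ADD instead of a full
// mov-immediate sequence.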

void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
  if (UseLSE) {
    mov(tmp, 1);
    ldadd(Assembler::word, tmp, zr, counter_addr);
    return;
  }
  Label retry_load;
  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
    prfm(Address(counter_addr), PSTL1STRM);
  bind(retry_load);
  // flush and load exclusive from the memory location
  ldxrw(tmp, counter_addr);
  addw(tmp, tmp, 1);
  // if we store+flush with no intervening write tmp2 will be zero
  stxrw(tmp2, tmp, counter_addr);
  cbnzw(tmp2, retry_load);
}


int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
                                    bool want_remainder, Register scratch)
{
  // Full implementation of Java idiv and irem.  The function
  // returns the (pc) offset of the div instruction - may be needed
  // for implicit exceptions.
  //
  // constraint : ra/rb =/= scratch
  //         normal case
  //
  // input : ra: dividend
  //         rb: divisor
  //
  // result: either
  //         quotient  (= ra idiv rb)
  //         remainder (= ra irem rb)

  assert(ra != scratch && rb != scratch, "reg cannot be scratch");

  int idivl_offset = offset();
  if (! want_remainder) {
    sdivw(result, ra, rb);
  } else {
    sdivw(scratch, ra, rb);
    Assembler::msubw(result, scratch, rb, ra);
  }

  return idivl_offset;
}

int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
                                    bool want_remainder, Register scratch)
{
  // Full implementation of Java ldiv and lrem.  The function
  // returns the (pc) offset of the div instruction - may be needed
  // for implicit exceptions.
  //
  // constraint : ra/rb =/= scratch
  //         normal case
  //
  // input : ra: dividend
  //         rb: divisor
  //
  // result: either
  //         quotient  (= ra idiv rb)
  //         remainder (= ra irem rb)

  assert(ra != scratch && rb != scratch, "reg cannot be scratch");

  int idivq_offset = offset();
  if (! want_remainder) {
    sdiv(result, ra, rb);
  } else {
    sdiv(scratch, ra, rb);
    Assembler::msub(result, scratch, rb, ra);
  }

  return idivq_offset;
}
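
// Note on semantics (informal): AArch64 SDIV already truncates toward
// zero and quietly yields MIN_VALUE for MIN_VALUE / -1, which is what
// the JLS requires, so unlike x86 (where IDIV traps on that operand
// pair) no explicit overflow check is needed.  For an assumed remainder
// request with ra = -7, rb = 2:
//
//   sdivw(scratch, ra, rb);          // scratch = -3  (truncated)
//   msubw(result, scratch, rb, ra);  // result  = -7 - (-3 * 2) = -1
//
// matching Java's -7 % 2 == -1.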

void MacroAssembler::membar(Membar_mask_bits order_constraint) {
  address prev = pc() - NativeMembar::instruction_size;
  if (prev == code()->last_membar()) {
    NativeMembar *bar = NativeMembar_at(prev);
    // We are merging two memory barrier instructions.  On AArch64 we
    // can do this simply by ORing them together.
    bar->set_kind(bar->get_kind() | order_constraint);
    BLOCK_COMMENT("merged membar");
  } else {
    code()->set_last_membar(pc());
    dmb(Assembler::barrier(order_constraint));
  }
}
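
// Merge example (informal): for back-to-back calls such as
// membar(StoreStore) immediately followed by membar(LoadStore), the
// second call sees that the previous instruction is still the
// last-emitted DMB, ORs the new constraint mask into that
// instruction's kind field, and emits nothing -- one suitably strong
// barrier stands in for what would otherwise be two consecutive DMBs.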

// MacroAssembler routines found actually to be needed

void MacroAssembler::push(Register src)
{
  str(src, Address(pre(esp, -1 * wordSize)));
}

void MacroAssembler::pop(Register dst)
{
  ldr(dst, Address(post(esp, 1 * wordSize)));
}

// Note: load_unsigned_short used to be called load_unsigned_word.
int MacroAssembler::load_unsigned_short(Register dst, Address src) {
  int off = offset();
  ldrh(dst, src);
  return off;
}

int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
  int off = offset();
  ldrb(dst, src);
  return off;
}

int MacroAssembler::load_signed_short(Register dst, Address src) {
  int off = offset();
  ldrsh(dst, src);
  return off;
}

int MacroAssembler::load_signed_byte(Register dst, Address src) {
  int off = offset();
  ldrsb(dst, src);
  return off;
}

int MacroAssembler::load_signed_short32(Register dst, Address src) {
  int off = offset();
  ldrshw(dst, src);
  return off;
}

int MacroAssembler::load_signed_byte32(Register dst, Address src) {
  int off = offset();
  ldrsbw(dst, src);
  return off;
}

void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
  switch (size_in_bytes) {
  case 8:  ldr(dst, src); break;
  case 4:  ldrw(dst, src); break;
  case 2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
  case 1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
  default: ShouldNotReachHere();
  }
}

void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
  switch (size_in_bytes) {
  case 8:  str(src, dst); break;
  case 4:  strw(src, dst); break;
  case 2:  strh(src, dst); break;
  case 1:  strb(src, dst); break;
  default: ShouldNotReachHere();
  }
}

void MacroAssembler::decrementw(Register reg, int value)
{
  if (value < 0)  { incrementw(reg, -value);      return; }
  if (value == 0) {                               return; }
  if (value < (1 << 12)) { subw(reg, reg, value); return; }
  /* else */ {
    guarantee(reg != rscratch2, "invalid dst for register decrement");
    movw(rscratch2, (unsigned)value);
    subw(reg, reg, rscratch2);
  }
}

void MacroAssembler::decrement(Register reg, int value)
{
  if (value < 0)  { increment(reg, -value);      return; }
  if (value == 0) {                              return; }
  if (value < (1 << 12)) { sub(reg, reg, value); return; }
  /* else */ {
    assert(reg != rscratch2, "invalid dst for register decrement");
    mov(rscratch2, (unsigned long)value);
    sub(reg, reg, rscratch2);
  }
}

void MacroAssembler::decrementw(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid dst for address decrement");
  ldrw(rscratch1, dst);
  decrementw(rscratch1, value);
  strw(rscratch1, dst);
}

void MacroAssembler::decrement(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid address for decrement");
  ldr(rscratch1, dst);
  decrement(rscratch1, value);
  str(rscratch1, dst);
}

void MacroAssembler::incrementw(Register reg, int value)
{
  if (value < 0)  { decrementw(reg, -value);      return; }
  if (value == 0) {                               return; }
  if (value < (1 << 12)) { addw(reg, reg, value); return; }
  /* else */ {
    assert(reg != rscratch2, "invalid dst for register increment");
    movw(rscratch2, (unsigned)value);
    addw(reg, reg, rscratch2);
  }
}

void MacroAssembler::increment(Register reg, int value)
{
  if (value < 0)  { decrement(reg, -value);      return; }
  if (value == 0) {                              return; }
  if (value < (1 << 12)) { add(reg, reg, value); return; }
  /* else */ {
    assert(reg != rscratch2, "invalid dst for register increment");
    movw(rscratch2, (unsigned)value);
    add(reg, reg, rscratch2);
  }
}

void MacroAssembler::incrementw(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid dst for address increment");
  ldrw(rscratch1, dst);
  incrementw(rscratch1, value);
  strw(rscratch1, dst);
}

void MacroAssembler::increment(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid dst for address increment");
  ldr(rscratch1, dst);
  increment(rscratch1, value);
  str(rscratch1, dst);
}


void MacroAssembler::pusha() {
  push(0x7fffffff, sp);
}

void MacroAssembler::popa() {
  pop(0x7fffffff, sp);
}

// Push lots of registers in the bit set supplied.  Don't push sp.
// Return the number of words pushed
int MacroAssembler::push(unsigned int bitset, Register stack) {
  int words_pushed = 0;

  // Scan bitset to accumulate register pairs
  unsigned char regs[32];
  int count = 0;
  for (int reg = 0; reg <= 30; reg++) {
    if (1 & bitset)
      regs[count++] = reg;
    bitset >>= 1;
  }
  regs[count++] = zr->encoding_nocheck();
  count &= ~1;  // Only push an even number of regs

  if (count) {
    stp(as_Register(regs[0]), as_Register(regs[1]),
        Address(pre(stack, -count * wordSize)));
    words_pushed += 2;
  }
  for (int i = 2; i < count; i += 2) {
    stp(as_Register(regs[i]), as_Register(regs[i+1]),
        Address(stack, i * wordSize));
    words_pushed += 2;
  }

  assert(words_pushed == count, "oops, pushed != count");

  return count;
}

int MacroAssembler::pop(unsigned int bitset, Register stack) {
  int words_pushed = 0;

  // Scan bitset to accumulate register pairs
  unsigned char regs[32];
  int count = 0;
  for (int reg = 0; reg <= 30; reg++) {
    if (1 & bitset)
      regs[count++] = reg;
    bitset >>= 1;
  }
  regs[count++] = zr->encoding_nocheck();
  count &= ~1;

  for (int i = 2; i < count; i += 2) {
    ldp(as_Register(regs[i]), as_Register(regs[i+1]),
        Address(stack, i * wordSize));
    words_pushed += 2;
  }
  if (count) {
    ldp(as_Register(regs[0]), as_Register(regs[1]),
        Address(post(stack, count * wordSize)));
    words_pushed += 2;
  }

  assert(words_pushed == count, "oops, pushed != count");

  return count;
}
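
// Packing example (hypothetical bitset): push(0b1011, sp) collects
// regs[] = { 0, 1, 3 } for r0, r1 and r3.  count is odd, so zr is
// appended as a dummy to complete the pair and count becomes 4:
//
//   stp(r0, r1, Address(pre(sp, -4 * wordSize)));  // also drops sp
//   stp(r3, zr, Address(sp, 2 * wordSize));
//
// The matching pop(0b1011, sp) reloads the same slots and harmlessly
// "restores" the dummy into zr.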

#ifdef ASSERT
void MacroAssembler::verify_heapbase(const char* msg) {
#if 0
  assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  if (CheckCompressedOops) {
    Label ok;
    push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
    cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
    br(Assembler::EQ, ok);
    stop(msg);
    bind(ok);
    pop(1 << rscratch1->encoding(), sp);
  }
#endif
}
#endif

void MacroAssembler::stop(const char* msg) {
  address ip = pc();
  pusha();
  mov(c_rarg0, (address)msg);
  mov(c_rarg1, (address)ip);
  mov(c_rarg2, sp);
  mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
  // call(c_rarg3);
  blrt(c_rarg3, 3, 0, 1);
  hlt(0);
}

// If a constant does not fit in an immediate field, generate some
// number of MOV instructions and then perform the operation.
void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
                                           add_sub_imm_insn insn1,
                                           add_sub_reg_insn insn2) {
  assert(Rd != zr, "Rd = zr and not setting flags?");
  if (operand_valid_for_add_sub_immediate((int)imm)) {
    (this->*insn1)(Rd, Rn, imm);
  } else {
    if (uabs(imm) < (1 << 24)) {
      (this->*insn1)(Rd, Rn, imm & -(1 << 12));
      (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
    } else {
      assert_different_registers(Rd, Rn);
      mov(Rd, (uint64_t)imm);
      (this->*insn2)(Rd, Rn, Rd, LSL, 0);
    }
  }
}

// Separate version which sets the flags.  Optimisations are more
// restricted because we must set the flags correctly.
void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
                                             add_sub_imm_insn insn1,
                                             add_sub_reg_insn insn2) {
  if (operand_valid_for_add_sub_immediate((int)imm)) {
    (this->*insn1)(Rd, Rn, imm);
  } else {
    assert_different_registers(Rd, Rn);
    assert(Rd != zr, "overflow in immediate operand");
    mov(Rd, (uint64_t)imm);
    (this->*insn2)(Rd, Rn, Rd, LSL, 0);
  }
}
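
// Decomposition example (assumed operand): imm = 0x123456 is too wide
// for one 12-bit (optionally LSL #12) immediate but is below 2^24, so
// wrap_add_sub_imm_insn splits the operation into two legal immediates:
//
//   add(Rd, Rn, 0x123000);   // imm & -(1 << 12), the shifted-12 form
//   add(Rd, Rd, 0x456);      // imm & 0xfff
//
// Anything wider falls back to a full mov-immediate into Rd plus a
// register-register add, which is why Rd and Rn must then differ.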

void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
  if (increment.is_register()) {
    add(Rd, Rn, increment.as_register());
  } else {
    add(Rd, Rn, increment.as_constant());
  }
}

void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
  if (increment.is_register()) {
    addw(Rd, Rn, increment.as_register());
  } else {
    addw(Rd, Rn, increment.as_constant());
  }
}

void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
  if (decrement.is_register()) {
    sub(Rd, Rn, decrement.as_register());
  } else {
    sub(Rd, Rn, decrement.as_constant());
  }
}

void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
  if (decrement.is_register()) {
    subw(Rd, Rn, decrement.as_register());
  } else {
    subw(Rd, Rn, decrement.as_constant());
  }
}

void MacroAssembler::reinit_heapbase()
{
  if (UseCompressedOops) {
    if (Universe::is_fully_initialized()) {
      mov(rheapbase, Universe::narrow_ptrs_base());
    } else {
      lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
      ldr(rheapbase, Address(rheapbase));
    }
  }
}

// this simulates the behaviour of the x86 cmpxchg instruction using a
// load linked/store conditional pair. we use the acquire/release
// versions of these instructions so that we flush pending writes as
// per Java semantics.

// n.b. the x86 version assumes the old value to be compared against is
// in rax and updates rax with the value located in memory if the
// cmpxchg fails. we supply a register for the old value explicitly

// the aarch64 load linked/store conditional instructions do not
// accept an offset. so, unlike x86, we must provide a plain register
// to identify the memory word to be compared/exchanged rather than a
// register+offset Address.

void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
                                Label &succeed, Label *fail) {
  // oldv holds comparison value
  // newv holds value to write in exchange
  // addr identifies memory word to compare against/update
  if (UseLSE) {
    mov(tmp, oldv);
    casal(Assembler::xword, oldv, newv, addr);
    cmp(tmp, oldv);
    br(Assembler::EQ, succeed);
    membar(AnyAny);
  } else {
    Label retry_load, nope;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    // flush and load exclusive from the memory location
    // and fail if it is not what we expect
    ldaxr(tmp, addr);
    cmp(tmp, oldv);
    br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
    stlxr(tmp, newv, addr);
    cbzw(tmp, succeed);
    // retry so we only ever return after a load fails to compare
    // ensures we don't return a stale value after a failed write.
    b(retry_load);
    // if the memory word differs we return it in oldv and signal a fail
    bind(nope);
    membar(AnyAny);
    mov(oldv, tmp);
  }
  if (fail)
    b(*fail);
}
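
// Flow sketch (informal): on an LL/SC core a successful
// cmpxchgptr(oldv, newv, addr, tmp, succeed, &fail) boils down to
//
//   ldaxr  tmp, [addr]        // acquire-load the current word
//   cmp    tmp, oldv
//   b.ne   nope               // mismatch -> report old value via oldv
//   stlxr  tmp, newv, [addr]  // release-store; tmp == 0 on success
//   cbzw   tmp, succeed
//
// whereas with UseLSE the whole exchange collapses into a single CASAL
// plus one comparison to pick the label.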

void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
                              Label &succeed, Label *fail) {
  // oldv holds comparison value
  // newv holds value to write in exchange
  // addr identifies memory word to compare against/update
  // tmp returns 0/1 for success/failure
  if (UseLSE) {
    mov(tmp, oldv);
    casal(Assembler::word, oldv, newv, addr);
    cmp(tmp, oldv);
    br(Assembler::EQ, succeed);
    membar(AnyAny);
  } else {
    Label retry_load, nope;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    // flush and load exclusive from the memory location
    // and fail if it is not what we expect
    ldaxrw(tmp, addr);
    cmp(tmp, oldv);
    br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
    stlxrw(tmp, newv, addr);
    cbzw(tmp, succeed);
    // retry so we only ever return after a load fails to compare
    // ensures we don't return a stale value after a failed write.
    b(retry_load);
    // if the memory word differs we return it in oldv and signal a fail
    bind(nope);
    membar(AnyAny);
    mov(oldv, tmp);
  }
  if (fail)
    b(*fail);
}

// A generic CAS; success or failure is in the EQ flag.  A weak CAS
// doesn't retry and may fail spuriously.  If the oldval is wanted,
// pass a register for the result, otherwise pass noreg.

// Clobbers rscratch1
void MacroAssembler::cmpxchg(Register addr, Register expected,
                             Register new_val,
                             enum operand_size size,
                             bool acquire, bool release,
                             bool weak,
                             Register result) {
  if (result == noreg)  result = rscratch1;
  if (UseLSE) {
    mov(result, expected);
    lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
    cmp(result, expected);
  } else {
    BLOCK_COMMENT("cmpxchg {");
    Label retry_load, done;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    load_exclusive(result, addr, size, acquire);
    if (size == xword)
      cmp(result, expected);
    else
      cmpw(result, expected);
    br(Assembler::NE, done);
    store_exclusive(rscratch1, new_val, addr, size, release);
    if (weak) {
      cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
    } else {
      cbnzw(rscratch1, retry_load);
    }
    bind(done);
    BLOCK_COMMENT("} cmpxchg");
  }
}

static bool different(Register a, RegisterOrConstant b, Register c) {
  if (b.is_constant())
    return a != c;
  else
    return a != b.as_register() && a != c && b.as_register() != c;
}

#define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
  if (UseLSE) {                                                         \
    prev = prev->is_valid() ? prev : zr;                                \
    if (incr.is_register()) {                                           \
      AOP(sz, incr.as_register(), prev, addr);                          \
    } else {                                                            \
      mov(rscratch2, incr.as_constant());                               \
      AOP(sz, rscratch2, prev, addr);                                   \
    }                                                                   \
    return;                                                             \
  }                                                                     \
  Register result = rscratch2;                                          \
  if (prev->is_valid())                                                 \
    result = different(prev, incr, addr) ? prev : rscratch2;            \
                                                                        \
  Label retry_load;                                                     \
  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
    prfm(Address(addr), PSTL1STRM);                                     \
  bind(retry_load);                                                     \
  LDXR(result, addr);                                                   \
  OP(rscratch1, result, incr);                                          \
  STXR(rscratch2, rscratch1, addr);                                     \
  cbnzw(rscratch2, retry_load);                                         \
  if (prev->is_valid() && prev != result) {                             \
    IOP(prev, rscratch1, incr);                                         \
  }                                                                     \
}

ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)

#undef ATOMIC_OP
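
// Expansion sketch: ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, ...)
// defines atomic_add(prev, incr, addr), whose LL/SC path is
//
//   retry: ldxr   result, [addr]
//          add    rscratch1, result, incr
//          stxr   rscratch2, rscratch1, [addr]
//          cbnzw  rscratch2, retry
//
// The IOP (sub) tail only runs when the loaded value could not go
// straight into prev (because prev overlaps incr or addr); it then
// recovers the old value as new_value - incr rather than keeping an
// extra register live across the loop.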
2306 tty->print_cr(" r9 = 0x%016lx", regs[9]); 2307 tty->print_cr("r10 = 0x%016lx", regs[10]); 2308 tty->print_cr("r11 = 0x%016lx", regs[11]); 2309 tty->print_cr("r12 = 0x%016lx", regs[12]); 2310 tty->print_cr("r13 = 0x%016lx", regs[13]); 2311 tty->print_cr("r14 = 0x%016lx", regs[14]); 2312 tty->print_cr("r15 = 0x%016lx", regs[15]); 2313 tty->print_cr("r16 = 0x%016lx", regs[16]); 2314 tty->print_cr("r17 = 0x%016lx", regs[17]); 2315 tty->print_cr("r18 = 0x%016lx", regs[18]); 2316 tty->print_cr("r19 = 0x%016lx", regs[19]); 2317 tty->print_cr("r20 = 0x%016lx", regs[20]); 2318 tty->print_cr("r21 = 0x%016lx", regs[21]); 2319 tty->print_cr("r22 = 0x%016lx", regs[22]); 2320 tty->print_cr("r23 = 0x%016lx", regs[23]); 2321 tty->print_cr("r24 = 0x%016lx", regs[24]); 2322 tty->print_cr("r25 = 0x%016lx", regs[25]); 2323 tty->print_cr("r26 = 0x%016lx", regs[26]); 2324 tty->print_cr("r27 = 0x%016lx", regs[27]); 2325 tty->print_cr("r28 = 0x%016lx", regs[28]); 2326 tty->print_cr("r30 = 0x%016lx", regs[30]); 2327 tty->print_cr("r31 = 0x%016lx", regs[31]); 2328 BREAKPOINT; 2329 } 2330 ThreadStateTransition::transition(thread, _thread_in_vm, saved_state); 2331 } else { 2332 ttyLocker ttyl; 2333 ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", 2334 msg); 2335 assert(false, "DEBUG MESSAGE: %s", msg); 2336 } 2337 } 2338 2339 #ifdef BUILTIN_SIM 2340 // routine to generate an x86 prolog for a stub function which 2341 // bootstraps into the generated ARM code which directly follows the 2342 // stub 2343 // 2344 // the argument encodes the number of general and fp registers 2345 // passed by the caller and the callng convention (currently just 2346 // the number of general registers and assumes C argument passing) 2347 2348 extern "C" { 2349 int aarch64_stub_prolog_size(); 2350 void aarch64_stub_prolog(); 2351 void aarch64_prolog(); 2352 } 2353 2354 void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type, 2355 address *prolog_ptr) 2356 { 2357 int calltype = (((ret_type & 0x3) << 8) | 2358 ((fp_arg_count & 0xf) << 4) | 2359 (gp_arg_count & 0xf)); 2360 2361 // the addresses for the x86 to ARM entry code we need to use 2362 address start = pc(); 2363 // printf("start = %lx\n", start); 2364 int byteCount = aarch64_stub_prolog_size(); 2365 // printf("byteCount = %x\n", byteCount); 2366 int instructionCount = (byteCount + 3)/ 4; 2367 // printf("instructionCount = %x\n", instructionCount); 2368 for (int i = 0; i < instructionCount; i++) { 2369 nop(); 2370 } 2371 2372 memcpy(start, (void*)aarch64_stub_prolog, byteCount); 2373 2374 // write the address of the setup routine and the call format at the 2375 // end of into the copied code 2376 u_int64_t *patch_end = (u_int64_t *)(start + byteCount); 2377 if (prolog_ptr) 2378 patch_end[-2] = (u_int64_t)prolog_ptr; 2379 patch_end[-1] = calltype; 2380 } 2381 #endif 2382 2383 void MacroAssembler::push_call_clobbered_registers() { 2384 push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp); 2385 2386 // Push v0-v7, v16-v31. 

#ifdef BUILTIN_SIM
// routine to generate an x86 prolog for a stub function which
// bootstraps into the generated ARM code which directly follows the
// stub
//
// the argument encodes the number of general and fp registers
// passed by the caller and the calling convention (currently just
// the number of general registers and assumes C argument passing)

extern "C" {
int aarch64_stub_prolog_size();
void aarch64_stub_prolog();
void aarch64_prolog();
}

void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type,
                                   address *prolog_ptr)
{
  int calltype = (((ret_type & 0x3) << 8) |
                  ((fp_arg_count & 0xf) << 4) |
                  (gp_arg_count & 0xf));

  // the addresses for the x86 to ARM entry code we need to use
  address start = pc();
  // printf("start = %lx\n", start);
  int byteCount = aarch64_stub_prolog_size();
  // printf("byteCount = %x\n", byteCount);
  int instructionCount = (byteCount + 3) / 4;
  // printf("instructionCount = %x\n", instructionCount);
  for (int i = 0; i < instructionCount; i++) {
    nop();
  }

  memcpy(start, (void*)aarch64_stub_prolog, byteCount);

  // write the address of the setup routine and the call format at the
  // end of the copied code
  u_int64_t *patch_end = (u_int64_t *)(start + byteCount);
  if (prolog_ptr)
    patch_end[-2] = (u_int64_t)prolog_ptr;
  patch_end[-1] = calltype;
}
#endif

void MacroAssembler::push_call_clobbered_registers() {
  push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);

  // Push v0-v7, v16-v31.
  for (int i = 30; i >= 0; i -= 2) {
    if (i <= v7->encoding() || i >= v16->encoding()) {
      stpd(as_FloatRegister(i), as_FloatRegister(i+1),
           Address(pre(sp, -2 * wordSize)));
    }
  }
}

void MacroAssembler::pop_call_clobbered_registers() {

  for (int i = 0; i < 32; i += 2) {
    if (i <= v7->encoding() || i >= v16->encoding()) {
      ldpd(as_FloatRegister(i), as_FloatRegister(i+1),
           Address(post(sp, 2 * wordSize)));
    }
  }

  pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
}

void MacroAssembler::push_CPU_state(bool save_vectors) {
  push(0x3fffffff, sp);         // integer registers except lr & sp

  if (!save_vectors) {
    for (int i = 30; i >= 0; i -= 2)
      stpd(as_FloatRegister(i), as_FloatRegister(i+1),
           Address(pre(sp, -2 * wordSize)));
  } else {
    for (int i = 30; i >= 0; i -= 2)
      stpq(as_FloatRegister(i), as_FloatRegister(i+1),
           Address(pre(sp, -4 * wordSize)));
  }
}

void MacroAssembler::pop_CPU_state(bool restore_vectors) {
  if (!restore_vectors) {
    for (int i = 0; i < 32; i += 2)
      ldpd(as_FloatRegister(i), as_FloatRegister(i+1),
           Address(post(sp, 2 * wordSize)));
  } else {
    for (int i = 0; i < 32; i += 2)
      ldpq(as_FloatRegister(i), as_FloatRegister(i+1),
           Address(post(sp, 4 * wordSize)));
  }

  pop(0x3fffffff, sp);          // integer registers except lr & sp
}

/**
 * Helpers for multiply_to_len().
 */
void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
                                     Register src1, Register src2) {
  adds(dest_lo, dest_lo, src1);
  adc(dest_hi, dest_hi, zr);
  adds(dest_lo, dest_lo, src2);
  adc(final_dest_hi, dest_hi, zr);
}

// Generate an address from (r + r1 extend offset).  "size" is the
// size of the operand.  The result may be in rscratch2.
Address MacroAssembler::offsetted_address(Register r, Register r1,
                                          Address::extend ext, int offset, int size) {
  if (offset || (ext.shift() % size != 0)) {
    lea(rscratch2, Address(r, r1, ext));
    return Address(rscratch2, offset);
  } else {
    return Address(r, r1, ext);
  }
}

Address MacroAssembler::spill_address(int size, int offset, Register tmp)
{
  assert(offset >= 0, "spill to negative address?");
  // Offset reachable ?
  //   Not aligned - 9 bits signed offset
  //   Aligned - 12 bits unsigned offset shifted
  Register base = sp;
  if ((offset & (size-1)) && offset >= (1<<8)) {
    add(tmp, base, offset & ((1<<12)-1));
    base = tmp;
    offset &= -1<<12;
  }

  if (offset >= (1<<12) * size) {
    add(tmp, base, offset & (((1<<12)-1)<<12));
    base = tmp;
    offset &= ~(((1<<12)-1)<<12);
  }

  return Address(base, offset);
}
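
// Reachability example (assumed spill slot): an 8-byte spill at
// offset 0x21008 exceeds the scaled 12-bit range (4095 * 8), so
// spill_address peels off the high chunk:
//
//   add(tmp, sp, 0x21000);     // 0x21008 & (0xfff << 12)
//   return Address(tmp, 8);    // residue encodes directly
//
// Misaligned offsets peel the low 12 bits first instead, because an
// unaligned slot can only use the 9-bit signed unscaled form.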

/**
 * Multiply 64 bit by 64 bit first loop.
 */
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
                                           Register y, Register y_idx, Register z,
                                           Register carry, Register product,
                                           Register idx, Register kdx) {
  //
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;
  //

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_one_x);

  lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
  ldr(x_xstart, Address(rscratch1));
  ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian

  bind(L_first_loop);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_first_loop_exit);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_one_y);
  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(y_idx, Address(rscratch1));
  ror(y_idx, y_idx, 32); // convert big-endian to little-endian
  bind(L_multiply);

  // AArch64 has a multiply-accumulate instruction that we can't use
  // here because it has no way to process carries, so we have to use
  // separate add and adc instructions.  Bah.
  umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
  mul(product, x_xstart, y_idx);
  adds(product, product, carry);
  adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product

  subw(kdx, kdx, 2);
  ror(product, product, 32); // back to big-endian
  str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));

  b(L_first_loop);

  bind(L_one_y);
  ldrw(y_idx, Address(y, 0));
  b(L_multiply);

  bind(L_one_x);
  ldrw(x_xstart, Address(x, 0));
  b(L_first_loop);

  bind(L_first_loop_exit);
}

/**
 * Multiply 128 bit by 128 bit. Unrolled inner loop.
 */
void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
                                             Register carry, Register carry2,
                                             Register idx, Register jdx,
                                             Register yz_idx1, Register yz_idx2,
                                             Register tmp, Register tmp3, Register tmp4,
                                             Register tmp6, Register product_hi) {

  //   jlong carry, x[], y[], z[];
  //   int kdx = ystart+1;
  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
  //     jlong carry2  = (jlong)(tmp3 >>> 64);
  //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
  //     carry  = (jlong)(tmp4 >>> 64);
  //     z[kdx+idx+1] = (jlong)tmp3;
  //     z[kdx+idx] = (jlong)tmp4;
  //   }
  //   idx += 2;
  //   if (idx > 0) {
  //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
  //     z[kdx+idx] = (jlong)yz_idx1;
  //     carry  = (jlong)(yz_idx1 >>> 64);
  //   }
  //

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;

  lsrw(jdx, idx, 2);

  bind(L_third_loop);

  subsw(jdx, jdx, 1);
  br(Assembler::MI, L_third_loop_exit);
  subw(idx, idx, 4);

  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));

  ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));

  lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
  ror(yz_idx2, yz_idx2, 32);

  ldp(rscratch2, rscratch1, Address(tmp6, 0));

  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
  umulh(tmp4, product_hi, yz_idx1);

  ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
  ror(rscratch2, rscratch2, 32);

  mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
  umulh(carry2, product_hi, yz_idx2);

  // propagate sum of both multiplications into carry:tmp4:tmp3
  adds(tmp3, tmp3, carry);
  adc(tmp4, tmp4, zr);
  adds(tmp3, tmp3, rscratch1);
  adcs(tmp4, tmp4, tmp);
  adc(carry, carry2, zr);
  adds(tmp4, tmp4, rscratch2);
  adc(carry, carry, zr);

  ror(tmp3, tmp3, 32); // convert little-endian to big-endian
  ror(tmp4, tmp4, 32);
  stp(tmp4, tmp3, Address(tmp6, 0));

  b(L_third_loop);
  bind(L_third_loop_exit);

  andw(idx, idx, 0x3);
  cbz(idx, L_post_third_loop_done);

  Label L_check_1;
  subsw(idx, idx, 2);
  br(Assembler::MI, L_check_1);

  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(yz_idx1, Address(rscratch1, 0));
  ror(yz_idx1, yz_idx1, 32);
  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
  umulh(tmp4, product_hi, yz_idx1);
  lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
  ldr(yz_idx2, Address(rscratch1, 0));
  ror(yz_idx2, yz_idx2, 32);

  add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);

  ror(tmp3, tmp3, 32);
  str(tmp3, Address(rscratch1, 0));

  bind(L_check_1);

  andw(idx, idx, 0x1);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_post_third_loop_done);
  ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
  umulh(carry2, tmp4, product_hi);
  ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  add2_with_carry(carry2, tmp3, tmp4, carry);

  strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  extr(carry, carry2, tmp3, 32);

  bind(L_post_third_loop_done);
}

/**
 * Code for BigInteger::multiplyToLen() intrinsic.
 *
 * r0: x
 * r1: xlen
 * r2: y
 * r3: ylen
 * r4: z
 * r5: zlen
 * r10: tmp1
 * r11: tmp2
 * r12: tmp3
 * r13: tmp4
 * r14: tmp5
 * r15: tmp6
 * r16: tmp7
 *
 */
void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6, Register product_hi) {

  assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product = xlen;
  const Register x_xstart = zlen;  // reuse register

  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;
  //

  movw(idx, ylen);      // idx = ylen;
  movw(kdx, zlen);      // kdx = xlen+ylen;
  mov(carry, zr);       // carry = 0;

  Label L_done;

  movw(xstart, xlen);
  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_done);

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);

  Label L_second_loop;
  cbzw(kdx, L_second_loop);

  Label L_carry;
  subw(kdx, kdx, 1);
  cbzw(kdx, L_carry);

  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
  lsr(carry, carry, 32);
  subw(kdx, kdx, 1);

  bind(L_carry);
  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));

  // Second and third (nested) loops.
  //
  // for (int i = xstart-1; i >= 0; i--) { // Second loop
  //   carry = 0;
  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                    (z[k] & LONG_MASK) + carry;
  //     z[k] = (int)product;
  //     carry = product >>> 32;
  //   }
  //   z[i] = (int)carry;
  // }
  //
  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi

  const Register jdx = tmp1;

  bind(L_second_loop);
  mov(carry, zr);                // carry = 0;
  movw(jdx, ylen);               // j = ystart+1

  subsw(xstart, xstart, 1);      // i = xstart-1;
  br(Assembler::MI, L_done);

  str(z, Address(pre(sp, -4 * wordSize)));

  Label L_last_x;
  lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
  subsw(xstart, xstart, 1);      // i = xstart-1;
  br(Assembler::MI, L_last_x);

  lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
  ldr(product_hi, Address(rscratch1));
  ror(product_hi, product_hi, 32);  // convert big-endian to little-endian

  Label L_third_loop_prologue;
  bind(L_third_loop_prologue);

  str(ylen, Address(sp, wordSize));
  stp(x, xstart, Address(sp, 2 * wordSize));
  multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
                          tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
  ldp(z, ylen, Address(post(sp, 2 * wordSize)));
  ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen

  addw(tmp3, xlen, 1);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  subsw(tmp3, tmp3, 1);
  br(Assembler::MI, L_done);

  lsr(carry, carry, 32);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  b(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  ldrw(product_hi, Address(x, 0));
  b(L_third_loop_prologue);

  bind(L_done);
}

/**
 * Emits code to update CRC-32 with a byte value according to constants in table
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]val       Register containing the byte to fold into the CRC.
 * @param [in]table     Register containing the table of crc constants.
 *
 * uint32_t crc;
 *   val = crc_table[(val ^ crc) & 0xFF];
 *   crc = val ^ (crc >> 8);
 *
 */
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  eor(val, val, crc);
  andr(val, val, 0xff);
  ldrw(val, Address(table, val, Address::lsl(2)));
  eor(crc, val, crc, Assembler::LSR, 8);
}
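
// Step-through (illustrative values): with crc = 0xFFFFFFFF and input
// byte val = 0x31, the sequence above computes
//
//   index = (0x31 ^ 0xFFFFFFFF) & 0xff = 0xce
//   crc   = crc_table[0xce] ^ (0xFFFFFFFF >> 8)
//
// i.e. the classic one-byte-at-a-time table-driven CRC-32 update.  The
// caller supplies and removes the standard bit inversions (see the
// ornw calls in kernel_crc32 below).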

/**
 * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]v         Register containing the 32-bit to fold into the CRC.
 * @param [in]table0    Register containing table 0 of crc constants.
 * @param [in]table1    Register containing table 1 of crc constants.
 * @param [in]table2    Register containing table 2 of crc constants.
 * @param [in]table3    Register containing table 3 of crc constants.
 *
 * uint32_t crc;
 *   v = crc ^ v
 *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
 *
 */
void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
        Register table0, Register table1, Register table2, Register table3,
        bool upper) {
  eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
  uxtb(tmp, v);
  ldrw(crc, Address(table3, tmp, Address::lsl(2)));
  ubfx(tmp, v, 8, 8);
  ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
  ubfx(tmp, v, 16, 8);
  ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
  ubfx(tmp, v, 24, 8);
  ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
}

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register that will contain address of CRC table
 * @param tmp   scratch register
 */
void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
        Register table0, Register table1, Register table2, Register table3,
        Register tmp, Register tmp2, Register tmp3) {
  Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
  unsigned long offset;

  ornw(crc, zr, crc);

  if (UseCRC32) {
    Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop;

    subs(len, len, 64);
    br(Assembler::GE, CRC_by64_loop);
    adds(len, len, 64-4);
    br(Assembler::GE, CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, CRC_by1_loop);
    b(L_exit);

    BIND(CRC_by4_loop);
    ldrw(tmp, Address(post(buf, 4)));
    subs(len, len, 4);
    crc32w(crc, crc, tmp);
    br(Assembler::GE, CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::LE, L_exit);
    BIND(CRC_by1_loop);
    ldrb(tmp, Address(post(buf, 1)));
    subs(len, len, 1);
    crc32b(crc, crc, tmp);
    br(Assembler::GT, CRC_by1_loop);
    b(L_exit);

    align(CodeEntryAlignment);
    BIND(CRC_by64_loop);
    subs(len, len, 64);
    ldp(tmp, tmp3, Address(post(buf, 16)));
    crc32x(crc, crc, tmp);
    crc32x(crc, crc, tmp3);
    ldp(tmp, tmp3, Address(post(buf, 16)));
    crc32x(crc, crc, tmp);
    crc32x(crc, crc, tmp3);
    ldp(tmp, tmp3, Address(post(buf, 16)));
    crc32x(crc, crc, tmp);
    crc32x(crc, crc, tmp3);
    ldp(tmp, tmp3, Address(post(buf, 16)));
    crc32x(crc, crc, tmp);
    crc32x(crc, crc, tmp3);
    br(Assembler::GE, CRC_by64_loop);
    adds(len, len, 64-4);
    br(Assembler::GE, CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, CRC_by1_loop);
    BIND(L_exit);
    ornw(crc, zr, crc);
    return;
  }

  adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
  if (offset) add(table0, table0, offset);
  add(table1, table0, 1*256*sizeof(juint));
  add(table2, table0, 2*256*sizeof(juint));
  add(table3, table0, 3*256*sizeof(juint));

  if (UseNeon) {
    cmp(len, 64);
    br(Assembler::LT, L_by16);
    eor(v16, T16B, v16, v16);

    Label L_fold;

    add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants

    ld1(v0, v1, T2D, post(buf, 32));
    ld1r(v4, T2D, post(tmp, 8));
    ld1r(v5, T2D, post(tmp, 8));
    ld1r(v6, T2D, post(tmp, 8));
    ld1r(v7, T2D, post(tmp, 8));
    mov(v16, T4S, 0, crc);

    eor(v0, T16B, v0, v16);
    sub(len, len, 64);

    BIND(L_fold);
    pmull(v22, T8H, v0, v5, T8B);
    pmull(v20, T8H, v0, v7, T8B);
    pmull(v23, T8H, v0, v4, T8B);
    pmull(v21, T8H, v0, v6, T8B);

    pmull2(v18, T8H, v0, v5, T16B);
    pmull2(v16, T8H, v0, v7, T16B);
    pmull2(v19, T8H, v0, v4, T16B);
    pmull2(v17, T8H, v0, v6, T16B);

    uzp1(v24, v20, v22, T8H);
    uzp2(v25, v20, v22, T8H);
    eor(v20, T16B, v24, v25);

    uzp1(v26, v16, v18, T8H);
    uzp2(v27, v16, v18, T8H);
    eor(v16, T16B, v26, v27);

    ushll2(v22, T4S, v20, T8H, 8);
    ushll(v20, T4S, v20, T4H, 8);

    ushll2(v18, T4S, v16, T8H, 8);
    ushll(v16, T4S, v16, T4H, 8);

    eor(v22, T16B, v23, v22);
    eor(v18, T16B, v19, v18);
    eor(v20, T16B, v21, v20);
    eor(v16, T16B, v17, v16);

    uzp1(v17, v16, v20, T2D);
    uzp2(v21, v16, v20, T2D);
    eor(v17, T16B, v17, v21);

    ushll2(v20, T2D, v17, T4S, 16);
    ushll(v16, T2D, v17, T2S, 16);

    eor(v20, T16B, v20, v22);
    eor(v16, T16B, v16, v18);

    uzp1(v17, v20, v16, T2D);
    uzp2(v21, v20, v16, T2D);
    eor(v28, T16B, v17, v21);

    pmull(v22, T8H, v1, v5, T8B);
    pmull(v20, T8H, v1, v7, T8B);
    pmull(v23, T8H, v1, v4, T8B);
    pmull(v21, T8H, v1, v6, T8B);

    pmull2(v18, T8H, v1, v5, T16B);
    pmull2(v16, T8H, v1, v7, T16B);
    pmull2(v19, T8H, v1, v4, T16B);
    pmull2(v17, T8H, v1, v6, T16B);

    ld1(v0, v1, T2D, post(buf, 32));

    uzp1(v24, v20, v22, T8H);
    uzp2(v25, v20, v22, T8H);
    eor(v20, T16B, v24, v25);

    uzp1(v26, v16, v18, T8H);
    uzp2(v27, v16, v18, T8H);
    eor(v16, T16B, v26, v27);

    ushll2(v22, T4S, v20, T8H, 8);
    ushll(v20, T4S, v20, T4H, 8);

    ushll2(v18, T4S, v16, T8H, 8);
    ushll(v16, T4S, v16, T4H, 8);

    eor(v22, T16B, v23, v22);
    eor(v18, T16B, v19, v18);
    eor(v20, T16B, v21, v20);
    eor(v16, T16B, v17, v16);

    uzp1(v17, v16, v20, T2D);
    uzp2(v21, v16, v20, T2D);
    eor(v16, T16B, v17, v21);

    ushll2(v20, T2D, v16, T4S, 16);
    ushll(v16, T2D, v16, T2S, 16);

    eor(v20, T16B, v22, v20);
    eor(v16, T16B, v16, v18);

    uzp1(v17, v20, v16, T2D);
    uzp2(v21, v20, v16, T2D);
    eor(v20, T16B, v17, v21);

    shl(v16, T2D, v28, 1);
    shl(v17, T2D, v20, 1);

    eor(v0, T16B, v0, v16);
    eor(v1, T16B, v1, v17);

    subs(len, len, 32);
    br(Assembler::GE, L_fold);

    mov(crc, 0);
    mov(tmp, v0, T1D, 0);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
    mov(tmp, v0, T1D, 1);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
    mov(tmp, v1, T1D, 0);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
    mov(tmp, v1, T1D, 1);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);

    add(len, len, 32);
  }

  BIND(L_by16);
  subs(len, len, 16);
  br(Assembler::GE, L_by16_loop);
  adds(len, len, 16-4);
  br(Assembler::GE, L_by4_loop);
  adds(len, len, 4);
  br(Assembler::GT, L_by1_loop);
  b(L_exit);

  BIND(L_by4_loop);
  ldrw(tmp, Address(post(buf, 4)));
  update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
  subs(len, len, 4);
  br(Assembler::GE, L_by4_loop);
  adds(len, len, 4);
  br(Assembler::LE, L_exit);
  BIND(L_by1_loop);
  subs(len, len, 1);
  ldrb(tmp, Address(post(buf, 1)));
  update_byte_crc32(crc, tmp, table0);
  br(Assembler::GT, L_by1_loop);
  b(L_exit);

  align(CodeEntryAlignment);
  BIND(L_by16_loop);
  subs(len, len, 16);
  ldp(tmp, tmp3, Address(post(buf, 16)));
  update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
  update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
  update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
  update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
  br(Assembler::GE, L_by16_loop);
  adds(len, len, 16-4);
  br(Assembler::GE, L_by4_loop);
  adds(len, len, 4);
  br(Assembler::GT, L_by1_loop);
  BIND(L_exit);
  ornw(crc, zr, crc);
}
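
// Length-peeling example (assumed len = 71, hardware-CRC path): the
// subs/adds ladder above sends 64 bytes through CRC_by64_loop, then 4
// bytes through CRC_by4_loop, and the final 3 bytes through
// CRC_by1_loop.  Each adds() restores the over-subtraction from the
// previous test, so len always holds "bytes outstanding minus the
// chunk size currently being tried" and no separate modulo computation
// is ever needed.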

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register that will contain address of CRC table
 * @param tmp   scratch register
 */
void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
        Register table0, Register table1, Register table2, Register table3,
        Register tmp, Register tmp2, Register tmp3) {
  Label L_exit;
  Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop;

  subs(len, len, 64);
  br(Assembler::GE, CRC_by64_loop);
  adds(len, len, 64-4);
  br(Assembler::GE, CRC_by4_loop);
  adds(len, len, 4);
  br(Assembler::GT, CRC_by1_loop);
  b(L_exit);

  BIND(CRC_by4_loop);
  ldrw(tmp, Address(post(buf, 4)));
  subs(len, len, 4);
  crc32cw(crc, crc, tmp);
  br(Assembler::GE, CRC_by4_loop);
  adds(len, len, 4);
  br(Assembler::LE, L_exit);
  BIND(CRC_by1_loop);
  ldrb(tmp, Address(post(buf, 1)));
  subs(len, len, 1);
  crc32cb(crc, crc, tmp);
  br(Assembler::GT, CRC_by1_loop);
  b(L_exit);

  align(CodeEntryAlignment);
  BIND(CRC_by64_loop);
  subs(len, len, 64);
  ldp(tmp, tmp3, Address(post(buf, 16)));
  crc32cx(crc, crc, tmp);
  crc32cx(crc, crc, tmp3);
  ldp(tmp, tmp3, Address(post(buf, 16)));
  crc32cx(crc, crc, tmp);
  crc32cx(crc, crc, tmp3);
  ldp(tmp, tmp3, Address(post(buf, 16)));
  crc32cx(crc, crc, tmp);
  crc32cx(crc, crc, tmp3);
  ldp(tmp, tmp3, Address(post(buf, 16)));
  crc32cx(crc, crc, tmp);
  crc32cx(crc, crc, tmp3);
  br(Assembler::GE, CRC_by64_loop);
  adds(len, len, 64-4);
  br(Assembler::GE, CRC_by4_loop);
  adds(len, len, 4);
  br(Assembler::GT, CRC_by1_loop);
  BIND(L_exit);
  return;
}

SkipIfEqual::SkipIfEqual(
    MacroAssembler* masm, const bool* flag_addr, bool value) {
  _masm = masm;
  unsigned long offset;
  _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
  _masm->ldrb(rscratch1, Address(rscratch1, offset));
  _masm->cbzw(rscratch1, _label);
}

SkipIfEqual::~SkipIfEqual() {
  _masm->bind(_label);
}

void MacroAssembler::addptr(const Address &dst, int32_t src) {
  Address adr;
  switch(dst.getMode()) {
  case Address::base_plus_offset:
    // This is the expected mode, although we allow all the other
    // forms below.
    adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
    break;
  default:
    lea(rscratch2, dst);
    adr = Address(rscratch2);
    break;
  }
  ldr(rscratch1, adr);
  add(rscratch1, rscratch1, src);
  str(rscratch1, adr);
}

void MacroAssembler::cmpptr(Register src1, Address src2) {
  unsigned long offset;
  adrp(rscratch1, src2, offset);
  ldr(rscratch1, Address(rscratch1, offset));
  cmp(src1, rscratch1);
}

void MacroAssembler::store_check(Register obj, Address dst) {
  store_check(obj);
}

void MacroAssembler::store_check(Register obj) {
  // Does a store check for the oop in register obj.  The content of
  // register obj is destroyed afterwards.

  BarrierSet* bs = Universe::heap()->barrier_set();
  assert(bs->kind() == BarrierSet::CardTableForRS ||
         bs->kind() == BarrierSet::CardTableExtension,
         "Wrong barrier set kind");

  CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

  lsr(obj, obj, CardTableModRefBS::card_shift);

  assert(CardTableModRefBS::dirty_card_val() == 0, "must be");

  load_byte_map_base(rscratch1);

  if (UseCondCardMark) {
    Label L_already_dirty;
    membar(StoreLoad);
    ldrb(rscratch2, Address(obj, rscratch1));
    cbz(rscratch2, L_already_dirty);
    strb(zr, Address(obj, rscratch1));
    bind(L_already_dirty);
  } else {
    if (UseConcMarkSweepGC && CMSPrecleaningEnabled) {
      membar(StoreStore);
    }
    strb(zr, Address(obj, rscratch1));
  }
}

void MacroAssembler::load_klass(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
    decode_klass_not_null(dst);
  } else {
    ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
  }
}

void MacroAssembler::load_mirror(Register dst, Register method) {
  const int mirror_offset = in_bytes(Klass::java_mirror_offset());
  ldr(dst, Address(rmethod, Method::const_offset()));
  ldr(dst, Address(dst, ConstMethod::constants_offset()));
  ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
  ldr(dst, Address(dst, mirror_offset));
}

void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
  if (UseCompressedClassPointers) {
    ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
    if (Universe::narrow_klass_base() == NULL) {
      cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift());
      return;
    } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
               && Universe::narrow_klass_shift() == 0) {
      // Only the bottom 32 bits matter
      cmpw(trial_klass, tmp);
      return;
    }
    decode_klass_not_null(tmp);
  } else {
    ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
  }
  cmp(trial_klass, tmp);
}

void MacroAssembler::load_prototype_header(Register dst, Register src) {
  load_klass(dst, src);
  ldr(dst, Address(dst, Klass::prototype_header_offset()));
}

void MacroAssembler::store_klass(Register dst, Register src) {
  // FIXME: Should this be a store release?  concurrent GCs assume
  // klass length is valid if klass field is not null.
  if (UseCompressedClassPointers) {
    encode_klass_not_null(src);
    strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
  } else {
    str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
  }
}

void MacroAssembler::store_klass_gap(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    // Store to klass gap in destination
    strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
  }
}

// Algorithm must match oop.inline.hpp encode_heap_oop.
void MacroAssembler::encode_heap_oop(Register d, Register s) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
#endif
  verify_oop(s, "broken oop in encode_heap_oop");
  if (Universe::narrow_oop_base() == NULL) {
    if (Universe::narrow_oop_shift() != 0) {
      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
      lsr(d, s, LogMinObjAlignmentInBytes);
    } else {
      mov(d, s);
    }
  } else {
    subs(d, s, rheapbase);
    csel(d, d, zr, Assembler::HS);
    lsr(d, d, LogMinObjAlignmentInBytes);

    /*  Old algorithm: is this any worse?
    Label nonnull;
    cbnz(r, nonnull);
    sub(r, r, rheapbase);
    bind(nonnull);
    lsr(r, r, LogMinObjAlignmentInBytes);
    */
  }
}

void MacroAssembler::encode_heap_oop_not_null(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    cbnz(r, ok);
    stop("null oop passed to encode_heap_oop_not_null");
    bind(ok);
  }
#endif
  verify_oop(r, "broken oop in encode_heap_oop_not_null");
  if (Universe::narrow_oop_base() != NULL) {
    sub(r, r, rheapbase);
  }
  if (Universe::narrow_oop_shift() != 0) {
    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    lsr(r, r, LogMinObjAlignmentInBytes);
  }
}

void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    cbnz(src, ok);
    stop("null oop passed to encode_heap_oop_not_null2");
    bind(ok);
  }
#endif
  verify_oop(src, "broken oop in encode_heap_oop_not_null2");

  Register data = src;
  if (Universe::narrow_oop_base() != NULL) {
    sub(dst, src, rheapbase);
    data = dst;
  }
  if (Universe::narrow_oop_shift() != 0) {
    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    lsr(dst, data, LogMinObjAlignmentInBytes);
    data = dst;
  }
  if (data == src)
    mov(dst, src);
}

void MacroAssembler::decode_heap_oop(Register d, Register s) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
#endif
  if (Universe::narrow_oop_base() == NULL) {
    if (Universe::narrow_oop_shift() != 0 || d != s) {
      lsl(d, s, Universe::narrow_oop_shift());
    }
  } else {
    Label done;
    if (d != s)
      mov(d, s);
    cbz(s, done);
    add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
    bind(done);
  }
  verify_oop(d, "broken oop in decode_heap_oop");
}
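
// Encoding example (hypothetical heap layout): with a non-NULL narrow
// oop base and shift LogMinObjAlignmentInBytes == 3, an oop at
// rheapbase + 0x12340 encodes as 0x12340 >> 3 == 0x2468.  The
// subs/csel pair in encode_heap_oop maps NULL (and anything below the
// base) to 0 without a branch, and decode_heap_oop rebuilds the
// address as rheapbase + (narrow << 3), again special-casing 0.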

void MacroAssembler::decode_heap_oop_not_null(Register r) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (Universe::narrow_oop_shift() != 0) {
    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    if (Universe::narrow_oop_base() != NULL) {
      add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
    } else {
      add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
    }
  } else {
    assert (Universe::narrow_oop_base() == NULL, "sanity");
  }
}

void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (Universe::narrow_oop_shift() != 0) {
    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    if (Universe::narrow_oop_base() != NULL) {
      add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
    } else {
      add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
    }
  } else {
    assert (Universe::narrow_oop_base() == NULL, "sanity");
    if (dst != src) {
      mov(dst, src);
    }
  }
}
compressed headers"); 3376 assert (Universe::heap() != NULL, "java heap should be initialized"); 3377 // Cannot assert, unverified entry point counts instructions (see .ad file) 3378 // vtableStubs also counts instructions in pd_code_size_limit. 3379 // Also do not verify_oop as this is called by verify_oop. 3380 if (Universe::narrow_oop_shift() != 0) { 3381 assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3382 if (Universe::narrow_oop_base() != NULL) { 3383 add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes); 3384 } else { 3385 add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes); 3386 } 3387 } else { 3388 assert (Universe::narrow_oop_base() == NULL, "sanity"); 3389 } 3390 } 3391 3392 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) { 3393 assert (UseCompressedOops, "should only be used for compressed headers"); 3394 assert (Universe::heap() != NULL, "java heap should be initialized"); 3395 // Cannot assert, unverified entry point counts instructions (see .ad file) 3396 // vtableStubs also counts instructions in pd_code_size_limit. 3397 // Also do not verify_oop as this is called by verify_oop. 3398 if (Universe::narrow_oop_shift() != 0) { 3399 assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong"); 3400 if (Universe::narrow_oop_base() != NULL) { 3401 add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes); 3402 } else { 3403 add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes); 3404 } 3405 } else { 3406 assert (Universe::narrow_oop_base() == NULL, "sanity"); 3407 if (dst != src) { 3408 mov(dst, src); 3409 } 3410 } 3411 } 3412 3413 void MacroAssembler::encode_klass_not_null(Register dst, Register src) { 3414 if (Universe::narrow_klass_base() == NULL) { 3415 if (Universe::narrow_klass_shift() != 0) { 3416 assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); 3417 lsr(dst, src, LogKlassAlignmentInBytes); 3418 } else { 3419 if (dst != src) mov(dst, src); 3420 } 3421 return; 3422 } 3423 3424 if (use_XOR_for_compressed_class_base) { 3425 if (Universe::narrow_klass_shift() != 0) { 3426 eor(dst, src, (uint64_t)Universe::narrow_klass_base()); 3427 lsr(dst, dst, LogKlassAlignmentInBytes); 3428 } else { 3429 eor(dst, src, (uint64_t)Universe::narrow_klass_base()); 3430 } 3431 return; 3432 } 3433 3434 if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0 3435 && Universe::narrow_klass_shift() == 0) { 3436 movw(dst, src); 3437 return; 3438 } 3439 3440 #ifdef ASSERT 3441 verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?"); 3442 #endif 3443 3444 Register rbase = dst; 3445 if (dst == src) rbase = rheapbase; 3446 mov(rbase, (uint64_t)Universe::narrow_klass_base()); 3447 sub(dst, src, rbase); 3448 if (Universe::narrow_klass_shift() != 0) { 3449 assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); 3450 lsr(dst, dst, LogKlassAlignmentInBytes); 3451 } 3452 if (dst == src) reinit_heapbase(); 3453 } 3454 3455 void MacroAssembler::encode_klass_not_null(Register r) { 3456 encode_klass_not_null(r, r); 3457 } 3458 3459 void MacroAssembler::decode_klass_not_null(Register dst, Register src) { 3460 Register rbase = dst; 3461 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 3462 3463 if (Universe::narrow_klass_base() == NULL) { 3464 if (Universe::narrow_klass_shift() != 0) { 3465 assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), 
"decode alg wrong"); 3466 lsl(dst, src, LogKlassAlignmentInBytes); 3467 } else { 3468 if (dst != src) mov(dst, src); 3469 } 3470 return; 3471 } 3472 3473 if (use_XOR_for_compressed_class_base) { 3474 if (Universe::narrow_klass_shift() != 0) { 3475 lsl(dst, src, LogKlassAlignmentInBytes); 3476 eor(dst, dst, (uint64_t)Universe::narrow_klass_base()); 3477 } else { 3478 eor(dst, src, (uint64_t)Universe::narrow_klass_base()); 3479 } 3480 return; 3481 } 3482 3483 if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0 3484 && Universe::narrow_klass_shift() == 0) { 3485 if (dst != src) 3486 movw(dst, src); 3487 movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32); 3488 return; 3489 } 3490 3491 // Cannot assert, unverified entry point counts instructions (see .ad file) 3492 // vtableStubs also counts instructions in pd_code_size_limit. 3493 // Also do not verify_oop as this is called by verify_oop. 3494 if (dst == src) rbase = rheapbase; 3495 mov(rbase, (uint64_t)Universe::narrow_klass_base()); 3496 if (Universe::narrow_klass_shift() != 0) { 3497 assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong"); 3498 add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes); 3499 } else { 3500 add(dst, rbase, src); 3501 } 3502 if (dst == src) reinit_heapbase(); 3503 } 3504 3505 void MacroAssembler::decode_klass_not_null(Register r) { 3506 decode_klass_not_null(r, r); 3507 } 3508 3509 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) { 3510 assert (UseCompressedOops, "should only be used for compressed oops"); 3511 assert (Universe::heap() != NULL, "java heap should be initialized"); 3512 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 3513 3514 int oop_index = oop_recorder()->find_index(obj); 3515 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop"); 3516 3517 InstructionMark im(this); 3518 RelocationHolder rspec = oop_Relocation::spec(oop_index); 3519 code_section()->relocate(inst_mark(), rspec); 3520 movz(dst, 0xDEAD, 16); 3521 movk(dst, 0xBEEF); 3522 } 3523 3524 void MacroAssembler::set_narrow_klass(Register dst, Klass* k) { 3525 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 3526 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 3527 int index = oop_recorder()->find_index(k); 3528 assert(! Universe::heap()->is_in_reserved(k), "should not be an oop"); 3529 3530 InstructionMark im(this); 3531 RelocationHolder rspec = metadata_Relocation::spec(index); 3532 code_section()->relocate(inst_mark(), rspec); 3533 narrowKlass nk = Klass::encode_klass(k); 3534 movz(dst, (nk >> 16), 16); 3535 movk(dst, nk & 0xffff); 3536 } 3537 3538 void MacroAssembler::load_heap_oop(Register dst, Address src) 3539 { 3540 if (UseCompressedOops) { 3541 ldrw(dst, src); 3542 decode_heap_oop(dst); 3543 } else { 3544 ldr(dst, src); 3545 } 3546 } 3547 3548 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src) 3549 { 3550 if (UseCompressedOops) { 3551 ldrw(dst, src); 3552 decode_heap_oop_not_null(dst); 3553 } else { 3554 ldr(dst, src); 3555 } 3556 } 3557 3558 void MacroAssembler::store_heap_oop(Address dst, Register src) { 3559 if (UseCompressedOops) { 3560 assert(!dst.uses(src), "not enough registers"); 3561 encode_heap_oop(src); 3562 strw(src, dst); 3563 } else 3564 str(src, dst); 3565 } 3566 3567 // Used for storing NULLs. 
3568 void MacroAssembler::store_heap_oop_null(Address dst) {
3569 if (UseCompressedOops) {
3570 strw(zr, dst);
3571 } else
3572 str(zr, dst);
3573 }
3574
3575 #if INCLUDE_ALL_GCS
3576 void MacroAssembler::g1_write_barrier_pre(Register obj,
3577 Register pre_val,
3578 Register thread,
3579 Register tmp,
3580 bool tosca_live,
3581 bool expand_call) {
3582 // If expand_call is true then we expand the call_VM_leaf macro
3583 // directly to skip generating the check by
3584 // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp.
3585
3586 assert(thread == rthread, "must be");
3587
3588 Label done;
3589 Label runtime;
3590
3591 assert(pre_val != noreg, "check this code");
3592
3593 if (obj != noreg)
3594 assert_different_registers(obj, pre_val, tmp);
3595
3596 Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
3597 SATBMarkQueue::byte_offset_of_active()));
3598 Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
3599 SATBMarkQueue::byte_offset_of_index()));
3600 Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
3601 SATBMarkQueue::byte_offset_of_buf()));
3602
3603
3604 // Is marking active?
3605 if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) {
3606 ldrw(tmp, in_progress);
3607 } else {
3608 assert(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption");
3609 ldrb(tmp, in_progress);
3610 }
3611 cbzw(tmp, done);
3612
3613 // Do we need to load the previous value?
3614 if (obj != noreg) {
3615 load_heap_oop(pre_val, Address(obj, 0));
3616 }
3617
3618 // Is the previous value null?
3619 cbz(pre_val, done);
3620
3621 // Can we store original value in the thread's buffer?
3622 // Is index == 0?
3623 // (The index field is typed as size_t.)
3624
3625 ldr(tmp, index); // tmp := *index_adr
3626 cbz(tmp, runtime); // tmp == 0?
3627 // If yes, goto runtime
3628
3629 sub(tmp, tmp, wordSize); // tmp := tmp - wordSize
3630 str(tmp, index); // *index_adr := tmp
3631 ldr(rscratch1, buffer);
3632 add(tmp, tmp, rscratch1); // tmp := tmp + *buffer_adr
3633
3634 // Record the previous value
3635 str(pre_val, Address(tmp, 0));
3636 b(done);
3637
3638 bind(runtime);
3639 // save the live input values
3640 push(r0->bit(tosca_live) | obj->bit(obj != noreg) | pre_val->bit(true), sp);
3641
3642 // Calling the runtime using the regular call_VM_leaf mechanism generates
3643 // code (generated by InterpreterMacroAssembler::call_VM_leaf_base)
3644 // that checks that the *(rfp+frame::interpreter_frame_last_sp) == NULL.
3645 //
3646 // If we are generating the pre-barrier without a frame (e.g. in the
3647 // intrinsified Reference.get() routine) then rfp might be pointing to
3648 // the caller frame and so this check will most likely fail at runtime.
3649 //
3650 // Expanding the call directly bypasses the generation of the check.
3651 // So when we do not have a full interpreter frame on the stack
3652 // expand_call should be passed true.
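//
// (Both paths below end up in SharedRuntime::g1_wb_pre, which enqueues
// pre_val on the thread's SATB mark queue.)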
3653
3654 if (expand_call) {
3655 assert(pre_val != c_rarg1, "smashed arg");
3656 pass_arg1(this, thread);
3657 pass_arg0(this, pre_val);
3658 MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), 2);
3659 } else {
3660 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread);
3661 }
3662
3663 pop(r0->bit(tosca_live) | obj->bit(obj != noreg) | pre_val->bit(true), sp);
3664
3665 bind(done);
3666 }
3667
3668 void MacroAssembler::g1_write_barrier_post(Register store_addr,
3669 Register new_val,
3670 Register thread,
3671 Register tmp,
3672 Register tmp2) {
3673 assert(thread == rthread, "must be");
3674
3675 Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
3676 DirtyCardQueue::byte_offset_of_index()));
3677 Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
3678 DirtyCardQueue::byte_offset_of_buf()));
3679
3680 BarrierSet* bs = Universe::heap()->barrier_set();
3681 CardTableModRefBS* ct = (CardTableModRefBS*)bs;
3682 assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
3683
3684 Label done;
3685 Label runtime;
3686
3687 // Does store cross heap regions?
3688
3689 eor(tmp, store_addr, new_val);
3690 lsr(tmp, tmp, HeapRegion::LogOfHRGrainBytes);
3691 cbz(tmp, done);
3692
3693 // crosses regions, storing NULL?
3694
3695 cbz(new_val, done);
3696
3697 // storing region crossing non-NULL, is card already dirty?
3698
3699 ExternalAddress cardtable((address) ct->byte_map_base);
3700 assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
3701 const Register card_addr = tmp;
3702
3703 lsr(card_addr, store_addr, CardTableModRefBS::card_shift);
3704
3705 // get the address of the card
3706 load_byte_map_base(tmp2);
3707 add(card_addr, card_addr, tmp2);
3708 ldrb(tmp2, Address(card_addr));
3709 cmpw(tmp2, (int)G1SATBCardTableModRefBS::g1_young_card_val());
3710 br(Assembler::EQ, done);
3711
3712 assert((int)CardTableModRefBS::dirty_card_val() == 0, "must be 0");
3713
3714 membar(Assembler::StoreLoad);
3715
3716 ldrb(tmp2, Address(card_addr));
3717 cbzw(tmp2, done);
3718
3719 // storing a region crossing, non-NULL oop, card is clean.
3720 // dirty card and log.
3721
3722 strb(zr, Address(card_addr));
3723
3724 ldr(rscratch1, queue_index);
3725 cbz(rscratch1, runtime);
3726 sub(rscratch1, rscratch1, wordSize);
3727 str(rscratch1, queue_index);
3728
3729 ldr(tmp2, buffer);
3730 str(card_addr, Address(tmp2, rscratch1));
3731 b(done);
3732
3733 bind(runtime);
3734 // save the live input values
3735 push(store_addr->bit(true) | new_val->bit(true), sp);
3736 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
3737 pop(store_addr->bit(true) | new_val->bit(true), sp);
3738
3739 bind(done);
3740 }
3741
3742 #endif // INCLUDE_ALL_GCS
3743
3744 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
3745 assert(oop_recorder() != NULL, "this assembler needs a Recorder");
3746 int index = oop_recorder()->allocate_metadata_index(obj);
3747 RelocationHolder rspec = metadata_Relocation::spec(index);
3748 return Address((address)obj, rspec);
3749 }
3750
3751 // Move an oop into a register. immediate is true if we want
3752 // immediate instructions, i.e. we are not going to patch this
3753 // instruction while the code is being executed by another thread. In
3754 // that case we can use move immediates rather than the constant pool.
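// When the oop is patchable we load it from the constant pool with
// ldr_constant instead, so a patching thread only needs to rewrite a
// single data word rather than a multi-instruction move sequence.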
3755 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) { 3756 int oop_index; 3757 if (obj == NULL) { 3758 oop_index = oop_recorder()->allocate_oop_index(obj); 3759 } else { 3760 oop_index = oop_recorder()->find_index(obj); 3761 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop"); 3762 } 3763 RelocationHolder rspec = oop_Relocation::spec(oop_index); 3764 if (! immediate) { 3765 address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address 3766 ldr_constant(dst, Address(dummy, rspec)); 3767 } else 3768 mov(dst, Address((address)obj, rspec)); 3769 } 3770 3771 // Move a metadata address into a register. 3772 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) { 3773 int oop_index; 3774 if (obj == NULL) { 3775 oop_index = oop_recorder()->allocate_metadata_index(obj); 3776 } else { 3777 oop_index = oop_recorder()->find_index(obj); 3778 } 3779 RelocationHolder rspec = metadata_Relocation::spec(oop_index); 3780 mov(dst, Address((address)obj, rspec)); 3781 } 3782 3783 Address MacroAssembler::constant_oop_address(jobject obj) { 3784 assert(oop_recorder() != NULL, "this assembler needs an OopRecorder"); 3785 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop"); 3786 int oop_index = oop_recorder()->find_index(obj); 3787 return Address((address)obj, oop_Relocation::spec(oop_index)); 3788 } 3789 3790 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes. 3791 void MacroAssembler::tlab_allocate(Register obj, 3792 Register var_size_in_bytes, 3793 int con_size_in_bytes, 3794 Register t1, 3795 Register t2, 3796 Label& slow_case) { 3797 assert_different_registers(obj, t2); 3798 assert_different_registers(obj, var_size_in_bytes); 3799 Register end = t2; 3800 3801 // verify_tlab(); 3802 3803 ldr(obj, Address(rthread, JavaThread::tlab_top_offset())); 3804 if (var_size_in_bytes == noreg) { 3805 lea(end, Address(obj, con_size_in_bytes)); 3806 } else { 3807 lea(end, Address(obj, var_size_in_bytes)); 3808 } 3809 ldr(rscratch1, Address(rthread, JavaThread::tlab_end_offset())); 3810 cmp(end, rscratch1); 3811 br(Assembler::HI, slow_case); 3812 3813 // update the tlab top pointer 3814 str(end, Address(rthread, JavaThread::tlab_top_offset())); 3815 3816 // recover var_size_in_bytes if necessary 3817 if (var_size_in_bytes == end) { 3818 sub(var_size_in_bytes, var_size_in_bytes, obj); 3819 } 3820 // verify_tlab(); 3821 } 3822 3823 // Preserves r19, and r3. 3824 Register MacroAssembler::tlab_refill(Label& retry, 3825 Label& try_eden, 3826 Label& slow_case) { 3827 Register top = r0; 3828 Register t1 = r2; 3829 Register t2 = r4; 3830 assert_different_registers(top, rthread, t1, t2, /* preserve: */ r19, r3); 3831 Label do_refill, discard_tlab; 3832 3833 if (!Universe::heap()->supports_inline_contig_alloc()) { 3834 // No allocation in the shared eden. 3835 b(slow_case); 3836 } 3837 3838 ldr(top, Address(rthread, in_bytes(JavaThread::tlab_top_offset()))); 3839 ldr(t1, Address(rthread, in_bytes(JavaThread::tlab_end_offset()))); 3840 3841 // calculate amount of free space 3842 sub(t1, t1, top); 3843 lsr(t1, t1, LogHeapWordSize); 3844 3845 // Retain tlab and allocate object in shared space if 3846 // the amount free in the tlab is too large to discard. 
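// ("Too large" means free words > tlab_refill_waste_limit; the limit is
// raised by refill_waste_limit_increment each time we retain, so a thread
// that keeps hitting the slow path eventually discards and refills.)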
3847
3848 ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
3849 cmp(t1, rscratch1);
3850 br(Assembler::LE, discard_tlab);
3851
3852 // Retain
3853 // ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
3854 mov(t2, (int32_t) ThreadLocalAllocBuffer::refill_waste_limit_increment());
3855 add(rscratch1, rscratch1, t2);
3856 str(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
3857
3858 if (TLABStats) {
3859 // increment number of slow_allocations
3860 addmw(Address(rthread, in_bytes(JavaThread::tlab_slow_allocations_offset())),
3861 1, rscratch1);
3862 }
3863 b(try_eden);
3864
3865 bind(discard_tlab);
3866 if (TLABStats) {
3867 // increment number of refills
3868 addmw(Address(rthread, in_bytes(JavaThread::tlab_number_of_refills_offset())), 1,
3869 rscratch1);
3870 // accumulate wastage -- t1 is amount free in tlab
3871 addmw(Address(rthread, in_bytes(JavaThread::tlab_fast_refill_waste_offset())), t1,
3872 rscratch1);
3873 }
3874
3875 // if tlab is currently allocated (top or end != null) then
3876 // fill [top, end + alignment_reserve) with array object
3877 cbz(top, do_refill);
3878
3879 // set up the mark word
3880 mov(rscratch1, (intptr_t)markOopDesc::prototype()->copy_set_hash(0x2));
3881 str(rscratch1, Address(top, oopDesc::mark_offset_in_bytes()));
3882 // set the length to the remaining space
3883 sub(t1, t1, typeArrayOopDesc::header_size(T_INT));
3884 add(t1, t1, (int32_t)ThreadLocalAllocBuffer::alignment_reserve());
3885 lsl(t1, t1, log2_intptr(HeapWordSize/sizeof(jint)));
3886 strw(t1, Address(top, arrayOopDesc::length_offset_in_bytes()));
3887 // set klass to intArrayKlass
3888 {
3889 unsigned long offset;
3890 // Dubious reloc: why not an oop reloc?
3891 adrp(rscratch1, ExternalAddress((address)Universe::intArrayKlassObj_addr()),
3892 offset);
3893 ldr(t1, Address(rscratch1, offset));
3894 }
3895 // store klass last. Concurrent GCs assume the klass length is valid if
3896 // the klass field is not null.
3897 store_klass(top, t1);
3898
3899 mov(t1, top);
3900 ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
3901 sub(t1, t1, rscratch1);
3902 incr_allocated_bytes(rthread, t1, 0, rscratch1);
3903
3904 // refill the tlab with an eden allocation
3905 bind(do_refill);
3906 ldr(t1, Address(rthread, in_bytes(JavaThread::tlab_size_offset())));
3907 lsl(t1, t1, LogHeapWordSize);
3908 // allocate new tlab, address returned in top
3909 eden_allocate(top, t1, 0, t2, slow_case);
3910
3911 // Check that t1 was preserved in eden_allocate.
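// t1 must still hold the requested tlab size in bytes; the ASSERT block
// below recomputes that value from tlab_size_offset and compares.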
3912 #ifdef ASSERT
3913 if (UseTLAB) {
3914 Label ok;
3915 Register tsize = r4;
3916 assert_different_registers(tsize, rthread, t1);
3917 str(tsize, Address(pre(sp, -16)));
3918 ldr(tsize, Address(rthread, in_bytes(JavaThread::tlab_size_offset())));
3919 lsl(tsize, tsize, LogHeapWordSize);
3920 cmp(t1, tsize);
3921 br(Assembler::EQ, ok);
3922 STOP("assert(t1 != tlab size)");
3923 should_not_reach_here();
3924
3925 bind(ok);
3926 ldr(tsize, Address(post(sp, 16)));
3927 }
3928 #endif
3929 str(top, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
3930 str(top, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
3931 add(top, top, t1);
3932 sub(top, top, (int32_t)ThreadLocalAllocBuffer::alignment_reserve_in_bytes());
3933 str(top, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
3934 verify_tlab();
3935 b(retry);
3936
3937 return rthread; // for use by caller
3938 }
3939
3940 // Defines obj, preserves var_size_in_bytes
3941 void MacroAssembler::eden_allocate(Register obj,
3942 Register var_size_in_bytes,
3943 int con_size_in_bytes,
3944 Register t1,
3945 Label& slow_case) {
3946 assert_different_registers(obj, var_size_in_bytes, t1);
3947 if (!Universe::heap()->supports_inline_contig_alloc()) {
3948 b(slow_case);
3949 } else {
3950 Register end = t1;
3951 Register heap_end = rscratch2;
3952 Label retry;
3953 bind(retry);
3954 {
3955 unsigned long offset;
3956 adrp(rscratch1, ExternalAddress((address) Universe::heap()->end_addr()), offset);
3957 ldr(heap_end, Address(rscratch1, offset));
3958 }
3959
3960 ExternalAddress heap_top((address) Universe::heap()->top_addr());
3961
3962 // Get the current top of the heap
3963 {
3964 unsigned long offset;
3965 adrp(rscratch1, heap_top, offset);
3966 // Use add() here after ADRP, rather than lea().
3967 // lea() does not generate anything if its offset is zero.
3968 // However, relocs expect to find either an ADD or a load/store
3969 // insn after an ADRP. add() always generates an ADD insn, even
3970 // for add(Rn, Rn, 0).
3971 add(rscratch1, rscratch1, offset);
3972 ldaxr(obj, rscratch1);
3973 }
3974
3975 // Adjust it by the size of our new object
3976 if (var_size_in_bytes == noreg) {
3977 lea(end, Address(obj, con_size_in_bytes));
3978 } else {
3979 lea(end, Address(obj, var_size_in_bytes));
3980 }
3981
3982 // if end < obj then we wrapped around high memory
3983 cmp(end, obj);
3984 br(Assembler::LO, slow_case);
3985
3986 cmp(end, heap_end);
3987 br(Assembler::HI, slow_case);
3988
3989 // If heap_top hasn't been changed by some other thread, update it.
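// stlxr writes a status word to rscratch2: zero on success, non-zero if
// exclusive access was lost since the matching ldaxr above, in which case
// we loop back and redo the whole allocation attempt.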
3990 stlxr(rscratch2, end, rscratch1);
3991 cbnzw(rscratch2, retry);
3992 }
3993 }
3994
3995 void MacroAssembler::verify_tlab() {
3996 #ifdef ASSERT
3997 if (UseTLAB && VerifyOops) {
3998 Label next, ok;
3999
4000 stp(rscratch2, rscratch1, Address(pre(sp, -16)));
4001
4002 ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4003 ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4004 cmp(rscratch2, rscratch1);
4005 br(Assembler::HS, next);
4006 STOP("assert(top >= start)");
4007 should_not_reach_here();
4008
4009 bind(next);
4010 ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4011 ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4012 cmp(rscratch2, rscratch1);
4013 br(Assembler::HS, ok);
4014 STOP("assert(top <= end)");
4015 should_not_reach_here();
4016
4017 bind(ok);
4018 ldp(rscratch2, rscratch1, Address(post(sp, 16)));
4019 }
4020 #endif
4021 }
4022
4023 // Writes to stack successive pages until offset reached to check for
4024 // stack overflow + shadow pages. This clobbers tmp.
4025 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4026 assert_different_registers(tmp, size, rscratch1);
4027 mov(tmp, sp);
4028 // Bang stack for total size given plus shadow page size.
4029 // Bang one page at a time because large size can bang beyond yellow and
4030 // red zones.
4031 Label loop;
4032 mov(rscratch1, os::vm_page_size());
4033 bind(loop);
4034 lea(tmp, Address(tmp, -os::vm_page_size()));
4035 subsw(size, size, rscratch1);
4036 str(size, Address(tmp));
4037 br(Assembler::GT, loop);
4038
4039 // Bang down shadow pages too.
4040 // At this point, (tmp-0) is the last address touched, so don't
4041 // touch it again. (It was touched as (tmp-pagesize) but then tmp
4042 // was post-decremented.) Skip this address by starting at i=1, and
4043 // touch a few more pages below. N.B. It is important to touch all
4044 // the way down to and including i=StackShadowPages.
4045 for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
4046 // this could be any sized move but this can be a debugging crumb
4047 // so the bigger the better.
4048 lea(tmp, Address(tmp, -os::vm_page_size())); 4049 str(size, Address(tmp)); 4050 } 4051 } 4052 4053 4054 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) { 4055 unsigned long off; 4056 adrp(r, Address(page, rtype), off); 4057 InstructionMark im(this); 4058 code_section()->relocate(inst_mark(), rtype); 4059 ldrw(zr, Address(r, off)); 4060 return inst_mark(); 4061 } 4062 4063 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) { 4064 InstructionMark im(this); 4065 code_section()->relocate(inst_mark(), rtype); 4066 ldrw(zr, Address(r, 0)); 4067 return inst_mark(); 4068 } 4069 4070 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) { 4071 relocInfo::relocType rtype = dest.rspec().reloc()->type(); 4072 unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12; 4073 unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12; 4074 unsigned long dest_page = (unsigned long)dest.target() >> 12; 4075 long offset_low = dest_page - low_page; 4076 long offset_high = dest_page - high_page; 4077 4078 assert(is_valid_AArch64_address(dest.target()), "bad address"); 4079 assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address"); 4080 4081 InstructionMark im(this); 4082 code_section()->relocate(inst_mark(), dest.rspec()); 4083 // 8143067: Ensure that the adrp can reach the dest from anywhere within 4084 // the code cache so that if it is relocated we know it will still reach 4085 if (offset_high >= -(1<<20) && offset_low < (1<<20)) { 4086 _adrp(reg1, dest.target()); 4087 } else { 4088 unsigned long target = (unsigned long)dest.target(); 4089 unsigned long adrp_target 4090 = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL); 4091 4092 _adrp(reg1, (address)adrp_target); 4093 movk(reg1, target >> 32, 32); 4094 } 4095 byte_offset = (unsigned long)dest.target() & 0xfff; 4096 } 4097 4098 void MacroAssembler::load_byte_map_base(Register reg) { 4099 jbyte *byte_map_base = 4100 ((CardTableModRefBS*)(Universe::heap()->barrier_set()))->byte_map_base; 4101 4102 if (is_valid_AArch64_address((address)byte_map_base)) { 4103 // Strictly speaking the byte_map_base isn't an address at all, 4104 // and it might even be negative. 4105 unsigned long offset; 4106 adrp(reg, ExternalAddress((address)byte_map_base), offset); 4107 // We expect offset to be zero with most collectors. 
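// If it is not, the add below folds the low 12 bits back into the register.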
4108 if (offset != 0) {
4109 add(reg, reg, offset);
4110 }
4111 } else {
4112 mov(reg, (uint64_t)byte_map_base);
4113 }
4114 }
4115
4116 void MacroAssembler::build_frame(int framesize) {
4117 assert(framesize > 0, "framesize must be > 0");
4118 if (framesize < ((1 << 9) + 2 * wordSize)) {
4119 sub(sp, sp, framesize);
4120 stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4121 if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
4122 } else {
4123 stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4124 if (PreserveFramePointer) mov(rfp, sp);
4125 if (framesize < ((1 << 12) + 2 * wordSize))
4126 sub(sp, sp, framesize - 2 * wordSize);
4127 else {
4128 mov(rscratch1, framesize - 2 * wordSize);
4129 sub(sp, sp, rscratch1);
4130 }
4131 }
4132 }
4133
4134 void MacroAssembler::remove_frame(int framesize) {
4135 assert(framesize > 0, "framesize must be > 0");
4136 if (framesize < ((1 << 9) + 2 * wordSize)) {
4137 ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4138 add(sp, sp, framesize);
4139 } else {
4140 if (framesize < ((1 << 12) + 2 * wordSize))
4141 add(sp, sp, framesize - 2 * wordSize);
4142 else {
4143 mov(rscratch1, framesize - 2 * wordSize);
4144 add(sp, sp, rscratch1);
4145 }
4146 ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4147 }
4148 }
4149
4150 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4151
4152 // Search for str1 in str2 and return index or -1
4153 void MacroAssembler::string_indexof(Register str2, Register str1,
4154 Register cnt2, Register cnt1,
4155 Register tmp1, Register tmp2,
4156 Register tmp3, Register tmp4,
4157 int icnt1, Register result, int ae) {
4158 Label BM, LINEARSEARCH, DONE, NOMATCH, MATCH;
4159
4160 Register ch1 = rscratch1;
4161 Register ch2 = rscratch2;
4162 Register cnt1tmp = tmp1;
4163 Register cnt2tmp = tmp2;
4164 Register cnt1_neg = cnt1;
4165 Register cnt2_neg = cnt2;
4166 Register result_tmp = tmp4;
4167
4168 bool isL = ae == StrIntrinsicNode::LL;
4169
4170 bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
4171 bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
4172 int str1_chr_shift = str1_isL ? 0:1;
4173 int str2_chr_shift = str2_isL ? 0:1;
4174 int str1_chr_size = str1_isL ? 1:2;
4175 int str2_chr_size = str2_isL ? 1:2;
4176 chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4177 (chr_insn)&MacroAssembler::ldrh;
4178 chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4179 (chr_insn)&MacroAssembler::ldrh;
4180 chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
4181 chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
4182
4183 // Note, inline_string_indexOf() generates checks:
4184 // if (substr.count > string.count) return -1;
4185 // if (substr.count == 0) return 0;
4186
4187 // We have two strings, a source string in str2, cnt2 and a pattern string
4188 // in str1, cnt1. Find the first occurrence of pattern in source or return -1.
4189
4190 // For larger pattern and source we use a simplified Boyer Moore algorithm.
4191 // With a small pattern and source we use linear scan.
4192
4193 if (icnt1 == -1) {
4194 cmp(cnt1, 256); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
4195 ccmp(cnt1, 8, 0b0000, LO); // Can't handle skip >= 256 because we use
4196 br(LO, LINEARSEARCH); // a byte array.
4197 cmp(cnt1, cnt2, LSR, 2); // Source must be 4 * pattern for BM
4198 br(HS, LINEARSEARCH);
4199 }
4200
4201 // The Boyer Moore algorithm is based on the description here:
4202 //
4203 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
4204 //
4205 // This describes an algorithm with 2 shift rules: the 'Bad Character' rule
4206 // and the 'Good Suffix' rule.
4207 //
4208 // These rules are essentially heuristics for how far we can shift the
4209 // pattern along the search string.
4210 //
4211 // The implementation here uses the 'Bad Character' rule only because of the
4212 // complexity of initialisation for the 'Good Suffix' rule.
4213 //
4214 // This is also known as the Boyer-Moore-Horspool algorithm:
4215 //
4216 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
4217 //
4218 // #define ASIZE 128
4219 //
4220 // int bm(unsigned char *x, int m, unsigned char *y, int n) {
4221 //   int i, j;
4222 //   unsigned c;
4223 //   unsigned char bc[ASIZE];
4224 //
4225 //   /* Preprocessing */
4226 //   for (i = 0; i < ASIZE; ++i)
4227 //     bc[i] = 0;
4228 //   for (i = 0; i < m - 1; ) {
4229 //     c = x[i];
4230 //     ++i;
4231 //     if (c < ASIZE) bc[c] = i;
4232 //   }
4233 //
4234 //   /* Searching */
4235 //   j = 0;
4236 //   while (j <= n - m) {
4237 //     c = y[j+m-1];
4238 //     if (x[m-1] == c)
4239 //       for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
4240 //     if (i < 0) return j;
4241 //     if (c < ASIZE)
4242 //       j = j - bc[y[j+m-1]] + m;
4243 //     else
4244 //       j += 1; // Advance by 1 only if char >= ASIZE
4245 //   }
//   return -1; // no match
4246 // }
4247
4248 if (icnt1 == -1) {
4249 BIND(BM);
4250
4251 Label ZLOOP, BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP;
4252 Label BMADV, BMMATCH, BMCHECKEND;
4253
4254 Register cnt1end = tmp2;
4255 Register str2end = cnt2;
4256 Register skipch = tmp2;
4257
4258 // Restrict ASIZE to 128 to reduce stack space/initialisation.
4259 // The presence of chars >= ASIZE in the target string does not affect
4260 // performance, but we must be careful not to initialise them in the stack
4261 // array.
4262 // The presence of chars >= ASIZE in the source string may adversely affect
4263 // performance since we can only advance by one when we encounter one.
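// The stp(zr, zr) stores below clear the 128-byte 'bc' skip table on the
// stack, sixteen bytes per store.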
4264 4265 stp(zr, zr, pre(sp, -128)); 4266 for (int i = 1; i < 8; i++) 4267 stp(zr, zr, Address(sp, i*16)); 4268 4269 mov(cnt1tmp, 0); 4270 sub(cnt1end, cnt1, 1); 4271 BIND(BCLOOP); 4272 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 4273 cmp(ch1, 128); 4274 add(cnt1tmp, cnt1tmp, 1); 4275 br(HS, BCSKIP); 4276 strb(cnt1tmp, Address(sp, ch1)); 4277 BIND(BCSKIP); 4278 cmp(cnt1tmp, cnt1end); 4279 br(LT, BCLOOP); 4280 4281 mov(result_tmp, str2); 4282 4283 sub(cnt2, cnt2, cnt1); 4284 add(str2end, str2, cnt2, LSL, str2_chr_shift); 4285 BIND(BMLOOPSTR2); 4286 sub(cnt1tmp, cnt1, 1); 4287 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 4288 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 4289 cmp(ch1, skipch); 4290 br(NE, BMSKIP); 4291 subs(cnt1tmp, cnt1tmp, 1); 4292 br(LT, BMMATCH); 4293 BIND(BMLOOPSTR1); 4294 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 4295 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 4296 cmp(ch1, ch2); 4297 br(NE, BMSKIP); 4298 subs(cnt1tmp, cnt1tmp, 1); 4299 br(GE, BMLOOPSTR1); 4300 BIND(BMMATCH); 4301 sub(result, str2, result_tmp); 4302 if (!str2_isL) lsr(result, result, 1); 4303 add(sp, sp, 128); 4304 b(DONE); 4305 BIND(BMADV); 4306 add(str2, str2, str2_chr_size); 4307 b(BMCHECKEND); 4308 BIND(BMSKIP); 4309 cmp(skipch, 128); 4310 br(HS, BMADV); 4311 ldrb(ch2, Address(sp, skipch)); 4312 add(str2, str2, cnt1, LSL, str2_chr_shift); 4313 sub(str2, str2, ch2, LSL, str2_chr_shift); 4314 BIND(BMCHECKEND); 4315 cmp(str2, str2end); 4316 br(LE, BMLOOPSTR2); 4317 add(sp, sp, 128); 4318 b(NOMATCH); 4319 } 4320 4321 BIND(LINEARSEARCH); 4322 { 4323 Label DO1, DO2, DO3; 4324 4325 Register str2tmp = tmp2; 4326 Register first = tmp3; 4327 4328 if (icnt1 == -1) 4329 { 4330 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT; 4331 4332 cmp(cnt1, str1_isL == str2_isL ? 
4 : 2); 4333 br(LT, DOSHORT); 4334 4335 sub(cnt2, cnt2, cnt1); 4336 mov(result_tmp, cnt2); 4337 4338 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift))); 4339 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 4340 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift); 4341 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 4342 (this->*str1_load_1chr)(first, Address(str1, cnt1_neg)); 4343 4344 BIND(FIRST_LOOP); 4345 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 4346 cmp(first, ch2); 4347 br(EQ, STR1_LOOP); 4348 BIND(STR2_NEXT); 4349 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4350 br(LE, FIRST_LOOP); 4351 b(NOMATCH); 4352 4353 BIND(STR1_LOOP); 4354 adds(cnt1tmp, cnt1_neg, str1_chr_size); 4355 add(cnt2tmp, cnt2_neg, str2_chr_size); 4356 br(GE, MATCH); 4357 4358 BIND(STR1_NEXT); 4359 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp)); 4360 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 4361 cmp(ch1, ch2); 4362 br(NE, STR2_NEXT); 4363 adds(cnt1tmp, cnt1tmp, str1_chr_size); 4364 add(cnt2tmp, cnt2tmp, str2_chr_size); 4365 br(LT, STR1_NEXT); 4366 b(MATCH); 4367 4368 BIND(DOSHORT); 4369 if (str1_isL == str2_isL) { 4370 cmp(cnt1, 2); 4371 br(LT, DO1); 4372 br(GT, DO3); 4373 } 4374 } 4375 4376 if (icnt1 == 4) { 4377 Label CH1_LOOP; 4378 4379 (this->*load_4chr)(ch1, str1); 4380 sub(cnt2, cnt2, 4); 4381 mov(result_tmp, cnt2); 4382 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 4383 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 4384 4385 BIND(CH1_LOOP); 4386 (this->*load_4chr)(ch2, Address(str2, cnt2_neg)); 4387 cmp(ch1, ch2); 4388 br(EQ, MATCH); 4389 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4390 br(LE, CH1_LOOP); 4391 b(NOMATCH); 4392 } 4393 4394 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) { 4395 Label CH1_LOOP; 4396 4397 BIND(DO2); 4398 (this->*load_2chr)(ch1, str1); 4399 sub(cnt2, cnt2, 2); 4400 mov(result_tmp, cnt2); 4401 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 4402 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 4403 4404 BIND(CH1_LOOP); 4405 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 4406 cmp(ch1, ch2); 4407 br(EQ, MATCH); 4408 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4409 br(LE, CH1_LOOP); 4410 b(NOMATCH); 4411 } 4412 4413 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) { 4414 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP; 4415 4416 BIND(DO3); 4417 (this->*load_2chr)(first, str1); 4418 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size)); 4419 4420 sub(cnt2, cnt2, 3); 4421 mov(result_tmp, cnt2); 4422 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 4423 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 4424 4425 BIND(FIRST_LOOP); 4426 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 4427 cmpw(first, ch2); 4428 br(EQ, STR1_LOOP); 4429 BIND(STR2_NEXT); 4430 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4431 br(LE, FIRST_LOOP); 4432 b(NOMATCH); 4433 4434 BIND(STR1_LOOP); 4435 add(cnt2tmp, cnt2_neg, 2*str2_chr_size); 4436 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 4437 cmp(ch1, ch2); 4438 br(NE, STR2_NEXT); 4439 b(MATCH); 4440 } 4441 4442 if (icnt1 == -1 || icnt1 == 1) { 4443 Label CH1_LOOP, HAS_ZERO; 4444 Label DO1_SHORT, DO1_LOOP; 4445 4446 BIND(DO1); 4447 (this->*str1_load_1chr)(ch1, str1); 4448 cmp(cnt2, 8); 4449 br(LT, DO1_SHORT); 4450 4451 if (str2_isL) { 4452 if (!str1_isL) { 4453 tst(ch1, 0xff00); 4454 br(NE, NOMATCH); 4455 } 4456 orr(ch1, ch1, ch1, LSL, 8); 4457 } 4458 orr(ch1, ch1, ch1, LSL, 16); 4459 orr(ch1, ch1, ch1, LSL, 32); 4460 4461 sub(cnt2, cnt2, 8/str2_chr_size); 4462 
mov(result_tmp, cnt2); 4463 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 4464 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 4465 4466 mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 4467 BIND(CH1_LOOP); 4468 ldr(ch2, Address(str2, cnt2_neg)); 4469 eor(ch2, ch1, ch2); 4470 sub(tmp1, ch2, tmp3); 4471 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4472 bics(tmp1, tmp1, tmp2); 4473 br(NE, HAS_ZERO); 4474 adds(cnt2_neg, cnt2_neg, 8); 4475 br(LT, CH1_LOOP); 4476 4477 cmp(cnt2_neg, 8); 4478 mov(cnt2_neg, 0); 4479 br(LT, CH1_LOOP); 4480 b(NOMATCH); 4481 4482 BIND(HAS_ZERO); 4483 rev(tmp1, tmp1); 4484 clz(tmp1, tmp1); 4485 add(cnt2_neg, cnt2_neg, tmp1, LSR, 3); 4486 b(MATCH); 4487 4488 BIND(DO1_SHORT); 4489 mov(result_tmp, cnt2); 4490 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 4491 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 4492 BIND(DO1_LOOP); 4493 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 4494 cmpw(ch1, ch2); 4495 br(EQ, MATCH); 4496 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4497 br(LT, DO1_LOOP); 4498 } 4499 } 4500 BIND(NOMATCH); 4501 mov(result, -1); 4502 b(DONE); 4503 BIND(MATCH); 4504 add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift); 4505 BIND(DONE); 4506 } 4507 4508 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 4509 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn); 4510 4511 void MacroAssembler::string_indexof_char(Register str1, Register cnt1, 4512 Register ch, Register result, 4513 Register tmp1, Register tmp2, Register tmp3) 4514 { 4515 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 4516 Register cnt1_neg = cnt1; 4517 Register ch1 = rscratch1; 4518 Register result_tmp = rscratch2; 4519 4520 cmp(cnt1, 4); 4521 br(LT, DO1_SHORT); 4522 4523 orr(ch, ch, ch, LSL, 16); 4524 orr(ch, ch, ch, LSL, 32); 4525 4526 sub(cnt1, cnt1, 4); 4527 mov(result_tmp, cnt1); 4528 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 4529 sub(cnt1_neg, zr, cnt1, LSL, 1); 4530 4531 mov(tmp3, 0x0001000100010001); 4532 4533 BIND(CH1_LOOP); 4534 ldr(ch1, Address(str1, cnt1_neg)); 4535 eor(ch1, ch, ch1); 4536 sub(tmp1, ch1, tmp3); 4537 orr(tmp2, ch1, 0x7fff7fff7fff7fff); 4538 bics(tmp1, tmp1, tmp2); 4539 br(NE, HAS_ZERO); 4540 adds(cnt1_neg, cnt1_neg, 8); 4541 br(LT, CH1_LOOP); 4542 4543 cmp(cnt1_neg, 8); 4544 mov(cnt1_neg, 0); 4545 br(LT, CH1_LOOP); 4546 b(NOMATCH); 4547 4548 BIND(HAS_ZERO); 4549 rev(tmp1, tmp1); 4550 clz(tmp1, tmp1); 4551 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 4552 b(MATCH); 4553 4554 BIND(DO1_SHORT); 4555 mov(result_tmp, cnt1); 4556 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 4557 sub(cnt1_neg, zr, cnt1, LSL, 1); 4558 BIND(DO1_LOOP); 4559 ldrh(ch1, Address(str1, cnt1_neg)); 4560 cmpw(ch, ch1); 4561 br(EQ, MATCH); 4562 adds(cnt1_neg, cnt1_neg, 2); 4563 br(LT, DO1_LOOP); 4564 BIND(NOMATCH); 4565 mov(result, -1); 4566 b(DONE); 4567 BIND(MATCH); 4568 add(result, result_tmp, cnt1_neg, ASR, 1); 4569 BIND(DONE); 4570 } 4571 4572 // Compare strings. 
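// Counts arrive in bytes (see the note below). Where possible we compare
// eight bytes at a time; in the mixed LU/UL cases the Latin-1 side is
// inflated on the fly by zip1 with a zeroed vector register.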
4573 void MacroAssembler::string_compare(Register str1, Register str2,
4574 Register cnt1, Register cnt2, Register result,
4575 Register tmp1,
4576 FloatRegister vtmp, FloatRegister vtmpZ, int ae) {
4577 Label LENGTH_DIFF, DONE, SHORT_LOOP, SHORT_STRING,
4578 NEXT_WORD, DIFFERENCE;
4579
4580 bool isLL = ae == StrIntrinsicNode::LL;
4581 bool isLU = ae == StrIntrinsicNode::LU;
4582 bool isUL = ae == StrIntrinsicNode::UL;
4583
4584 bool str1_isL = isLL || isLU;
4585 bool str2_isL = isLL || isUL;
4586
4587 int str1_chr_shift = str1_isL ? 0 : 1;
4588 int str2_chr_shift = str2_isL ? 0 : 1;
4589 int str1_chr_size = str1_isL ? 1 : 2;
4590 int str2_chr_size = str2_isL ? 1 : 2;
4591
4592 chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4593 (chr_insn)&MacroAssembler::ldrh;
4594 chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4595 (chr_insn)&MacroAssembler::ldrh;
4596 uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
4597 (uxt_insn)&MacroAssembler::uxthw;
4598
4599 BLOCK_COMMENT("string_compare {");
4600
4601 // Bizarrely, the counts are passed in bytes, regardless of whether they
4602 // are L or U strings; however, the result is always in characters.
4603 if (!str1_isL) asrw(cnt1, cnt1, 1);
4604 if (!str2_isL) asrw(cnt2, cnt2, 1);
4605
4606 // Compute the minimum of the string lengths and save the difference.
4607 subsw(tmp1, cnt1, cnt2);
4608 cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
4609
4610 // A very short string
4611 cmpw(cnt2, isLL ? 8:4);
4612 br(Assembler::LT, SHORT_STRING);
4613
4614 // Check if the strings start at the same location.
4615 cmp(str1, str2);
4616 br(Assembler::EQ, LENGTH_DIFF);
4617
4618 // Compare longwords
4619 {
4620 subw(cnt2, cnt2, isLL ? 8:4); // The last longword is a special case
4621
4622 // Move both string pointers to the last longword of their
4623 // strings, negate the remaining count, and convert it to bytes.
4624 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4625 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4626 if (isLU || isUL) {
4627 sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4628 eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4629 }
4630 sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4631
4632 // Loop, loading longwords and comparing them into rscratch2.
4633 bind(NEXT_WORD);
4634 if (isLU) {
4635 ldrs(vtmp, Address(str1, cnt1));
4636 zip1(vtmp, T8B, vtmp, vtmpZ);
4637 umov(result, vtmp, D, 0);
4638 } else {
4639 ldr(result, Address(str1, isUL ? cnt1:cnt2));
4640 }
4641 if (isUL) {
4642 ldrs(vtmp, Address(str2, cnt2));
4643 zip1(vtmp, T8B, vtmp, vtmpZ);
4644 umov(rscratch1, vtmp, D, 0);
4645 } else {
4646 ldr(rscratch1, Address(str2, cnt2));
4647 }
4648 adds(cnt2, cnt2, isUL ? 4:8);
4649 if (isLU || isUL) add(cnt1, cnt1, isLU ? 4:8);
4650 eor(rscratch2, result, rscratch1);
4651 cbnz(rscratch2, DIFFERENCE);
4652 br(Assembler::LT, NEXT_WORD);
4653
4654 // Last longword. In the case where length == 4 we compare the
4655 // same longword twice, but that's still faster than another
4656 // conditional branch.
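// These final loads may overlap bytes that have already been compared;
// that is harmless since we only need to find the first difference, if any.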
4657 4658 if (isLU) { 4659 ldrs(vtmp, Address(str1)); 4660 zip1(vtmp, T8B, vtmp, vtmpZ); 4661 umov(result, vtmp, D, 0); 4662 } else { 4663 ldr(result, Address(str1)); 4664 } 4665 if (isUL) { 4666 ldrs(vtmp, Address(str2)); 4667 zip1(vtmp, T8B, vtmp, vtmpZ); 4668 umov(rscratch1, vtmp, D, 0); 4669 } else { 4670 ldr(rscratch1, Address(str2)); 4671 } 4672 eor(rscratch2, result, rscratch1); 4673 cbz(rscratch2, LENGTH_DIFF); 4674 4675 // Find the first different characters in the longwords and 4676 // compute their difference. 4677 bind(DIFFERENCE); 4678 rev(rscratch2, rscratch2); 4679 clz(rscratch2, rscratch2); 4680 andr(rscratch2, rscratch2, isLL ? -8 : -16); 4681 lsrv(result, result, rscratch2); 4682 (this->*ext_chr)(result, result); 4683 lsrv(rscratch1, rscratch1, rscratch2); 4684 (this->*ext_chr)(rscratch1, rscratch1); 4685 subw(result, result, rscratch1); 4686 b(DONE); 4687 } 4688 4689 bind(SHORT_STRING); 4690 // Is the minimum length zero? 4691 cbz(cnt2, LENGTH_DIFF); 4692 4693 bind(SHORT_LOOP); 4694 (this->*str1_load_chr)(result, Address(post(str1, str1_chr_size))); 4695 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 4696 subw(result, result, cnt1); 4697 cbnz(result, DONE); 4698 sub(cnt2, cnt2, 1); 4699 cbnz(cnt2, SHORT_LOOP); 4700 4701 // Strings are equal up to min length. Return the length difference. 4702 bind(LENGTH_DIFF); 4703 mov(result, tmp1); 4704 4705 // That's it 4706 bind(DONE); 4707 4708 BLOCK_COMMENT("} string_compare"); 4709 } 4710 4711 // Compare Strings or char/byte arrays. 4712 4713 // is_string is true iff this is a string comparison. 4714 4715 // For Strings we're passed the address of the first characters in a1 4716 // and a2 and the length in cnt1. 4717 4718 // For byte and char arrays we're passed the arrays themselves and we 4719 // have to extract length fields and do null checks here. 4720 4721 // elem_size is the element size in bytes: either 1 or 2. 4722 4723 // There are two implementations. For arrays >= 8 bytes, all 4724 // comparisons (including the final one, which may overlap) are 4725 // performed 8 bytes at a time. For arrays < 8 bytes, we compare a 4726 // halfword, then a short, and then a byte. 4727 4728 void MacroAssembler::arrays_equals(Register a1, Register a2, 4729 Register result, Register cnt1, 4730 int elem_size, bool is_string) 4731 { 4732 Label SAME, DONE, SHORT, NEXT_WORD, ONE; 4733 Register tmp1 = rscratch1; 4734 Register tmp2 = rscratch2; 4735 Register cnt2 = tmp2; // cnt2 only used in array length compare 4736 int elem_per_word = wordSize/elem_size; 4737 int log_elem_size = exact_log2(elem_size); 4738 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4739 int base_offset 4740 = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE); 4741 4742 assert(elem_size == 1 || elem_size == 2, "must be char or byte"); 4743 assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2); 4744 4745 #ifndef PRODUCT 4746 { 4747 const char kind = (elem_size == 2) ? 'U' : 'L'; 4748 char comment[64]; 4749 snprintf(comment, sizeof comment, "%s%c%s {", 4750 is_string ? 
"string_equals" : "array_equals", 4751 kind, "{"); 4752 BLOCK_COMMENT(comment); 4753 } 4754 #endif 4755 4756 mov(result, false); 4757 4758 if (!is_string) { 4759 // if (a==a2) 4760 // return true; 4761 eor(rscratch1, a1, a2); 4762 cbz(rscratch1, SAME); 4763 // if (a==null || a2==null) 4764 // return false; 4765 cbz(a1, DONE); 4766 cbz(a2, DONE); 4767 // if (a1.length != a2.length) 4768 // return false; 4769 ldrw(cnt1, Address(a1, length_offset)); 4770 ldrw(cnt2, Address(a2, length_offset)); 4771 eorw(tmp1, cnt1, cnt2); 4772 cbnzw(tmp1, DONE); 4773 4774 lea(a1, Address(a1, base_offset)); 4775 lea(a2, Address(a2, base_offset)); 4776 } 4777 4778 // Check for short strings, i.e. smaller than wordSize. 4779 subs(cnt1, cnt1, elem_per_word); 4780 br(Assembler::LT, SHORT); 4781 // Main 8 byte comparison loop. 4782 bind(NEXT_WORD); { 4783 ldr(tmp1, Address(post(a1, wordSize))); 4784 ldr(tmp2, Address(post(a2, wordSize))); 4785 subs(cnt1, cnt1, elem_per_word); 4786 eor(tmp1, tmp1, tmp2); 4787 cbnz(tmp1, DONE); 4788 } br(GT, NEXT_WORD); 4789 // Last longword. In the case where length == 4 we compare the 4790 // same longword twice, but that's still faster than another 4791 // conditional branch. 4792 // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when 4793 // length == 4. 4794 if (log_elem_size > 0) 4795 lsl(cnt1, cnt1, log_elem_size); 4796 ldr(tmp1, Address(a1, cnt1)); 4797 ldr(tmp2, Address(a2, cnt1)); 4798 eor(tmp1, tmp1, tmp2); 4799 cbnz(tmp1, DONE); 4800 b(SAME); 4801 4802 bind(SHORT); 4803 Label TAIL03, TAIL01; 4804 4805 tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left. 4806 { 4807 ldrw(tmp1, Address(post(a1, 4))); 4808 ldrw(tmp2, Address(post(a2, 4))); 4809 eorw(tmp1, tmp1, tmp2); 4810 cbnzw(tmp1, DONE); 4811 } 4812 bind(TAIL03); 4813 tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left. 4814 { 4815 ldrh(tmp1, Address(post(a1, 2))); 4816 ldrh(tmp2, Address(post(a2, 2))); 4817 eorw(tmp1, tmp1, tmp2); 4818 cbnzw(tmp1, DONE); 4819 } 4820 bind(TAIL01); 4821 if (elem_size == 1) { // Only needed when comparing byte arrays. 4822 tbz(cnt1, 0, SAME); // 0-1 bytes left. 4823 { 4824 ldrb(tmp1, a1); 4825 ldrb(tmp2, a2); 4826 eorw(tmp1, tmp1, tmp2); 4827 cbnzw(tmp1, DONE); 4828 } 4829 } 4830 // Arrays are equal. 4831 bind(SAME); 4832 mov(result, true); 4833 4834 // That's it. 4835 bind(DONE); 4836 BLOCK_COMMENT(is_string ? "} string_equals" : "} array_equals"); 4837 } 4838 4839 4840 // base: Address of a buffer to be zeroed, 8 bytes aligned. 4841 // cnt: Count in HeapWords. 4842 // is_large: True when 'cnt' is known to be >= BlockZeroingLowLimit. 4843 void MacroAssembler::zero_words(Register base, Register cnt) 4844 { 4845 if (UseBlockZeroing) { 4846 block_zero(base, cnt); 4847 } else { 4848 fill_words(base, cnt, zr); 4849 } 4850 } 4851 4852 // r10 = base: Address of a buffer to be zeroed, 8 bytes aligned. 4853 // cnt: Immediate count in HeapWords. 
4854 // r11 = tmp: For use as cnt if we need to call out 4855 #define ShortArraySize (18 * BytesPerLong) 4856 void MacroAssembler::zero_words(Register base, u_int64_t cnt) 4857 { 4858 Register tmp = r11; 4859 int i = cnt & 1; // store any odd word to start 4860 if (i) str(zr, Address(base)); 4861 4862 if (cnt <= ShortArraySize / BytesPerLong) { 4863 for (; i < (int)cnt; i += 2) 4864 stp(zr, zr, Address(base, i * wordSize)); 4865 } else if (UseBlockZeroing && cnt >= (u_int64_t)(BlockZeroingLowLimit >> LogBytesPerWord)) { 4866 mov(tmp, cnt); 4867 block_zero(base, tmp, true); 4868 } else { 4869 const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll 4870 int remainder = cnt % (2 * unroll); 4871 for (; i < remainder; i += 2) 4872 stp(zr, zr, Address(base, i * wordSize)); 4873 4874 Label loop; 4875 Register cnt_reg = rscratch1; 4876 Register loop_base = rscratch2; 4877 cnt = cnt - remainder; 4878 mov(cnt_reg, cnt); 4879 // adjust base and prebias by -2 * wordSize so we can pre-increment 4880 add(loop_base, base, (remainder - 2) * wordSize); 4881 bind(loop); 4882 sub(cnt_reg, cnt_reg, 2 * unroll); 4883 for (i = 1; i < unroll; i++) 4884 stp(zr, zr, Address(loop_base, 2 * i * wordSize)); 4885 stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize))); 4886 cbnz(cnt_reg, loop); 4887 } 4888 } 4889 4890 // base: Address of a buffer to be filled, 8 bytes aligned. 4891 // cnt: Count in 8-byte unit. 4892 // value: Value to be filled with. 4893 // base will point to the end of the buffer after filling. 4894 void MacroAssembler::fill_words(Register base, Register cnt, Register value) 4895 { 4896 // Algorithm: 4897 // 4898 // scratch1 = cnt & 7; 4899 // cnt -= scratch1; 4900 // p += scratch1; 4901 // switch (scratch1) { 4902 // do { 4903 // cnt -= 8; 4904 // p[-8] = v; 4905 // case 7: 4906 // p[-7] = v; 4907 // case 6: 4908 // p[-6] = v; 4909 // // ... 4910 // case 1: 4911 // p[-1] = v; 4912 // case 0: 4913 // p += 8; 4914 // } while (cnt); 4915 // } 4916 4917 assert_different_registers(base, cnt, value, rscratch1, rscratch2); 4918 4919 Label fini, skip, entry, loop; 4920 const int unroll = 8; // Number of stp instructions we'll unroll 4921 4922 cbz(cnt, fini); 4923 tbz(base, 3, skip); 4924 str(value, Address(post(base, 8))); 4925 sub(cnt, cnt, 1); 4926 bind(skip); 4927 4928 andr(rscratch1, cnt, (unroll-1) * 2); 4929 sub(cnt, cnt, rscratch1); 4930 add(base, base, rscratch1, Assembler::LSL, 3); 4931 adr(rscratch2, entry); 4932 sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1); 4933 br(rscratch2); 4934 4935 bind(loop); 4936 add(base, base, unroll * 16); 4937 for (int i = -unroll; i < 0; i++) 4938 stp(value, value, Address(base, i * 16)); 4939 bind(entry); 4940 subs(cnt, cnt, unroll * 2); 4941 br(Assembler::GE, loop); 4942 4943 tbz(cnt, 0, fini); 4944 str(value, Address(post(base, 8))); 4945 bind(fini); 4946 } 4947 4948 // Use DC ZVA to do fast zeroing. 4949 // base: Address of a buffer to be zeroed, 8 bytes aligned. 4950 // cnt: Count in HeapWords. 4951 // is_large: True when 'cnt' is known to be >= BlockZeroingLowLimit. 4952 void MacroAssembler::block_zero(Register base, Register cnt, bool is_large) 4953 { 4954 Label small; 4955 Label store_pair, loop_store_pair, done; 4956 Label base_aligned; 4957 4958 assert_different_registers(base, cnt, rscratch1); 4959 guarantee(base == r10 && cnt == r11, "fix register usage"); 4960 4961 Register tmp = rscratch1; 4962 Register tmp2 = rscratch2; 4963 int zva_length = VM_Version::zva_length(); 4964 4965 // Ensure ZVA length can be divided by 16. 
This is required by 4966 // the subsequent operations. 4967 assert (zva_length % 16 == 0, "Unexpected ZVA Length"); 4968 4969 if (!is_large) cbz(cnt, done); 4970 tbz(base, 3, base_aligned); 4971 str(zr, Address(post(base, 8))); 4972 sub(cnt, cnt, 1); 4973 bind(base_aligned); 4974 4975 // Ensure count >= zva_length * 2 so that it still deserves a zva after 4976 // alignment. 4977 if (!is_large || !(BlockZeroingLowLimit >= zva_length * 2)) { 4978 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit); 4979 subs(tmp, cnt, low_limit >> 3); 4980 br(Assembler::LT, small); 4981 } 4982 4983 far_call(StubRoutines::aarch64::get_zero_longs()); 4984 4985 bind(small); 4986 4987 const int unroll = 8; // Number of stp instructions we'll unroll 4988 Label small_loop, small_table_end; 4989 4990 andr(tmp, cnt, (unroll-1) * 2); 4991 sub(cnt, cnt, tmp); 4992 add(base, base, tmp, Assembler::LSL, 3); 4993 adr(tmp2, small_table_end); 4994 sub(tmp2, tmp2, tmp, Assembler::LSL, 1); 4995 br(tmp2); 4996 4997 bind(small_loop); 4998 add(base, base, unroll * 16); 4999 for (int i = -unroll; i < 0; i++) 5000 stp(zr, zr, Address(base, i * 16)); 5001 bind(small_table_end); 5002 subs(cnt, cnt, unroll * 2); 5003 br(Assembler::GE, small_loop); 5004 5005 tbz(cnt, 0, done); 5006 str(zr, Address(post(base, 8))); 5007 5008 bind(done); 5009 } 5010 5011 // Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and 5012 // java/lang/StringUTF16.compress. 5013 void MacroAssembler::encode_iso_array(Register src, Register dst, 5014 Register len, Register result, 5015 FloatRegister Vtmp1, FloatRegister Vtmp2, 5016 FloatRegister Vtmp3, FloatRegister Vtmp4) 5017 { 5018 Label DONE, NEXT_32, LOOP_8, NEXT_8, LOOP_1, NEXT_1; 5019 Register tmp1 = rscratch1; 5020 5021 mov(result, len); // Save initial len 5022 5023 #ifndef BUILTIN_SIM 5024 subs(len, len, 32); 5025 br(LT, LOOP_8); 5026 5027 // The following code uses the SIMD 'uqxtn' and 'uqxtn2' instructions 5028 // to convert chars to bytes. These set the 'QC' bit in the FPSR if 5029 // any char could not fit in a byte, so clear the FPSR so we can test it. 5030 clear_fpsr(); 5031 5032 BIND(NEXT_32); 5033 ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src); 5034 uqxtn(Vtmp1, T8B, Vtmp1, T8H); // uqxtn - write bottom half 5035 uqxtn(Vtmp1, T16B, Vtmp2, T8H); // uqxtn2 - write top half 5036 uqxtn(Vtmp2, T8B, Vtmp3, T8H); 5037 uqxtn(Vtmp2, T16B, Vtmp4, T8H); // uqxtn2 5038 get_fpsr(tmp1); 5039 cbnzw(tmp1, LOOP_8); 5040 st1(Vtmp1, Vtmp2, T16B, post(dst, 32)); 5041 subs(len, len, 32); 5042 add(src, src, 64); 5043 br(GE, NEXT_32); 5044 5045 BIND(LOOP_8); 5046 adds(len, len, 32-8); 5047 br(LT, LOOP_1); 5048 clear_fpsr(); // QC may be set from loop above, clear again 5049 BIND(NEXT_8); 5050 ld1(Vtmp1, T8H, src); 5051 uqxtn(Vtmp1, T8B, Vtmp1, T8H); 5052 get_fpsr(tmp1); 5053 cbnzw(tmp1, LOOP_1); 5054 st1(Vtmp1, T8B, post(dst, 8)); 5055 subs(len, len, 8); 5056 add(src, src, 16); 5057 br(GE, NEXT_8); 5058 5059 BIND(LOOP_1); 5060 adds(len, len, 8); 5061 br(LE, DONE); 5062 #else 5063 cbz(len, DONE); 5064 #endif 5065 BIND(NEXT_1); 5066 ldrh(tmp1, Address(post(src, 2))); 5067 tst(tmp1, 0xff00); 5068 br(NE, DONE); 5069 strb(tmp1, Address(post(dst, 1))); 5070 subs(len, len, 1); 5071 br(GT, NEXT_1); 5072 5073 BIND(DONE); 5074 sub(result, result, len); // Return index where we stopped 5075 // Return len == 0 if we processed all 5076 // characters 5077 } 5078 5079 5080 // Inflate byte[] array to char[]. 
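// Each byte is zero-extended to a 16-bit char. The SIMD path does this
// with zip1 against a zeroed vector (vtmp1), interleaving every source
// byte with 0x00, which on a little-endian target yields the char values
// directly.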
5081 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len, 5082 FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, 5083 Register tmp4) { 5084 Label big, done; 5085 5086 assert_different_registers(src, dst, len, tmp4, rscratch1); 5087 5088 fmovd(vtmp1 , zr); 5089 lsrw(rscratch1, len, 3); 5090 5091 cbnzw(rscratch1, big); 5092 5093 // Short string: less than 8 bytes. 5094 { 5095 Label loop, around, tiny; 5096 5097 subsw(len, len, 4); 5098 andw(len, len, 3); 5099 br(LO, tiny); 5100 5101 // Use SIMD to do 4 bytes. 5102 ldrs(vtmp2, post(src, 4)); 5103 zip1(vtmp3, T8B, vtmp2, vtmp1); 5104 strd(vtmp3, post(dst, 8)); 5105 5106 cbzw(len, done); 5107 5108 // Do the remaining bytes by steam. 5109 bind(loop); 5110 ldrb(tmp4, post(src, 1)); 5111 strh(tmp4, post(dst, 2)); 5112 subw(len, len, 1); 5113 5114 bind(tiny); 5115 cbnz(len, loop); 5116 5117 bind(around); 5118 b(done); 5119 } 5120 5121 // Unpack the bytes 8 at a time. 5122 bind(big); 5123 andw(len, len, 7); 5124 5125 { 5126 Label loop, around; 5127 5128 bind(loop); 5129 ldrd(vtmp2, post(src, 8)); 5130 sub(rscratch1, rscratch1, 1); 5131 zip1(vtmp3, T16B, vtmp2, vtmp1); 5132 st1(vtmp3, T8H, post(dst, 16)); 5133 cbnz(rscratch1, loop); 5134 5135 bind(around); 5136 } 5137 5138 // Do the tail of up to 8 bytes. 5139 sub(src, src, 8); 5140 add(src, src, len, ext::uxtw, 0); 5141 ldrd(vtmp2, Address(src)); 5142 sub(dst, dst, 16); 5143 add(dst, dst, len, ext::uxtw, 1); 5144 zip1(vtmp3, T16B, vtmp2, vtmp1); 5145 st1(vtmp3, T8H, Address(dst)); 5146 5147 bind(done); 5148 } 5149 5150 // Compress char[] array to byte[]. 5151 void MacroAssembler::char_array_compress(Register src, Register dst, Register len, 5152 FloatRegister tmp1Reg, FloatRegister tmp2Reg, 5153 FloatRegister tmp3Reg, FloatRegister tmp4Reg, 5154 Register result) { 5155 encode_iso_array(src, dst, len, result, 5156 tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg); 5157 cmp(len, zr); 5158 csel(result, result, zr, EQ); 5159 } 5160 5161 // get_thread() can be called anywhere inside generated code so we 5162 // need to save whatever non-callee save context might get clobbered 5163 // by the call to JavaThread::aarch64_get_thread_helper() or, indeed, 5164 // the call setup code. 5165 // 5166 // aarch64_get_thread_helper() clobbers only r0, r1, and flags. 5167 // 5168 void MacroAssembler::get_thread(Register dst) { 5169 RegSet saved_regs = RegSet::range(r0, r1) + lr - dst; 5170 push(saved_regs, sp); 5171 5172 mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper)); 5173 blrt(lr, 1, 0, 1); 5174 if (dst != c_rarg0) { 5175 mov(dst, c_rarg0); 5176 } 5177 5178 pop(saved_regs, sp); 5179 }