/*
 * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018, SAP SE. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "compiler/disassembler.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/cardTableBarrierSet.hpp"
#include "gc/shared/collectedHeap.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_ppc.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/os.hpp"
#include "runtime/safepoint.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/macros.hpp"
#if INCLUDE_ALL_GCS
#include "gc/g1/g1BarrierSet.hpp"
#include "gc/g1/g1CardTable.hpp"
#include "gc/g1/g1CollectedHeap.inline.hpp"
#include "gc/g1/heapRegion.hpp"
#endif // INCLUDE_ALL_GCS
#ifdef COMPILER2
#include "opto/intrinsicnode.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) // nothing
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef ASSERT
// On RISC, there's no benefit to verifying instruction boundaries.
bool AbstractAssembler::pd_check_instruction_mark() { return false; }
#endif

void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  if (Assembler::is_simm(si31, 16)) {
    ld(d, si31, a);
    if (emit_filler_nop) nop();
  } else {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
    addis(d, a, hi);
    ld(d, lo, d);
  }
}

void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  assert_different_registers(d, a);
  ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
}

void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                      size_t size_in_bytes, bool is_signed) {
  switch (size_in_bytes) {
  case  8:              ld(dst, offs, base);                         break;
  case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  default:  ShouldNotReachHere();
  }
}

void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                       size_t size_in_bytes) {
  switch (size_in_bytes) {
  case  8:  std(dst, offs, base); break;
  case  4:  stw(dst, offs, base); break;
  case  2:  sth(dst, offs, base); break;
  case  1:  stb(dst, offs, base); break;
  default:  ShouldNotReachHere();
  }
}

// Pad with nops until (offset() % modulus) == rem, but emit nothing if more
// than 'max' padding bytes would be needed.
void MacroAssembler::align(int modulus, int max, int rem) {
  int padding = (rem + modulus - (offset() % modulus)) % modulus;
  if (padding > max) return;
  for (int c = (padding >> 2); c > 0; --c) { nop(); }
}

// Issue instructions that calculate given TOC from global TOC.
void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
                                                       bool add_relocation, bool emit_dummy_addr) {
  int offset = -1;
  if (emit_dummy_addr) {
    offset = -128; // dummy address
  } else if (addr != (address)(intptr_t)-1) {
    offset = MacroAssembler::offset_to_global_toc(addr);
  }

  if (hi16) {
    addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
  }
  if (lo16) {
    if (add_relocation) {
      // Relocate at the addi to avoid confusion with a load from the method's TOC.
      relocate(internal_word_Relocation::spec(addr));
    }
    addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
  }
}

address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
  const int offset = MacroAssembler::offset_to_global_toc(addr);

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
  set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
  set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
  return inst1_addr;
}

address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
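  // The sequence being decoded looks like this (illustrative sketch; see
  // calculate_address_from_global_toc above — unrelated instructions may
  // sit between the two halves, but none that write dst):
  //   addis dst, R29_TOC, offset_hi16
  //   ...
  //   addi  dst, dst, offset_lo16      // <- the relocation points here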
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // stop, found the addis which writes dst
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");

  int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
  // -1 is a special case
  if (offset == -1) {
    return (address)(intptr_t)-1;
  } else {
    return global_toc() + offset;
  }
}

#ifdef _LP64
// Patch compressed oops or klass constants.
// Assembler sequence is
// 1) compressed oops:
//    lis  rx = const.hi
//    ori  rx = rx | const.lo
// 2) compressed klass:
//    lis  rx = const.hi
//    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
//    ori  rx = rx | const.lo
// A clrldi, if present, is passed over (not patched).
address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;
  while (inst1_addr >= bound) {
    inst1 = *(int *)inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  int xc = (data >> 16) & 0xffff;
  int xd = (data >>  0) & 0xffff;

  set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
  set_imm((int *)inst2_addr, (xd)); // unsigned int
  return inst1_addr;
}

// Get compressed oop or klass constant.
narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;

  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
  uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);

  return (int) (xl | xh);
}
#endif // _LP64

// Returns true if successful.
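// The emitted pattern is one of the following (illustrative; which one is
// chosen depends on whether the TOC offset fits in 16 bits, see
// ld_largeoffset_unchecked above):
//   ld    dst, toc_offset_lo16(toc)
// or
//   addis dst, toc, toc_offset_hi16
//   ld    dst, toc_offset_lo16(dst)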
bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
                                                Register toc, bool fixed_size) {
  int toc_offset = 0;
  // Use RelocationHolder::none for the constant pool entry, otherwise
  // we will end up with a failing NativeCall::verify(x) where x is
  // the address of the constant pool entry.
  // FIXME: We should insert relocation information for oops at the constant
  // pool entries instead of inserting it at the loads; patching of a constant
  // pool entry should be less expensive.
  address const_address = address_constant((address)a.value(), RelocationHolder::none);
  if (const_address == NULL) { return false; } // allocation failure
  // Relocate at the pc of the load.
  relocate(a.rspec());
  toc_offset = (int)(const_address - code()->consts()->start());
  ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
  return true;
}

bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  // The relocation points to the ld or the addis.
  return (is_ld(inst1)) ||
         (is_addis(inst1) && inv_ra_field(inst1) != 0);
}

int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
  assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");

  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  if (is_ld(inst1)) {
    return inv_d1_field(inst1);
  } else if (is_addis(inst1)) {
    const int dst = inv_rt_field(inst1);

    // Now, find the succeeding ld which reads and writes to dst.
    address inst2_addr = inst1_addr + BytesPerInstWord;
    int inst2 = 0;
    while (true) {
      inst2 = *(int *) inst2_addr;
      if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
        // Stop, found the ld which reads and writes dst.
        break;
      }
      inst2_addr += BytesPerInstWord;
    }
    return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
  }
  ShouldNotReachHere();
  return 0;
}

// Get the constant from a `load_const' sequence.
long MacroAssembler::get_const(address a) {
  assert(is_load_const_at(a), "not a load of a constant");
  const int *p = (const int*) a;
  unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
  if (is_ori(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
  } else if (is_lis(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
  } else {
    ShouldNotReachHere();
    return (long) 0;
  }
  return (long) x;
}

// Patch the 64-bit constant of a `load_const' sequence. This is a low-level
// procedure. It neither flushes the instruction cache nor is it MT-safe.
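// A typical call site therefore flushes the icache itself (illustrative
// sketch, mirroring what set_dest_of_bc_far_at below does; the canonical
// `load_const' sequence is 5 instructions long, matching the immediate
// fields decoded by get_const above):
//   MacroAssembler::patch_const(code_addr, (long)new_value);
//   ICache::ppc64_flush_icache_bytes(code_addr, 5 * BytesPerInstWord);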
void MacroAssembler::patch_const(address a, long x) {
  assert(is_load_const_at(a), "not a load of a constant");
  int *p = (int*) a;
  if (is_ori(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(1 + p, (x >> 32) & 0xffff);
    set_imm(3 + p, (x >> 16) & 0xffff);
    set_imm(4 + p, x & 0xffff);
  } else if (is_lis(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(2 + p, (x >> 32) & 0xffff);
    set_imm(1 + p, (x >> 16) & 0xffff);
    set_imm(3 + p, x & 0xffff);
  } else {
    ShouldNotReachHere();
  }
}

AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->allocate_metadata_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->allocate_oop_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp, int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0) {
    return RegisterOrConstant(value + offset);
  }

  // Load indirectly to solve generation ordering problem.
  // static address, no relocation
  int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
  ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)

  if (offset != 0) {
    addi(tmp, tmp, offset);
  }

  return RegisterOrConstant(tmp);
}

#ifndef PRODUCT
void MacroAssembler::pd_print_patched_instruction(address branch) {
  Unimplemented(); // TODO: PPC port
}
#endif // ndef PRODUCT

// Conditional far branch for destinations encodable in 24+2 bits.
void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {

  // If requested by flag optimize, relocate the bc_far as a
  // runtime_call and prepare for optimizing it when the code gets
  // relocated.
  if (optimize == bc_far_optimize_on_relocate) {
    relocate(relocInfo::runtime_call_type);
  }

  // variant 2:
  //
  //    b!cxx SKIP
  //    bxx   DEST
  //  SKIP:
  //

  const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                opposite_bcond(inv_boint_bcond(boint)));

  // We emit two branches.
  // First, a conditional branch which jumps around the far branch.
  const address not_taken_pc = pc() + 2 * BytesPerInstWord;
  const address bc_pc        = pc();
  bc(opposite_boint, biint, not_taken_pc);

  const int bc_instr = *(int*)bc_pc;
  assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
  assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
  assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
                                     opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
         "postcondition");
  assert(biint == inv_bi_field(bc_instr), "postcondition");

  // Second, an unconditional far branch which jumps to dest.
  // Note: target(dest) remembers the current pc (see CodeSection::target)
  // and returns the current pc if the label is not bound yet; when
  // the label gets bound, the unconditional far branch will be patched.
  const address target_pc = target(dest);
  const address b_pc = pc();
  b(target_pc);

  assert(not_taken_pc == pc(),                 "postcondition");
  assert(dest.is_bound() || target_pc == b_pc, "postcondition");
}

// 1 or 2 instructions
void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
  if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
    bc(boint, biint, dest);
  } else {
    bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
  }
}

bool MacroAssembler::is_bc_far_at(address instruction_addr) {
  return is_bc_far_variant1_at(instruction_addr) ||
         is_bc_far_variant2_at(instruction_addr) ||
         is_bc_far_variant3_at(instruction_addr);
}

address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
  if (is_bc_far_variant1_at(instruction_addr)) {
    const address instruction_1_addr = instruction_addr;
    const int instruction_1 = *(int*)instruction_1_addr;
    return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
  } else if (is_bc_far_variant2_at(instruction_addr)) {
    const address instruction_2_addr = instruction_addr + 4;
    return bxx_destination(instruction_2_addr);
  } else if (is_bc_far_variant3_at(instruction_addr)) {
    return instruction_addr + 8;
  }
  // variant 4 ???
  ShouldNotReachHere();
  return NULL;
}

void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {

  if (is_bc_far_variant3_at(instruction_addr)) {
    // variant 3, far cond branch to the next instruction, already patched to nops:
    //
    //    nop
    //    endgroup
    //  SKIP/DEST:
    //
    return;
  }

  // first, extract boint and biint from the current branch
  int boint = 0;
  int biint = 0;

  ResourceMark rm;
  const int code_size = 2 * BytesPerInstWord;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
    // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
    masm.nop();
    masm.endgroup();
  } else {
    if (is_bc_far_variant1_at(instruction_addr)) {
      // variant 1, the 1st instruction contains the destination address:
      //
      //    bcxx  DEST
      //    nop
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = inv_bo_field(instruction_1);
      biint = inv_bi_field(instruction_1);
    } else if (is_bc_far_variant2_at(instruction_addr)) {
      // variant 2, the 2nd instruction contains the destination address:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
                                 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
      biint = inv_bi_field(instruction_1);
    } else {
      // variant 4???
      ShouldNotReachHere();
    }

    // second, set the new branch destination and optimize the code
    if (dest != instruction_addr + 4 && // the bc_far is still unbound!
        masm.is_within_range_of_bcxx(dest, instruction_addr)) {
      // variant 1:
      //
      //    bcxx  DEST
      //    nop
      //
      masm.bc(boint, biint, dest);
      masm.nop();
    } else {
      // variant 2:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                    opposite_bcond(inv_boint_bcond(boint)));
      const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
      masm.bc(opposite_boint, biint, not_taken_pc);
      masm.b(dest);
    }
  }
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Emit a NOT mt-safe patchable 64 bit absolute call/jump.
void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
  // get current pc
  uint64_t start_pc = (uint64_t) pc();

  const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
  const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first

  // relocate here
  if (rt != relocInfo::none) {
    relocate(rt);
  }

  if ( ReoptimizeCallSequences &&
       (( link && is_within_range_of_b(dest, pc_of_bl)) ||
        (!link && is_within_range_of_b(dest, pc_of_b)))) {
    // variant 2:
    // Emit an optimized, pc-relative call/jump.

    if (link) {
      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();

      // do the call
      assert(pc() == pc_of_bl, "just checking");
      bl(dest, relocInfo::none);
    } else {
      // do the jump
      assert(pc() == pc_of_b, "just checking");
      b(dest, relocInfo::none);

      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();
    }

    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
           "can't identify emitted call");
  } else {
    // variant 1:
    mr(R0, R11);  // spill R11 -> R0.

    // Load the destination address into CTR,
    // calculate destination relative to global toc.
    calculate_address_from_global_toc(R11, dest, true, true, false);

    mtctr(R11);
    mr(R11, R0);  // spill R11 <- R0.
    nop();

    // do the call/jump
    if (link) {
      bctrl();
    } else {
      bctr();
    }
    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
           "can't identify emitted call");
  }

  // Assert that we can identify the emitted call/jump.
  assert(is_bxx64_patchable_at((address)start_pc, link),
         "can't identify emitted call");
  assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
         "wrong encoding of dest address");
}

// Identify a bxx64_patchable instruction.
bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
  return is_bxx64_patchable_variant1b_at(instruction_addr, link)
    //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
      || is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Does the call64_patchable instruction use a pc-relative encoding of
// the call destination?
bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
  // variant 2 is pc-relative
  return is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Identify variant 1.
bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[5]) // mtctr
      && is_load_const_at(instruction_addr);
}

// Identify variant 1b: load destination relative to global toc.
bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[3]) // mtctr
      && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
}

// Identify variant 2.
bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  if (link) {
    return is_bl (instr[6])  // bl dest is last
        && is_nop(instr[0])  // nop
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5]); // nop
  } else {
    return is_b  (instr[0])  // b dest is first
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5])  // nop
        && is_nop(instr[6]); // nop
  }
}

// Set dest address of a bxx64_patchable instruction.
void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
  ResourceMark rm;
  int code_size = MacroAssembler::bxx64_patchable_size;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  masm.bxx64_patchable(dest, relocInfo::none, link);
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Get dest address of a bxx64_patchable instruction.
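// Round-trip property (illustrative; bxx64_patchable asserts exactly this
// after emitting a call site):
//   set_dest_of_bxx64_patchable_at(addr, dest, link);
//   assert(get_dest_of_bxx64_patchable_at(addr, link) == dest, "must match");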
address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
  if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
    return (address) (unsigned long) get_const(instruction_addr);
  } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
    unsigned int* instr = (unsigned int*) instruction_addr;
    if (link) {
      const int instr_idx = 6; // bl is last
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    } else {
      const int instr_idx = 0; // b is first
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    }
  // Load dest relative to global toc.
  } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
    return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
                                                               instruction_addr);
  } else {
    ShouldNotReachHere();
    return NULL;
  }
}

// Uses ordering which corresponds to ABI:
//   _savegpr0_14:  std  r14,-144(r1)
//   _savegpr0_15:  std  r15,-136(r1)
//   _savegpr0_16:  std  r16,-128(r1)
void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
  std(R14, offset, dst);   offset += 8;
  std(R15, offset, dst);   offset += 8;
  std(R16, offset, dst);   offset += 8;
  std(R17, offset, dst);   offset += 8;
  std(R18, offset, dst);   offset += 8;
  std(R19, offset, dst);   offset += 8;
  std(R20, offset, dst);   offset += 8;
  std(R21, offset, dst);   offset += 8;
  std(R22, offset, dst);   offset += 8;
  std(R23, offset, dst);   offset += 8;
  std(R24, offset, dst);   offset += 8;
  std(R25, offset, dst);   offset += 8;
  std(R26, offset, dst);   offset += 8;
  std(R27, offset, dst);   offset += 8;
  std(R28, offset, dst);   offset += 8;
  std(R29, offset, dst);   offset += 8;
  std(R30, offset, dst);   offset += 8;
  std(R31, offset, dst);   offset += 8;

  stfd(F14, offset, dst);   offset += 8;
  stfd(F15, offset, dst);   offset += 8;
  stfd(F16, offset, dst);   offset += 8;
  stfd(F17, offset, dst);   offset += 8;
  stfd(F18, offset, dst);   offset += 8;
  stfd(F19, offset, dst);   offset += 8;
  stfd(F20, offset, dst);   offset += 8;
  stfd(F21, offset, dst);   offset += 8;
  stfd(F22, offset, dst);   offset += 8;
  stfd(F23, offset, dst);   offset += 8;
  stfd(F24, offset, dst);   offset += 8;
  stfd(F25, offset, dst);   offset += 8;
  stfd(F26, offset, dst);   offset += 8;
  stfd(F27, offset, dst);   offset += 8;
  stfd(F28, offset, dst);   offset += 8;
  stfd(F29, offset, dst);   offset += 8;
  stfd(F30, offset, dst);   offset += 8;
  stfd(F31, offset, dst);
}

// Uses ordering which corresponds to ABI:
//   _restgpr0_14:  ld   r14,-144(r1)
//   _restgpr0_15:  ld   r15,-136(r1)
//   _restgpr0_16:  ld   r16,-128(r1)
void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
  ld(R14, offset, src);   offset += 8;
  ld(R15, offset, src);   offset += 8;
  ld(R16, offset, src);   offset += 8;
  ld(R17, offset, src);   offset += 8;
  ld(R18, offset, src);   offset += 8;
  ld(R19, offset, src);   offset += 8;
  ld(R20, offset, src);   offset += 8;
  ld(R21, offset, src);   offset += 8;
  ld(R22, offset, src);   offset += 8;
  ld(R23, offset, src);   offset += 8;
  ld(R24, offset, src);   offset += 8;
  ld(R25, offset, src);   offset += 8;
  ld(R26, offset, src);   offset += 8;
  ld(R27, offset, src);   offset += 8;
  ld(R28, offset, src);   offset += 8;
  ld(R29, offset, src);   offset += 8;
  ld(R30, offset, src);   offset += 8;
  ld(R31, offset, src);   offset += 8;

  // FP registers
  lfd(F14, offset, src);   offset += 8;
  lfd(F15, offset, src);   offset += 8;
  lfd(F16, offset, src);   offset += 8;
  lfd(F17, offset, src);   offset += 8;
  lfd(F18, offset, src);   offset += 8;
  lfd(F19, offset, src);   offset += 8;
  lfd(F20, offset, src);   offset += 8;
  lfd(F21, offset, src);   offset += 8;
  lfd(F22, offset, src);   offset += 8;
  lfd(F23, offset, src);   offset += 8;
  lfd(F24, offset, src);   offset += 8;
  lfd(F25, offset, src);   offset += 8;
  lfd(F26, offset, src);   offset += 8;
  lfd(F27, offset, src);   offset += 8;
  lfd(F28, offset, src);   offset += 8;
  lfd(F29, offset, src);   offset += 8;
  lfd(F30, offset, src);   offset += 8;
  lfd(F31, offset, src);
}

// For verify_oops.
void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
  std(R2,  offset, dst);   offset += 8;
  std(R3,  offset, dst);   offset += 8;
  std(R4,  offset, dst);   offset += 8;
  std(R5,  offset, dst);   offset += 8;
  std(R6,  offset, dst);   offset += 8;
  std(R7,  offset, dst);   offset += 8;
  std(R8,  offset, dst);   offset += 8;
  std(R9,  offset, dst);   offset += 8;
  std(R10, offset, dst);   offset += 8;
  std(R11, offset, dst);   offset += 8;
  std(R12, offset, dst);   offset += 8;

  stfd(F0,  offset, dst);   offset += 8;
  stfd(F1,  offset, dst);   offset += 8;
  stfd(F2,  offset, dst);   offset += 8;
  stfd(F3,  offset, dst);   offset += 8;
  stfd(F4,  offset, dst);   offset += 8;
  stfd(F5,  offset, dst);   offset += 8;
  stfd(F6,  offset, dst);   offset += 8;
  stfd(F7,  offset, dst);   offset += 8;
  stfd(F8,  offset, dst);   offset += 8;
  stfd(F9,  offset, dst);   offset += 8;
  stfd(F10, offset, dst);   offset += 8;
  stfd(F11, offset, dst);   offset += 8;
  stfd(F12, offset, dst);   offset += 8;
  stfd(F13, offset, dst);
}

// For verify_oops.
void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
  ld(R2,  offset, src);   offset += 8;
  ld(R3,  offset, src);   offset += 8;
  ld(R4,  offset, src);   offset += 8;
  ld(R5,  offset, src);   offset += 8;
  ld(R6,  offset, src);   offset += 8;
  ld(R7,  offset, src);   offset += 8;
  ld(R8,  offset, src);   offset += 8;
  ld(R9,  offset, src);   offset += 8;
  ld(R10, offset, src);   offset += 8;
  ld(R11, offset, src);   offset += 8;
  ld(R12, offset, src);   offset += 8;

  lfd(F0,  offset, src);   offset += 8;
  lfd(F1,  offset, src);   offset += 8;
  lfd(F2,  offset, src);   offset += 8;
  lfd(F3,  offset, src);   offset += 8;
  lfd(F4,  offset, src);   offset += 8;
  lfd(F5,  offset, src);   offset += 8;
  lfd(F6,  offset, src);   offset += 8;
  lfd(F7,  offset, src);   offset += 8;
  lfd(F8,  offset, src);   offset += 8;
  lfd(F9,  offset, src);   offset += 8;
  lfd(F10, offset, src);   offset += 8;
  lfd(F11, offset, src);   offset += 8;
  lfd(F12, offset, src);   offset += 8;
  lfd(F13, offset, src);
}

void MacroAssembler::save_LR_CR(Register tmp) {
  mfcr(tmp);
  std(tmp, _abi(cr), R1_SP);
  mflr(tmp);
  std(tmp, _abi(lr), R1_SP);
  // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
}
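
// Typical pairing of save_LR_CR/restore_LR_CR around a frame push
// (illustrative sketch only, not a prescribed contract; see e.g.
// reserved_stack_check below for a real user of push_frame_reg_args/pop_frame):
//   save_LR_CR(R0);               // LR/CR into the caller's ABI save slots
//   push_frame_reg_args(0, R11);  // new frame with an ABI argument area
//   ...                           // body which may clobber LR and CR
//   pop_frame();
//   restore_LR_CR(R0);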
void MacroAssembler::restore_LR_CR(Register tmp) {
  assert(tmp != R1_SP, "must be distinct");
  ld(tmp, _abi(lr), R1_SP);
  mtlr(tmp);
  ld(tmp, _abi(cr), R1_SP);
  mtcr(tmp);
}

// Get the current pc via bl/mflr; trashes LR (hence the name). Returns the
// pc of the mflr, which is also where LR points afterwards.
address MacroAssembler::get_PC_trash_LR(Register result) {
  Label L;
  bl(L);
  bind(L);
  address lr_pc = pc();
  mflr(result);
  return lr_pc;
}

void MacroAssembler::resize_frame(Register offset, Register tmp) {
#ifdef ASSERT
  assert_different_registers(offset, tmp, R1_SP);
  andi_(tmp, offset, frame::alignment_in_bytes-1);
  asm_assert_eq("resize_frame: unaligned", 0x204);
#endif

  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdux(tmp, R1_SP, offset);
}

void MacroAssembler::resize_frame(int offset, Register tmp) {
  assert(is_simm(offset, 16), "too big an offset");
  assert_different_registers(tmp, R1_SP);
  assert((offset & (frame::alignment_in_bytes-1)) == 0, "resize_frame: unaligned");
  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdu(tmp, offset, R1_SP);
}

void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
  // (addr == tmp1) || (addr == tmp2) is allowed here!
  assert(tmp1 != tmp2, "must be distinct");

  // compute offset w.r.t. current stack pointer
  // tmp_1 <- addr - SP (!)
  subf(tmp1, R1_SP, addr);

  // atomically update SP keeping back link.
  resize_frame(tmp1/* offset */, tmp2/* tmp */);
}

void MacroAssembler::push_frame(Register bytes, Register tmp) {
#ifdef ASSERT
  assert(bytes != R0, "r0 not allowed here");
  andi_(R0, bytes, frame::alignment_in_bytes-1);
  asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
#endif
  neg(tmp, bytes);
  stdux(R1_SP, R1_SP, tmp);
}

// Push a frame of size `bytes'.
void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
  long offset = align_addr(bytes, frame::alignment_in_bytes);
  if (is_simm(-offset, 16)) {
    stdu(R1_SP, -offset, R1_SP);
  } else {
    load_const_optimized(tmp, -offset);
    stdux(R1_SP, R1_SP, tmp);
  }
}

// Push a frame of size `bytes' plus abi_reg_args on top.
void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size, tmp);
}

// Set up a new C frame with a spill area for non-volatile GPRs and
// additional space for local variables.
void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
                                                      Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
}

// Pop current C frame.
void MacroAssembler::pop_frame() {
  ld(R1_SP, _abi(callers_sp), R1_SP);
}

#if defined(ABI_ELFv2)
address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the time.
  if (R12 != r_function_entry) {
    mr(R12, r_function_entry);
  }
  mtctr(R12);
  // Do a call or a branch.
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C
// calling conventions. Updates and returns _last_calls_return_pc.
address MacroAssembler::call_c(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/true);
}

// For tail calls: only branch, don't link, so callee returns to caller of this function.
address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/false);
}

address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
  load_const(R12, function_entry, R0);
  return branch_to(R12, /*and_link=*/true);
}

#else
// Generic version of a call to C function via a function descriptor
// with variable support for C calling conventions (TOC, ENV, etc.).
// Updates and returns _last_calls_return_pc.
address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
                                  bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
  // we emit standard ptrgl glue code here
  assert((function_descriptor != R0), "function_descriptor cannot be R0");

  // retrieve necessary entries from the function descriptor
  ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
  mtctr(R0);

  if (load_toc_of_callee) {
    ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
  }
  if (load_env_of_callee) {
    ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
  } else if (load_toc_of_callee) {
    li(R11, 0);
  }

  // do a call or a branch
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C calling
// conventions.
// We don't use the TOC in generated code, so there is no need to save
// and restore its value.
address MacroAssembler::call_c(Register fd) {
  return branch_to(fd, /*and_link=*/true,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c_and_return_to_caller(Register fd) {
  return branch_to(fd, /*and_link=*/false,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
  if (rt != relocInfo::none) {
    // this call needs to be relocatable
    if (!ReoptimizeCallSequences
        || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
        || fd == NULL   // support code-size estimation
        || !fd->is_friend_function()
        || fd->entry() == NULL) {
      // it's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);

      bool has_env = (fd != NULL && fd->env() != NULL);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/has_env);
    } else {
      // It's a friend function. Load the entry point and don't care about
      // toc and env. Use an optimizable call instruction, but ensure the
      // same code-size as in the case of a non-friend function.
      nop();
      nop();
      nop();
      bl64_patchable(fd->entry(), rt);
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  } else {
    // This call does not need to be relocatable, do more aggressive
    // optimizations.
    if (!ReoptimizeCallSequences
        || !fd->is_friend_function()) {
      // It's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/true);
    } else {
      // it's a friend function, load the entry point and don't care about
      // toc and env.
      address dest = fd->entry();
      if (is_within_range_of_b(dest, pc())) {
        bl(dest);
      } else {
        bl64_patchable(dest, rt);
      }
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  }
}

// Call a C function. All constants needed reside in TOC.
//
// Read the address to call from the TOC.
// Read env from TOC, if fd specifies an env.
// Read new TOC from TOC.
address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
                                         relocInfo::relocType rt, Register toc) {
  if (!ReoptimizeCallSequences
      || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
      || !fd->is_friend_function()) {
    // It's not a friend function as defined by class FunctionDescriptor,
    // so do a full call-c here.
    assert(fd->entry() != NULL, "function must be linked");

    AddressLiteral fd_entry(fd->entry());
    bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
    mtctr(R11);
    if (fd->env() == NULL) {
      li(R11, 0);
      nop();
    } else {
      AddressLiteral fd_env(fd->env());
      success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
    }
    AddressLiteral fd_toc(fd->toc());
    // Set R2_TOC (load from toc)
    success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
    bctrl();
    _last_calls_return_pc = pc();
    if (!success) { return NULL; }
  } else {
    // It's a friend function, load the entry point and don't care about
    // toc and env. Use an optimizable call instruction, but ensure the
    // same code-size as in the case of a non-friend function.
    nop();
    bl64_patchable(fd->entry(), rt);
    _last_calls_return_pc = pc();
  }
  return _last_calls_return_pc;
}
#endif // ABI_ELFv2

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register last_java_sp,
                                  address entry_point,
                                  bool check_exceptions) {
  BLOCK_COMMENT("call_VM {");
  // Determine last_java_sp register.
  if (!last_java_sp->is_valid()) {
    last_java_sp = R1_SP;
  }
  set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);

  // ARG1 must hold thread address.
  mr(R3_ARG1, R16_thread);
#if defined(ABI_ELFv2)
  address return_pc = call_c(entry_point, relocInfo::none);
#else
  address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
#endif

  reset_last_Java_frame();

  // Check for pending exceptions.
  if (check_exceptions) {
    // We don't check for exceptions here.
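    // check_exceptions == true is not supported in this PPC64 implementation;
    // the guard below catches any caller that requests it.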
    ShouldNotReachHere();
  }

  // Get oop result if there is one and reset the value in the thread.
  if (oop_result->is_valid()) {
    get_vm_result(oop_result);
  }

  _last_calls_return_pc = return_pc;
  BLOCK_COMMENT("} call_VM");
}

void MacroAssembler::call_VM_leaf_base(address entry_point) {
  BLOCK_COMMENT("call_VM_leaf {");
#if defined(ABI_ELFv2)
  call_c(entry_point, relocInfo::none);
#else
  call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
#endif
  BLOCK_COMMENT("} call_VM_leaf");
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
  call_VM_base(oop_result, noreg, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread.
  mr_if_needed(R4_ARG2, arg_1);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  mr_if_needed(R6_ARG4, arg_3);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM_leaf(address entry_point) {
  call_VM_leaf_base(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
  mr_if_needed(R3_ARG1, arg_1);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_3);
  call_VM_leaf(entry_point);
}

// Check whether instruction is a read access to the polling page
// which was emitted by load_from_polling_page(..).
bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
                                               address* polling_address_ptr) {
  if (!is_ld(instruction))
    return false; // It's not a ld. Fail.

  int rt = inv_rt_field(instruction);
  int ra = inv_ra_field(instruction);
  int ds = inv_ds_field(instruction);
  if (!(ds == 0 && ra != 0 && rt == 0)) {
    return false; // It's not a ld(r0, X, ra). Fail.
  }

  if (!ucontext) {
    // Set polling address.
    if (polling_address_ptr != NULL) {
      *polling_address_ptr = NULL;
    }
    return true; // No ucontext given. Can't check value of ra. Assume true.
  }

#ifdef LINUX
  // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
  ucontext_t* uc = (ucontext_t*) ucontext;
  // Set polling address.
  address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
  if (polling_address_ptr != NULL) {
    *polling_address_ptr = addr;
  }
  return os::is_poll_address(addr);
#else
  // Not on Linux, ucontext must be NULL.
  ShouldNotReachHere();
  return false;
#endif
}

bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;

  if (is_stwx(instruction) || is_stwux(instruction)) {
    int ra = inv_ra_field(instruction);
    int rb = inv_rb_field(instruction);

    // look up content of ra and rb in ucontext
    address ra_val = (address)uc->uc_mcontext.regs->gpr[ra];
    long    rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    return os::is_memory_serialize_page(thread, ra_val + rb_val);
  } else if (is_stw(instruction) || is_stwu(instruction)) {
    int ra = inv_ra_field(instruction);
    int d1 = inv_d1_field(instruction);

    // look up content of ra in ucontext
    address ra_val = (address)uc->uc_mcontext.regs->gpr[ra];
    return os::is_memory_serialize_page(thread, ra_val + d1);
  } else {
    return false;
  }
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return false;
#endif
}

void MacroAssembler::bang_stack_with_offset(int offset) {
  // When increasing the stack, the old stack pointer will be written
  // to the new top of stack according to the PPC64 abi.
  // Therefore, stack banging is not necessary when increasing
  // the stack by <= os::vm_page_size() bytes.
  // When increasing the stack by a larger amount, this method is
  // called repeatedly to bang the intermediate pages.

  // Stack grows down, caller passes positive offset.
  assert(offset > 0, "must bang with positive offset");

  long stdoffset = -offset;

  if (is_simm(stdoffset, 16)) {
    // Signed 16-bit offset, a simple std is ok.
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, (int)(signed short)stdoffset, R1_SP);
    } else {
      std(R0, (int)(signed short)stdoffset, R1_SP);
    }
  } else if (is_simm(stdoffset, 31)) {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);

    Register tmp = R11;
    addis(tmp, R1_SP, hi);
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, lo, tmp);
    } else {
      std(R0, lo, tmp);
    }
  } else {
    ShouldNotReachHere();
  }
}

// If instruction is a stack bang of the form
//    std    R0,    x(Ry)        (see bang_stack_with_offset())
//    stdu   R1_SP, x(R1_SP)     (see push_frame(), resize_frame())
// or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
// return the banged address. Otherwise, return 0.
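// Illustrative use from a fault handler (schematic only, not the actual
// handler code; 'pc' and 'uc' stand for the faulting pc and its ucontext):
//   int inst = *(int*)pc;
//   address bang = MacroAssembler::get_stack_bang_address(inst, uc);
//   if (bang != NULL) { /* fault was a stack bang which hit 'bang' */ }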
address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;
  int rs = inv_rs_field(instruction);
  int ra = inv_ra_field(instruction);
  if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
      || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
      || (is_stdu(instruction) && rs == 1)) {
    int ds = inv_ds_field(instruction);
    // return banged address
    return ds + (address)uc->uc_mcontext.regs->gpr[ra];
  } else if (is_stdux(instruction) && rs == 1) {
    int rb = inv_rb_field(instruction);
    address sp = (address)uc->uc_mcontext.regs->gpr[1];
    long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
                                  : sp + rb_val; // banged address
  }
  return NULL; // not a stack bang
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return NULL;
#endif
}

void MacroAssembler::reserved_stack_check(Register return_pc) {
  // Test if reserved zone needs to be enabled.
  Label no_reserved_zone_enabling;

  ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
  cmpld(CCR0, R1_SP, R0);
  blt_predict_taken(CCR0, no_reserved_zone_enabling);

  // Enable reserved zone again, throw stack overflow exception.
  push_frame_reg_args(0, R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
  pop_frame();
  mtlr(return_pc);
  load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
  mtctr(R0);
  bctr();

  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
                                bool cmpxchgx_hint) {
  Label retry;
  bind(retry);
  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  stdcx_(exchange_value, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
}

void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
                                Register tmp, bool cmpxchgx_hint) {
  Label retry;
  bind(retry);
  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  add(tmp, dest_current_value, inc_value);
  stdcx_(tmp, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
}

// Word/sub-word atomic helper functions

// Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
// Only signed types are supported with size < 4.
// Atomic add always kills tmp1.
void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
                                                   Register addr_base, Register tmp1, Register tmp2, Register tmp3,
                                                   bool cmpxchgx_hint, bool is_add, int size) {
  // Sub-word instructions are available since Power 8.
  // For older processors, instruction_type != size holds, and we
  // emulate the sub-word instructions by constructing a 4-byte value
  // that leaves the other bytes unchanged.
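  // Worked example (little-endian, size == 1, (addr_base & 3) == 2):
  //   shift_amount = (addr_base & 3) * 8 = 16;
  //   lwarx loads the aligned 4-byte word, srw by 16 moves the target byte
  //   into the low bits of dest_current_value, the replacement byte is
  //   merged back into the word via the two xors below, and stwcx_ retries
  //   the update until no other CPU has intervened.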
  const int instruction_type = VM_Version::has_lqarx() ? size : 4;

  Label retry;
  Register shift_amount = noreg,
           val32 = dest_current_value,
           modval = is_add ? tmp1 : exchange_value;

  if (instruction_type != size) {
    assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
    modval = tmp1;
    shift_amount = tmp2;
    val32 = tmp3;
    // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
#ifdef VM_LITTLE_ENDIAN
    rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
    clrrdi(addr_base, addr_base, 2);
#else
    xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
    clrrdi(addr_base, addr_base, 2);
    rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
#endif
  }

  // atomic emulation loop
  bind(retry);

  switch (instruction_type) {
    case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
    case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
    case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
    default: ShouldNotReachHere();
  }

  if (instruction_type != size) {
    srw(dest_current_value, val32, shift_amount);
  }

  if (is_add) { add(modval, dest_current_value, exchange_value); }

  if (instruction_type != size) {
    // Transform exchange value such that the replacement can be done by one xor instruction.
    xorr(modval, dest_current_value, is_add ? modval : exchange_value);
    clrldi(modval, modval, (size == 1) ? 56 : 48);
    slw(modval, modval, shift_amount);
    xorr(modval, val32, modval);
  }

  switch (instruction_type) {
    case 4: stwcx_(modval, addr_base); break;
    case 2: sthcx_(modval, addr_base); break;
    case 1: stbcx_(modval, addr_base); break;
    default: ShouldNotReachHere();
  }

  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }

  // l?arx zero-extends, but Java wants byte/short values sign-extended.
  if (size == 1) {
    extsb(dest_current_value, dest_current_value);
  } else if (size == 2) {
    extsh(dest_current_value, dest_current_value);
  }
}

// Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
// Only signed types are supported with size < 4.
void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
                                       Register compare_value, Register exchange_value,
                                       Register addr_base, Register tmp1, Register tmp2,
                                       Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
  // Sub-word instructions are available since Power 8.
  // For older processors, instruction_type != size holds, and we
  // emulate the sub-word instructions by constructing a 4-byte value
  // that leaves the other bytes unchanged.
  const int instruction_type = VM_Version::has_lqarx() ? size : 4;

  Register shift_amount = noreg,
           val32 = dest_current_value,
           modval = exchange_value;

  if (instruction_type != size) {
    assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
    shift_amount = tmp1;
    val32 = tmp2;
    modval = tmp2;
    // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
#ifdef VM_LITTLE_ENDIAN
    rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
    clrrdi(addr_base, addr_base, 2);
#else
    xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
    clrrdi(addr_base, addr_base, 2);
    rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
#endif
    // Transform exchange value such that the replacement can be done by one xor instruction.
    xorr(exchange_value, compare_value, exchange_value);
    clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
    slw(exchange_value, exchange_value, shift_amount);
  }

  // atomic emulation loop
  bind(retry);

  switch (instruction_type) {
    case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
    case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
    case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
    default: ShouldNotReachHere();
  }

  if (instruction_type != size) {
    srw(dest_current_value, val32, shift_amount);
  }
  if (size == 1) {
    extsb(dest_current_value, dest_current_value);
  } else if (size == 2) {
    extsh(dest_current_value, dest_current_value);
  }

  cmpw(flag, dest_current_value, compare_value);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(flag, failed);
  } else {
    bne(                  flag, failed);
  }
  // branch to done  => (flag == ne), (dest_current_value != compare_value)
  // fall through    => (flag == eq), (dest_current_value == compare_value)

  if (instruction_type != size) {
    xorr(modval, val32, exchange_value);
  }

  switch (instruction_type) {
    case 4: stwcx_(modval, addr_base); break;
    case 2: sthcx_(modval, addr_base); break;
    case 1: stbcx_(modval, addr_base); break;
    default: ShouldNotReachHere();
  }
}

// CmpxchgX sets condition register to cmpX(current, compare).
void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
                                     Register compare_value, Register exchange_value,
                                     Register addr_base, Register tmp1, Register tmp2,
                                     int semantics, bool cmpxchgx_hint,
                                     Register int_flag_success, bool contention_hint, bool weak, int size) {
  Label retry;
  Label failed;
  Label done;

  // Save one branch if result is returned via register and
  // result register is different from the other ones.
  bool use_result_reg    = (int_flag_success != noreg);
  bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
                            int_flag_success != exchange_value && int_flag_success != addr_base &&
                            int_flag_success != tmp1 && int_flag_success != tmp2);
  assert(!weak || flag == CCR0, "weak only supported with CCR0");
  assert(size == 1 || size == 2 || size == 4, "unsupported");

  if (use_result_reg && preset_result_reg) {
    li(int_flag_success, 0); // preset (assume cas failed)
  }

  // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
  if (contention_hint) { // Don't try to reserve if cmp fails.
1637 switch (size) { 1638 case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break; 1639 case 2: lha(dest_current_value, 0, addr_base); break; 1640 case 4: lwz(dest_current_value, 0, addr_base); break; 1641 default: ShouldNotReachHere(); 1642 } 1643 cmpw(flag, dest_current_value, compare_value); 1644 bne(flag, failed); 1645 } 1646 1647 // release/fence semantics 1648 if (semantics & MemBarRel) { 1649 release(); 1650 } 1651 1652 cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2, 1653 retry, failed, cmpxchgx_hint, size); 1654 if (!weak || use_result_reg) { 1655 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1656 bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 1657 } else { 1658 bne( CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 1659 } 1660 } 1661 // fall through => (flag == eq), (dest_current_value == compare_value), (swapped) 1662 1663 // Result in register (must do this at the end because int_flag_success can be the 1664 // same register as one above). 1665 if (use_result_reg) { 1666 li(int_flag_success, 1); 1667 } 1668 1669 if (semantics & MemBarFenceAfter) { 1670 fence(); 1671 } else if (semantics & MemBarAcq) { 1672 isync(); 1673 } 1674 1675 if (use_result_reg && !preset_result_reg) { 1676 b(done); 1677 } 1678 1679 bind(failed); 1680 if (use_result_reg && !preset_result_reg) { 1681 li(int_flag_success, 0); 1682 } 1683 1684 bind(done); 1685 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1686 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1687 } 1688 1689 // Performs atomic compare exchange: 1690 // if (compare_value == *addr_base) 1691 // *addr_base = exchange_value 1692 // int_flag_success = 1; 1693 // else 1694 // int_flag_success = 0; 1695 // 1696 // ConditionRegister flag = cmp(compare_value, *addr_base) 1697 // Register dest_current_value = *addr_base 1698 // Register compare_value Used to compare with value in memory 1699 // Register exchange_value Written to memory if compare_value == *addr_base 1700 // Register addr_base The memory location to compareXChange 1701 // Register int_flag_success Set to 1 if exchange_value was written to *addr_base 1702 // 1703 // To avoid the costly compare exchange, the value is tested beforehand. 1704 // Several special cases exist to avoid generating unnecessary code. 1705 // 1706 void MacroAssembler::cmpxchgd(ConditionRegister flag, 1707 Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value, 1708 Register addr_base, int semantics, bool cmpxchgx_hint, 1709 Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) { 1710 Label retry; 1711 Label failed_int; 1712 Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int; 1713 Label done; 1714 1715 // Save one branch if result is returned via register and result register is different from the other ones.
1716 bool use_result_reg = (int_flag_success!=noreg); 1717 bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() && 1718 int_flag_success!=exchange_value && int_flag_success!=addr_base); 1719 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1720 assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both"); 1721 1722 if (use_result_reg && preset_result_reg) { 1723 li(int_flag_success, 0); // preset (assume cas failed) 1724 } 1725 1726 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1727 if (contention_hint) { // Don't try to reserve if cmp fails. 1728 ld(dest_current_value, 0, addr_base); 1729 cmpd(flag, compare_value, dest_current_value); 1730 bne(flag, failed); 1731 } 1732 1733 // release/fence semantics 1734 if (semantics & MemBarRel) { 1735 release(); 1736 } 1737 1738 // atomic emulation loop 1739 bind(retry); 1740 1741 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1742 cmpd(flag, compare_value, dest_current_value); 1743 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1744 bne_predict_not_taken(flag, failed); 1745 } else { 1746 bne( flag, failed); 1747 } 1748 1749 stdcx_(exchange_value, addr_base); 1750 if (!weak || use_result_reg || failed_ext) { 1751 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1752 bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1753 } else { 1754 bne( CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1755 } 1756 } 1757 1758 // result in register (must do this at the end because int_flag_success can be the same register as one above) 1759 if (use_result_reg) { 1760 li(int_flag_success, 1); 1761 } 1762 1763 if (semantics & MemBarFenceAfter) { 1764 fence(); 1765 } else if (semantics & MemBarAcq) { 1766 isync(); 1767 } 1768 1769 if (use_result_reg && !preset_result_reg) { 1770 b(done); 1771 } 1772 1773 bind(failed_int); 1774 if (use_result_reg && !preset_result_reg) { 1775 li(int_flag_success, 0); 1776 } 1777 1778 bind(done); 1779 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1780 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1781 } 1782 1783 // Look up the method for a megamorphic invokeinterface call. 1784 // The target method is determined by <intf_klass, itable_index>. 1785 // The receiver klass is in recv_klass. 1786 // On success, the result will be in method_result, and execution falls through. 1787 // On failure, execution transfers to the given label. 1788 void MacroAssembler::lookup_interface_method(Register recv_klass, 1789 Register intf_klass, 1790 RegisterOrConstant itable_index, 1791 Register method_result, 1792 Register scan_temp, 1793 Register temp2, 1794 Label& L_no_such_interface, 1795 bool return_method) { 1796 assert_different_registers(recv_klass, intf_klass, method_result, scan_temp); 1797 1798 // Compute start of first itableOffsetEntry (which is at the end of the vtable). 1799 int vtable_base = in_bytes(Klass::vtable_start_offset()); 1800 int itentry_off = itableMethodEntry::method_offset_in_bytes(); 1801 int logMEsize = exact_log2(itableMethodEntry::size() * wordSize); 1802 int scan_step = itableOffsetEntry::size() * wordSize; 1803 int log_vte_size= exact_log2(vtableEntry::size_in_bytes()); 1804 1805 lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass); 1806 // %%% We should store the aligned, prescaled offset in the klassoop. 1807 // Then the next several instructions would fold away. 
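// (A hedged C-like sketch of what the next three instructions compute:
//    scan_temp = recv_klass + vtable_start_offset + vtable_length * sizeof(vtableEntry);
//  i.e. the address of the first itableOffsetEntry, which sits directly after the vtable.)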
1808 1809 sldi(scan_temp, scan_temp, log_vte_size); 1810 addi(scan_temp, scan_temp, vtable_base); 1811 add(scan_temp, recv_klass, scan_temp); 1812 1813 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 1814 if (return_method) { 1815 if (itable_index.is_register()) { 1816 Register itable_offset = itable_index.as_register(); 1817 sldi(method_result, itable_offset, logMEsize); 1818 if (itentry_off) { addi(method_result, method_result, itentry_off); } 1819 add(method_result, method_result, recv_klass); 1820 } else { 1821 long itable_offset = (long)itable_index.as_constant(); 1822 // static address, no relocation 1823 add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2); 1824 } 1825 } 1826 1827 // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) { 1828 // if (scan->interface() == intf) { 1829 // result = (klass + scan->offset() + itable_index); 1830 // } 1831 // } 1832 Label search, found_method; 1833 1834 for (int peel = 1; peel >= 0; peel--) { 1835 // %%%% Could load both offset and interface in one ldx, if they were 1836 // in the opposite order. This would save a load. 1837 ld(temp2, itableOffsetEntry::interface_offset_in_bytes(), scan_temp); 1838 1839 // Check that this entry is non-null. A null entry means that 1840 // the receiver class doesn't implement the interface, and wasn't the 1841 // same as when the caller was compiled. 1842 cmpd(CCR0, temp2, intf_klass); 1843 1844 if (peel) { 1845 beq(CCR0, found_method); 1846 } else { 1847 bne(CCR0, search); 1848 // (invert the test to fall through to found_method...) 1849 } 1850 1851 if (!peel) break; 1852 1853 bind(search); 1854 1855 cmpdi(CCR0, temp2, 0); 1856 beq(CCR0, L_no_such_interface); 1857 addi(scan_temp, scan_temp, scan_step); 1858 } 1859 1860 bind(found_method); 1861 1862 // Got a hit. 
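// (Sketch of the final load below, under the assumed itable layout; method_result already
//  holds recv_klass + itable_index * sizeof(itableMethodEntry) + method_offset, so roughly:
//    int ito = scan->offset();                       // start of this interface's methods
//    method_result = *(Method**)(ito + method_result);
//  one lwz plus one ldx suffice.)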
1863 if (return_method) { 1864 int ito_offset = itableOffsetEntry::offset_offset_in_bytes(); 1865 lwz(scan_temp, ito_offset, scan_temp); 1866 ldx(method_result, scan_temp, method_result); 1867 } 1868 } 1869 1870 // virtual method calling 1871 void MacroAssembler::lookup_virtual_method(Register recv_klass, 1872 RegisterOrConstant vtable_index, 1873 Register method_result) { 1874 1875 assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg()); 1876 1877 const int base = in_bytes(Klass::vtable_start_offset()); 1878 assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 1879 1880 if (vtable_index.is_register()) { 1881 sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord); 1882 add(recv_klass, vtable_index.as_register(), recv_klass); 1883 } else { 1884 addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord); 1885 } 1886 ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass); 1887 } 1888 1889 /////////////////////////////////////////// subtype checking //////////////////////////////////////////// 1890 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 1891 Register super_klass, 1892 Register temp1_reg, 1893 Register temp2_reg, 1894 Label* L_success, 1895 Label* L_failure, 1896 Label* L_slow_path, 1897 RegisterOrConstant super_check_offset) { 1898 1899 const Register check_cache_offset = temp1_reg; 1900 const Register cached_super = temp2_reg; 1901 1902 assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super); 1903 1904 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1905 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1906 1907 bool must_load_sco = (super_check_offset.constant_or_zero() == -1); 1908 bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset); 1909 1910 Label L_fallthrough; 1911 int label_nulls = 0; 1912 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 1913 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 1914 if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; } 1915 assert(label_nulls <= 1 || 1916 (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path), 1917 "at most one NULL in the batch, usually"); 1918 1919 // If the pointers are equal, we are done (e.g., String[] elements). 1920 // This self-check enables sharing of secondary supertype arrays among 1921 // non-primary types such as array-of-interface. Otherwise, each such 1922 // type would need its own customized SSA. 1923 // We move this check to the front of the fast path because many 1924 // type checks are in fact trivially successful in this manner, 1925 // so we get a nicely predicted branch right at the start of the check. 1926 cmpd(CCR0, sub_klass, super_klass); 1927 beq(CCR0, *L_success); 1928 1929 // Check the supertype display: 1930 if (must_load_sco) { 1931 // The super check offset is always positive... 1932 lwz(check_cache_offset, sco_offset, super_klass); 1933 super_check_offset = RegisterOrConstant(check_cache_offset); 1934 // super_check_offset is register. 1935 assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register()); 1936 } 1937 // The loaded value is the offset from KlassOopDesc. 1938 1939 ld(cached_super, super_check_offset, sub_klass); 1940 cmpd(CCR0, cached_super, super_klass); 1941 1942 // This check has worked decisively for primary supers. 
1943 // Secondary supers are sought in the super_cache ('super_cache_addr'). 1944 // (Secondary supers are interfaces and very deeply nested subtypes.) 1945 // This works in the same check above because of a tricky aliasing 1946 // between the super_cache and the primary super display elements. 1947 // (The 'super_check_addr' can address either, as the case requires.) 1948 // Note that the cache is updated below if it does not help us find 1949 // what we need immediately. 1950 // So if it was a primary super, we can just fail immediately. 1951 // Otherwise, it's the slow path for us (no success at this point). 1952 1953 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); } 1954 1955 if (super_check_offset.is_register()) { 1956 beq(CCR0, *L_success); 1957 cmpwi(CCR0, super_check_offset.as_register(), sc_offset); 1958 if (L_failure == &L_fallthrough) { 1959 beq(CCR0, *L_slow_path); 1960 } else { 1961 bne(CCR0, *L_failure); 1962 FINAL_JUMP(*L_slow_path); 1963 } 1964 } else { 1965 if (super_check_offset.as_constant() == sc_offset) { 1966 // Need a slow path; fast failure is impossible. 1967 if (L_slow_path == &L_fallthrough) { 1968 beq(CCR0, *L_success); 1969 } else { 1970 bne(CCR0, *L_slow_path); 1971 FINAL_JUMP(*L_success); 1972 } 1973 } else { 1974 // No slow path; it's a fast decision. 1975 if (L_failure == &L_fallthrough) { 1976 beq(CCR0, *L_success); 1977 } else { 1978 bne(CCR0, *L_failure); 1979 FINAL_JUMP(*L_success); 1980 } 1981 } 1982 } 1983 1984 bind(L_fallthrough); 1985 #undef FINAL_JUMP 1986 } 1987 1988 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 1989 Register super_klass, 1990 Register temp1_reg, 1991 Register temp2_reg, 1992 Label* L_success, 1993 Register result_reg) { 1994 const Register array_ptr = temp1_reg; // current value from cache array 1995 const Register temp = temp2_reg; 1996 1997 assert_different_registers(sub_klass, super_klass, array_ptr, temp); 1998 1999 int source_offset = in_bytes(Klass::secondary_supers_offset()); 2000 int target_offset = in_bytes(Klass::secondary_super_cache_offset()); 2001 2002 int length_offset = Array<Klass*>::length_offset_in_bytes(); 2003 int base_offset = Array<Klass*>::base_offset_in_bytes(); 2004 2005 Label hit, loop, failure, fallthru; 2006 2007 ld(array_ptr, source_offset, sub_klass); 2008 2009 // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated."); 2010 lwz(temp, length_offset, array_ptr); 2011 cmpwi(CCR0, temp, 0); 2012 beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0 2013 2014 mtctr(temp); // load ctr 2015 2016 bind(loop); 2017 // Oops in table are NO MORE compressed. 
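// (Roughly equivalent C sketch of this scan, assuming the Array<Klass*> layout;
//  the CTR register carries the loop count, so no explicit index register is needed:
//    for (int i = 0; i < secondary_supers->length(); i++) {
//      if (secondary_supers->at(i) == super_klass) goto hit;
//    }
//    goto failure;)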
2018 ld(temp, base_offset, array_ptr); 2019 cmpd(CCR0, temp, super_klass); 2020 beq(CCR0, hit); 2021 addi(array_ptr, array_ptr, BytesPerWord); 2022 bdnz(loop); 2023 2024 bind(failure); 2025 if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss) 2026 b(fallthru); 2027 2028 bind(hit); 2029 std(super_klass, target_offset, sub_klass); // save result to cache 2030 if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit) 2031 if (L_success != NULL) { b(*L_success); } 2032 else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided 2033 2034 bind(fallthru); 2035 } 2036 2037 // Try fast path, then go to slow one if not successful 2038 void MacroAssembler::check_klass_subtype(Register sub_klass, 2039 Register super_klass, 2040 Register temp1_reg, 2041 Register temp2_reg, 2042 Label& L_success) { 2043 Label L_failure; 2044 check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure); 2045 check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success); 2046 bind(L_failure); // Fallthru if not successful. 2047 } 2048 2049 void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg, 2050 Register temp_reg, 2051 Label& wrong_method_type) { 2052 assert_different_registers(mtype_reg, mh_reg, temp_reg); 2053 // Compare method type against that of the receiver. 2054 load_heap_oop_not_null(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg); 2055 cmpd(CCR0, temp_reg, mtype_reg); 2056 bne(CCR0, wrong_method_type); 2057 } 2058 2059 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot, 2060 Register temp_reg, 2061 int extra_slot_offset) { 2062 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 2063 int stackElementSize = Interpreter::stackElementSize; 2064 int offset = extra_slot_offset * stackElementSize; 2065 if (arg_slot.is_constant()) { 2066 offset += arg_slot.as_constant() * stackElementSize; 2067 return offset; 2068 } else { 2069 assert(temp_reg != noreg, "must specify"); 2070 sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize)); 2071 if (offset != 0) 2072 addi(temp_reg, temp_reg, offset); 2073 return temp_reg; 2074 } 2075 } 2076 2077 // Supports temp2_reg = R0. 2078 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg, 2079 Register mark_reg, Register temp_reg, 2080 Register temp2_reg, Label& done, Label* slow_case) { 2081 assert(UseBiasedLocking, "why call this otherwise?"); 2082 2083 #ifdef ASSERT 2084 assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg); 2085 #endif 2086 2087 Label cas_label; 2088 2089 // Branch to done if fast path fails and no slow_case provided. 2090 Label *slow_case_int = (slow_case != NULL) ? 
slow_case : &done; 2091 2092 // Biased locking 2093 // See whether the lock is currently biased toward our thread and 2094 // whether the epoch is still valid 2095 // Note that the runtime guarantees sufficient alignment of JavaThread 2096 // pointers to allow age to be placed into low bits 2097 assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, 2098 "biased locking makes assumptions about bit layout"); 2099 2100 if (PrintBiasedLockingStatistics) { 2101 load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg); 2102 lwzx(temp_reg, temp2_reg); 2103 addi(temp_reg, temp_reg, 1); 2104 stwx(temp_reg, temp2_reg); 2105 } 2106 2107 andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place); 2108 cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern); 2109 bne(cr_reg, cas_label); 2110 2111 load_klass(temp_reg, obj_reg); 2112 2113 load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place)); 2114 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 2115 orr(temp_reg, R16_thread, temp_reg); 2116 xorr(temp_reg, mark_reg, temp_reg); 2117 andr(temp_reg, temp_reg, temp2_reg); 2118 cmpdi(cr_reg, temp_reg, 0); 2119 if (PrintBiasedLockingStatistics) { 2120 Label l; 2121 bne(cr_reg, l); 2122 load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr()); 2123 lwzx(mark_reg, temp2_reg); 2124 addi(mark_reg, mark_reg, 1); 2125 stwx(mark_reg, temp2_reg); 2126 // restore mark_reg 2127 ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg); 2128 bind(l); 2129 } 2130 beq(cr_reg, done); 2131 2132 Label try_revoke_bias; 2133 Label try_rebias; 2134 2135 // At this point we know that the header has the bias pattern and 2136 // that we are not the bias owner in the current epoch. We need to 2137 // figure out more details about the state of the header in order to 2138 // know what operations can be legally performed on the object's 2139 // header. 2140 2141 // If the low three bits in the xor result aren't clear, that means 2142 // the prototype header is no longer biased and we have to revoke 2143 // the bias on this object. 2144 andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 2145 cmpwi(cr_reg, temp2_reg, 0); 2146 bne(cr_reg, try_revoke_bias); 2147 2148 // Biasing is still enabled for this data type. See whether the 2149 // epoch of the current bias is still valid, meaning that the epoch 2150 // bits of the mark word are equal to the epoch bits of the 2151 // prototype header. (Note that the prototype header's epoch bits 2152 // only change at a safepoint.) If not, attempt to rebias the object 2153 // toward the current thread. Note that we must be absolutely sure 2154 // that the current epoch is invalid in order to do this because 2155 // otherwise the manipulations it performs on the mark word are 2156 // illegal. 2157 2158 int shift_amount = 64 - markOopDesc::epoch_shift; 2159 // rotate epoch bits to right (little) end and set other bits to 0 2160 // [ big part | epoch | little part ] -> [ 0..0 | epoch ] 2161 rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits); 2162 // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented 2163 bne(CCR0, try_rebias); 2164 2165 // The epoch of the current bias is still valid but we know nothing 2166 // about the owner; it might be set or it might be clear. Try to 2167 // acquire the bias of the object using an atomic operation. 
If this 2168 // fails we will go into the runtime to revoke the object's bias. 2169 // Note that we first construct the presumed unbiased header so we 2170 // don't accidentally blow away another thread's valid bias. 2171 andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place | 2172 markOopDesc::age_mask_in_place | 2173 markOopDesc::epoch_mask_in_place)); 2174 orr(temp_reg, R16_thread, mark_reg); 2175 2176 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2177 2178 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg). 2179 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 2180 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 2181 /*where=*/obj_reg, 2182 MacroAssembler::MemBarAcq, 2183 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2184 noreg, slow_case_int); // bail out if failed 2185 2186 // If the biasing toward our thread failed, this means that 2187 // another thread succeeded in biasing it toward itself and we 2188 // need to revoke that bias. The revocation will occur in the 2189 // interpreter runtime in the slow case. 2190 if (PrintBiasedLockingStatistics) { 2191 load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg); 2192 lwzx(temp_reg, temp2_reg); 2193 addi(temp_reg, temp_reg, 1); 2194 stwx(temp_reg, temp2_reg); 2195 } 2196 b(done); 2197 2198 bind(try_rebias); 2199 // At this point we know the epoch has expired, meaning that the 2200 // current "bias owner", if any, is actually invalid. Under these 2201 // circumstances _only_, we are allowed to use the current header's 2202 // value as the comparison value when doing the cas to acquire the 2203 // bias in the current epoch. In other words, we allow transfer of 2204 // the bias from one thread to another directly in this situation. 2205 load_klass(temp_reg, obj_reg); 2206 andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place); 2207 orr(temp2_reg, R16_thread, temp2_reg); 2208 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 2209 orr(temp_reg, temp2_reg, temp_reg); 2210 2211 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2212 2213 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 2214 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 2215 /*where=*/obj_reg, 2216 MacroAssembler::MemBarAcq, 2217 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2218 noreg, slow_case_int); // bail out if failed 2219 2220 // If the biasing toward our thread failed, this means that 2221 // another thread succeeded in biasing it toward itself and we 2222 // need to revoke that bias. The revocation will occur in the 2223 // interpreter runtime in the slow case. 2224 if (PrintBiasedLockingStatistics) { 2225 load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg); 2226 lwzx(temp_reg, temp2_reg); 2227 addi(temp_reg, temp_reg, 1); 2228 stwx(temp_reg, temp2_reg); 2229 } 2230 b(done); 2231 2232 bind(try_revoke_bias); 2233 // The prototype mark in the klass doesn't have the bias bit set any 2234 // more, indicating that objects of this data type are not supposed 2235 // to be biased any more. We are going to try to reset the mark of 2236 // this object to the prototype value and fall through to the 2237 // CAS-based locking scheme. Note that if our CAS fails, it means 2238 // that another thread raced us for the privilege of revoking the 2239 // bias of this particular object, so it's okay to continue in the 2240 // normal locking code.
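// (Hedged sketch of the reset attempted below:
//    new_mark = klass->prototype_header | (mark & age_mask);
//    CAS(&obj->mark, mark, new_mark);            // failure is benign, see above
//  either way the bias bit is gone afterwards, so falling through to the
//  CAS-based lock is safe.)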
2241 load_klass(temp_reg, obj_reg); 2242 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 2243 andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place); 2244 orr(temp_reg, temp_reg, temp2_reg); 2245 2246 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2247 2248 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg). 2249 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 2250 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 2251 /*where=*/obj_reg, 2252 MacroAssembler::MemBarAcq, 2253 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2254 2255 // reload markOop in mark_reg before continuing with lightweight locking 2256 ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg); 2257 2258 // Fall through to the normal CAS-based lock, because no matter what 2259 // the result of the above CAS, some thread must have succeeded in 2260 // removing the bias bit from the object's header. 2261 if (PrintBiasedLockingStatistics) { 2262 Label l; 2263 bne(cr_reg, l); 2264 load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg); 2265 lwzx(temp_reg, temp2_reg); 2266 addi(temp_reg, temp_reg, 1); 2267 stwx(temp_reg, temp2_reg); 2268 bind(l); 2269 } 2270 2271 bind(cas_label); 2272 } 2273 2274 void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) { 2275 // Check for biased locking unlock case, which is a no-op 2276 // Note: we do not have to check the thread ID for two reasons. 2277 // First, the interpreter checks for IllegalMonitorStateException at 2278 // a higher level. Second, if the bias was revoked while we held the 2279 // lock, the object could not be rebiased toward another thread, so 2280 // the bias bit would be clear. 
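// (Sketch: the test below is simply
//    if ((*mark_addr & biased_lock_mask) == biased_lock_pattern) goto done;
//  a still-biased header means the unlock is a no-op; no store and no atomic
//  are needed on this path.)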
2281 2282 ld(temp_reg, 0, mark_addr); 2283 andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 2284 2285 cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern); 2286 beq(cr_reg, done); 2287 } 2288 2289 // allocation (for C1) 2290 void MacroAssembler::eden_allocate( 2291 Register obj, // result: pointer to object after successful allocation 2292 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2293 int con_size_in_bytes, // object size in bytes if known at compile time 2294 Register t1, // temp register 2295 Register t2, // temp register 2296 Label& slow_case // continuation point if fast allocation fails 2297 ) { 2298 b(slow_case); 2299 } 2300 2301 void MacroAssembler::tlab_allocate( 2302 Register obj, // result: pointer to object after successful allocation 2303 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2304 int con_size_in_bytes, // object size in bytes if known at compile time 2305 Register t1, // temp register 2306 Label& slow_case // continuation point if fast allocation fails 2307 ) { 2308 // make sure arguments make sense 2309 assert_different_registers(obj, var_size_in_bytes, t1); 2310 assert(0 <= con_size_in_bytes && is_simm13(con_size_in_bytes), "illegal object size"); 2311 assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment"); 2312 2313 const Register new_top = t1; 2314 //verify_tlab(); not implemented 2315 2316 ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2317 ld(R0, in_bytes(JavaThread::tlab_current_end_offset()), R16_thread); 2318 if (var_size_in_bytes == noreg) { 2319 addi(new_top, obj, con_size_in_bytes); 2320 } else { 2321 add(new_top, obj, var_size_in_bytes); 2322 } 2323 cmpld(CCR0, new_top, R0); 2324 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case); 2325 2326 #ifdef ASSERT 2327 // make sure new free pointer is properly aligned 2328 { 2329 Label L; 2330 andi_(R0, new_top, MinObjAlignmentInBytesMask); 2331 beq(CCR0, L); 2332 stop("updated TLAB free is not properly aligned", 0x934); 2333 bind(L); 2334 } 2335 #endif // ASSERT 2336 2337 // update the tlab top pointer 2338 std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2339 //verify_tlab(); not implemented 2340 } 2341 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) { 2342 unimplemented("incr_allocated_bytes"); 2343 } 2344 2345 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset, 2346 int insts_call_instruction_offset, Register Rtoc) { 2347 // Start the stub. 2348 address stub = start_a_stub(64); 2349 if (stub == NULL) { return NULL; } // CodeCache full: bail out 2350 2351 // Create a trampoline stub relocation which relates this trampoline stub 2352 // with the call instruction at insts_call_instruction_offset in the 2353 // instructions code-section. 2354 relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset)); 2355 const int stub_start_offset = offset(); 2356 2357 // For java_to_interp stubs we use R11_scratch1 as scratch register 2358 // and in call trampoline stubs we use R12_scratch2. This way we 2359 // can distinguish them (see is_NativeCallTrampolineStub_at()). 
2360 Register reg_scratch = R12_scratch2; 2361 2362 // Now, create the trampoline stub's code: 2363 // - load the TOC 2364 // - load the call target from the constant pool 2365 // - call 2366 if (Rtoc == noreg) { 2367 calculate_address_from_global_toc(reg_scratch, method_toc()); 2368 Rtoc = reg_scratch; 2369 } 2370 2371 ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false); 2372 mtctr(reg_scratch); 2373 bctr(); 2374 2375 const address stub_start_addr = addr_at(stub_start_offset); 2376 2377 // Assert that the encoded destination_toc_offset can be identified and that it is correct. 2378 assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(), 2379 "encoded offset into the constant pool must match"); 2380 // Trampoline_stub_size should be good. 2381 assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size"); 2382 assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline"); 2383 2384 // End the stub. 2385 end_a_stub(); 2386 return stub; 2387 } 2388 2389 // TM on PPC64. 2390 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) { 2391 Label retry; 2392 bind(retry); 2393 ldarx(result, addr, /*hint*/ false); 2394 addi(result, result, simm16); 2395 stdcx_(result, addr); 2396 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2397 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2398 } else { 2399 bne( CCR0, retry); // stXcx_ sets CCR0 2400 } 2401 } 2402 2403 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) { 2404 Label retry; 2405 bind(retry); 2406 lwarx(result, addr, /*hint*/ false); 2407 ori(result, result, uimm16); 2408 stwcx_(result, addr); 2409 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2410 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2411 } else { 2412 bne( CCR0, retry); // stXcx_ sets CCR0 2413 } 2414 } 2415 2416 #if INCLUDE_RTM_OPT 2417 2418 // Update rtm_counters based on abort status 2419 // input: abort_status 2420 // rtm_counters (RTMLockingCounters*) 2421 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) { 2422 // Mapping to keep PreciseRTMLockingStatistics similar to x86. 2423 // x86 ppc (! means inverted, ? means not the same) 2424 // 0 31 Set if abort caused by XABORT instruction. 2425 // 1 ! 7 If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set. 2426 // 2 13 Set if another logical processor conflicted with a memory address that was part of the transaction that aborted. 2427 // 3 10 Set if an internal buffer overflowed. 2428 // 4 ?12 Set if a debug breakpoint was hit. 2429 // 5 ?32 Set if an abort occurred during execution of a nested transaction. 2430 const int tm_failure_bit[] = {Assembler::tm_tabort, // Note: Seems like signal handler sets this, too. 2431 Assembler::tm_failure_persistent, // inverted: transient 2432 Assembler::tm_trans_cf, 2433 Assembler::tm_footprint_of, 2434 Assembler::tm_non_trans_cf, 2435 Assembler::tm_suspended}; 2436 const bool tm_failure_inv[] = {false, true, false, false, false, false}; 2437 assert(sizeof(tm_failure_bit)/sizeof(int) == RTMLockingCounters::ABORT_STATUS_LIMIT, "adapt mapping!"); 2438 2439 const Register addr_Reg = R0; 2440 // Keep track of offset to where rtm_counters_Reg had pointed to. 
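// (The counter updates below are plain load/add/store, i.e. roughly
//    rtm_counters->abort_count++;
//  deliberately non-atomic: a lost update merely undercounts statistics,
//  which is acceptable here.)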
2441 int counters_offs = RTMLockingCounters::abort_count_offset(); 2442 addi(addr_Reg, rtm_counters_Reg, counters_offs); 2443 const Register temp_Reg = rtm_counters_Reg; 2444 2445 //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically 2446 ldx(temp_Reg, addr_Reg); 2447 addi(temp_Reg, temp_Reg, 1); 2448 stdx(temp_Reg, addr_Reg); 2449 2450 if (PrintPreciseRTMLockingStatistics) { 2451 int counters_offs_delta = RTMLockingCounters::abortX_count_offset() - counters_offs; 2452 2453 //mftexasr(abort_status); done by caller 2454 for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) { 2455 counters_offs += counters_offs_delta; 2456 li(temp_Reg, counters_offs_delta); // can't use addi with R0 2457 add(addr_Reg, addr_Reg, temp_Reg); // point to next counter 2458 counters_offs_delta = sizeof(uintx); 2459 2460 Label check_abort; 2461 rldicr_(temp_Reg, abort_status, tm_failure_bit[i], 0); 2462 if (tm_failure_inv[i]) { 2463 bne(CCR0, check_abort); 2464 } else { 2465 beq(CCR0, check_abort); 2466 } 2467 //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically 2468 ldx(temp_Reg, addr_Reg); 2469 addi(temp_Reg, temp_Reg, 1); 2470 stdx(temp_Reg, addr_Reg); 2471 bind(check_abort); 2472 } 2473 } 2474 li(temp_Reg, -counters_offs); // can't use addi with R0 2475 add(rtm_counters_Reg, addr_Reg, temp_Reg); // restore 2476 } 2477 2478 // Branch if (random & (count-1) != 0), count is 2^n 2479 // tmp and CR0 are killed 2480 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) { 2481 mftb(tmp); 2482 andi_(tmp, tmp, count-1); 2483 bne(CCR0, brLabel); 2484 } 2485 2486 // Perform abort ratio calculation, set no_rtm bit if high ratio. 2487 // input: rtm_counters_Reg (RTMLockingCounters* address) - KILLED 2488 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg, 2489 RTMLockingCounters* rtm_counters, 2490 Metadata* method_data) { 2491 Label L_done, L_check_always_rtm1, L_check_always_rtm2; 2492 2493 if (RTMLockingCalculationDelay > 0) { 2494 // Delay calculation. 2495 ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr()); 2496 cmpdi(CCR0, rtm_counters_Reg, 0); 2497 beq(CCR0, L_done); 2498 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2499 } 2500 // Abort ratio calculation only if abort_count > RTMAbortThreshold. 2501 // Aborted transactions = abort_count * 100 2502 // All transactions = total_count * RTMTotalCountIncrRate 2503 // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio) 2504 ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg); 2505 if (is_simm(RTMAbortThreshold, 16)) { // cmpdi can handle 16bit immediate only. 2506 cmpdi(CCR0, R0, RTMAbortThreshold); 2507 blt(CCR0, L_check_always_rtm2); // reload of rtm_counters_Reg not necessary 2508 } else { 2509 load_const_optimized(rtm_counters_Reg, RTMAbortThreshold); 2510 cmpd(CCR0, R0, rtm_counters_Reg); 2511 blt(CCR0, L_check_always_rtm1); // reload of rtm_counters_Reg required 2512 } 2513 mulli(R0, R0, 100); 2514 2515 const Register tmpReg = rtm_counters_Reg; 2516 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2517 mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16 2518 mulli(tmpReg, tmpReg, RTMAbortRatio); // allowable range: int16 2519 cmpd(CCR0, R0, tmpReg); 2520 blt(CCR0, L_check_always_rtm1); // jump to reload 2521 if (method_data != NULL) { 2522 // Set rtm_state to "no rtm" in MDO. 
2523 // Not using a metadata relocation. Method and Class Loader are kept alive anyway. 2524 // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.) 2525 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2526 atomic_ori_int(R0, tmpReg, NoRTM); 2527 } 2528 b(L_done); 2529 2530 bind(L_check_always_rtm1); 2531 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2532 bind(L_check_always_rtm2); 2533 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2534 int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate; 2535 if (is_simm(thresholdValue, 16)) { // cmpdi can handle 16bit immediate only. 2536 cmpdi(CCR0, tmpReg, thresholdValue); 2537 } else { 2538 load_const_optimized(R0, thresholdValue); 2539 cmpd(CCR0, tmpReg, R0); 2540 } 2541 blt(CCR0, L_done); 2542 if (method_data != NULL) { 2543 // Set rtm_state to "always rtm" in MDO. 2544 // Not using a metadata relocation. See above. 2545 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2546 atomic_ori_int(R0, tmpReg, UseRTM); 2547 } 2548 bind(L_done); 2549 } 2550 2551 // Update counters and perform abort ratio calculation. 2552 // input: abort_status_Reg 2553 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg, 2554 RTMLockingCounters* rtm_counters, 2555 Metadata* method_data, 2556 bool profile_rtm) { 2557 2558 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2559 // Update rtm counters based on state at abort. 2560 // Reads abort_status_Reg, updates flags. 2561 assert_different_registers(abort_status_Reg, temp_Reg); 2562 load_const_optimized(temp_Reg, (address)rtm_counters, R0); 2563 rtm_counters_update(abort_status_Reg, temp_Reg); 2564 if (profile_rtm) { 2565 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2566 rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data); 2567 } 2568 } 2569 2570 // Retry on abort if abort's status indicates non-persistent failure. 2571 // inputs: retry_count_Reg 2572 // : abort_status_Reg 2573 // output: retry_count_Reg decremented by 1 2574 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, 2575 Label& retryLabel, Label* checkRetry) { 2576 Label doneRetry; 2577 rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0); 2578 bne(CCR0, doneRetry); 2579 if (checkRetry) { bind(*checkRetry); } 2580 addic_(retry_count_Reg, retry_count_Reg, -1); 2581 blt(CCR0, doneRetry); 2582 smt_yield(); // Can't use wait(). No permission (SIGILL). 2583 b(retryLabel); 2584 bind(doneRetry); 2585 } 2586 2587 // Spin and retry if lock is busy. 2588 // inputs: owner_addr_Reg (monitor address) 2589 // : retry_count_Reg 2590 // output: retry_count_Reg decremented by 1 2591 // CTR is killed 2592 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) { 2593 Label SpinLoop, doneRetry; 2594 addic_(retry_count_Reg, retry_count_Reg, -1); 2595 blt(CCR0, doneRetry); 2596 2597 if (RTMSpinLoopCount > 1) { 2598 li(R0, RTMSpinLoopCount); 2599 mtctr(R0); 2600 } 2601 2602 bind(SpinLoop); 2603 smt_yield(); // Can't use waitrsv(). No permission (SIGILL). 2604 2605 if (RTMSpinLoopCount > 1) { 2606 bdz(retryLabel); 2607 ld(R0, 0, owner_addr_Reg); 2608 cmpdi(CCR0, R0, 0); 2609 bne(CCR0, SpinLoop); 2610 } 2611 2612 b(retryLabel); 2613 2614 bind(doneRetry); 2615 } 2616 2617 // Use RTM for normal stack locks. 
2618 // Input: objReg (object to lock) 2619 void MacroAssembler::rtm_stack_locking(ConditionRegister flag, 2620 Register obj, Register mark_word, Register tmp, 2621 Register retry_on_abort_count_Reg, 2622 RTMLockingCounters* stack_rtm_counters, 2623 Metadata* method_data, bool profile_rtm, 2624 Label& DONE_LABEL, Label& IsInflated) { 2625 assert(UseRTMForStackLocks, "why call this otherwise?"); 2626 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); 2627 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2628 2629 if (RTMRetryCount > 0) { 2630 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 2631 bind(L_rtm_retry); 2632 } 2633 andi_(R0, mark_word, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased 2634 bne(CCR0, IsInflated); 2635 2636 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2637 Label L_noincrement; 2638 if (RTMTotalCountIncrRate > 1) { 2639 branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement); 2640 } 2641 assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM"); 2642 load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0); 2643 //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically 2644 ldx(mark_word, tmp); 2645 addi(mark_word, mark_word, 1); 2646 stdx(mark_word, tmp); 2647 bind(L_noincrement); 2648 } 2649 tbegin_(); 2650 beq(CCR0, L_on_abort); 2651 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // Reload in transaction, conflicts need to be tracked. 2652 andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits 2653 cmpwi(flag, R0, markOopDesc::unlocked_value); // bits = 001 unlocked 2654 beq(flag, DONE_LABEL); // all done if unlocked 2655 2656 if (UseRTMXendForLockBusy) { 2657 tend_(); 2658 b(L_decrement_retry); 2659 } else { 2660 tabort_(); 2661 } 2662 bind(L_on_abort); 2663 const Register abort_status_Reg = tmp; 2664 mftexasr(abort_status_Reg); 2665 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2666 rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm); 2667 } 2668 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload 2669 if (RTMRetryCount > 0) { 2670 // Retry on lock abort if abort status is not permanent. 2671 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry); 2672 } else { 2673 bind(L_decrement_retry); 2674 } 2675 } 2676 2677 // Use RTM for inflating locks 2678 // inputs: obj (object to lock) 2679 // mark_word (current header - KILLED) 2680 // boxReg (on-stack box address (displaced header location) - KILLED) 2681 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag, 2682 Register obj, Register mark_word, Register boxReg, 2683 Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg, 2684 RTMLockingCounters* rtm_counters, 2685 Metadata* method_data, bool profile_rtm, 2686 Label& DONE_LABEL) { 2687 assert(UseRTMLocking, "why call this otherwise?"); 2688 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2689 // Clean monitor_value bit to get valid pointer. 2690 int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value; 2691 2692 // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark(). 
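// (I.e. box->displaced_header = box; any non-null value marks the box as in use
//  while the lock is held transactionally, and boxReg is simply a non-null value
//  already at hand.)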
2693 std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg); 2694 const Register tmpReg = boxReg; 2695 const Register owner_addr_Reg = mark_word; 2696 addi(owner_addr_Reg, mark_word, owner_offset); 2697 2698 if (RTMRetryCount > 0) { 2699 load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy. 2700 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort. 2701 bind(L_rtm_retry); 2702 } 2703 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2704 Label L_noincrement; 2705 if (RTMTotalCountIncrRate > 1) { 2706 branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement); 2707 } 2708 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2709 load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg); 2710 //atomic_inc_ptr(R0, tmpReg); We don't increment atomically 2711 ldx(tmpReg, R0); 2712 addi(tmpReg, tmpReg, 1); 2713 stdx(tmpReg, R0); 2714 bind(L_noincrement); 2715 } 2716 tbegin_(); 2717 beq(CCR0, L_on_abort); 2718 // We don't reload mark word. Will only be reset at safepoint. 2719 ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked. 2720 cmpdi(flag, R0, 0); 2721 beq(flag, DONE_LABEL); 2722 2723 if (UseRTMXendForLockBusy) { 2724 tend_(); 2725 b(L_decrement_retry); 2726 } else { 2727 tabort_(); 2728 } 2729 bind(L_on_abort); 2730 const Register abort_status_Reg = tmpReg; 2731 mftexasr(abort_status_Reg); 2732 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2733 rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm); 2734 // Restore owner_addr_Reg 2735 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); 2736 #ifdef ASSERT 2737 andi_(R0, mark_word, markOopDesc::monitor_value); 2738 asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint. 2739 #endif 2740 addi(owner_addr_Reg, mark_word, owner_offset); 2741 } 2742 if (RTMRetryCount > 0) { 2743 // Retry on lock abort if abort status is not permanent. 2744 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); 2745 } 2746 2747 // Appears unlocked - try to swing _owner from null to non-null. 2748 cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg, 2749 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2750 MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true); 2751 2752 if (RTMRetryCount > 0) { 2753 // success done else retry 2754 b(DONE_LABEL); 2755 bind(L_decrement_retry); 2756 // Spin and retry if lock is busy. 2757 rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry); 2758 } else { 2759 bind(L_decrement_retry); 2760 } 2761 } 2762 2763 #endif // INCLUDE_RTM_OPT 2764 2765 // "The box" is the space on the stack where we copy the object mark. 2766 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box, 2767 Register temp, Register displaced_header, Register current_header, 2768 bool try_bias, 2769 RTMLockingCounters* rtm_counters, 2770 RTMLockingCounters* stack_rtm_counters, 2771 Metadata* method_data, 2772 bool use_rtm, bool profile_rtm) { 2773 assert_different_registers(oop, box, temp, displaced_header, current_header); 2774 assert(flag != CCR0, "bad condition register"); 2775 Label cont; 2776 Label object_has_monitor; 2777 Label cas_failed; 2778 2779 // Load markOop from object into displaced_header. 
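// (Overview sketch of the stack-locking fast path emitted below, in C-like form and
//  assuming the usual markOop encoding where an unlocked mark ends in binary 01:
//    mark = obj->mark | unlocked_value;
//    box->displaced_header = mark;
//    if (CAS(&obj->mark, mark, box) == mark) goto locked;   // flag == EQ
//    // otherwise: recursive stack lock or inflated monitor, handled below)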
2780 ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop); 2781 2782 2783 // Always do locking in runtime. 2784 if (EmitSync & 0x01) { 2785 cmpdi(flag, oop, 0); // Oop can't be 0 here => always false. 2786 return; 2787 } 2788 2789 if (try_bias) { 2790 biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont); 2791 } 2792 2793 #if INCLUDE_RTM_OPT 2794 if (UseRTMForStackLocks && use_rtm) { 2795 rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header, 2796 stack_rtm_counters, method_data, profile_rtm, 2797 cont, object_has_monitor); 2798 } 2799 #endif // INCLUDE_RTM_OPT 2800 2801 // Handle existing monitor. 2802 if ((EmitSync & 0x02) == 0) { 2803 // The object has an existing monitor iff (mark & monitor_value) != 0. 2804 andi_(temp, displaced_header, markOopDesc::monitor_value); 2805 bne(CCR0, object_has_monitor); 2806 } 2807 2808 // Set displaced_header to be (markOop of object | UNLOCK_VALUE). 2809 ori(displaced_header, displaced_header, markOopDesc::unlocked_value); 2810 2811 // Load Compare Value application register. 2812 2813 // Initialize the box. (Must happen before we update the object mark!) 2814 std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2815 2816 // Must fence, otherwise, preceding store(s) may float below cmpxchg. 2817 // Compare object markOop with mark and if equal exchange scratch1 with object markOop. 2818 cmpxchgd(/*flag=*/flag, 2819 /*current_value=*/current_header, 2820 /*compare_value=*/displaced_header, 2821 /*exchange_value=*/box, 2822 /*where=*/oop, 2823 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2824 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2825 noreg, 2826 &cas_failed, 2827 /*check without membar and ldarx first*/true); 2828 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2829 2830 // If the compare-and-exchange succeeded, then we found an unlocked 2831 // object and we have now locked it. 2832 b(cont); 2833 2834 bind(cas_failed); 2835 // We did not see an unlocked object so try the fast recursive case. 2836 2837 // Check if the owner is self by comparing the value in the markOop of object 2838 // (current_header) with the stack pointer. 2839 sub(current_header, current_header, R1_SP); 2840 load_const_optimized(temp, ~(os::vm_page_size()-1) | markOopDesc::lock_mask_in_place); 2841 2842 and_(R0/*==0?*/, current_header, temp); 2843 // If condition is true we are cont and hence we can store 0 as the 2844 // displaced header in the box, which indicates that it is a recursive lock. 2845 mcrf(flag,CCR0); 2846 std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box); 2847 2848 // Handle existing monitor. 2849 if ((EmitSync & 0x02) == 0) { 2850 b(cont); 2851 2852 bind(object_has_monitor); 2853 // The object's monitor m is unlocked iff m->owner == NULL, 2854 // otherwise m->owner may contain a thread or a stack address. 2855 2856 #if INCLUDE_RTM_OPT 2857 // Use the same RTM locking code in 32- and 64-bit VM. 2858 if (use_rtm) { 2859 rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header, 2860 rtm_counters, method_data, profile_rtm, cont); 2861 } else { 2862 #endif // INCLUDE_RTM_OPT 2863 2864 // Try to CAS m->owner from NULL to current thread. 
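// (Sketch: temp = &m->_owner; if (CAS(temp, NULL, R16_thread) == NULL) we own the
//  inflated monitor and flag is EQ; otherwise flag is NE and the slow path takes over.)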
2865 addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value); 2866 cmpxchgd(/*flag=*/flag, 2867 /*current_value=*/current_header, 2868 /*compare_value=*/(intptr_t)0, 2869 /*exchange_value=*/R16_thread, 2870 /*where=*/temp, 2871 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2872 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2873 2874 // Store a non-null value into the box. 2875 std(box, BasicLock::displaced_header_offset_in_bytes(), box); 2876 2877 # ifdef ASSERT 2878 bne(flag, cont); 2879 // We have acquired the monitor, check some invariants. 2880 addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes()); 2881 // Invariant 1: _recursions should be 0. 2882 //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size"); 2883 asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp, 2884 "monitor->_recursions should be 0", -1); 2885 # endif 2886 2887 #if INCLUDE_RTM_OPT 2888 } // use_rtm() 2889 #endif 2890 } 2891 2892 bind(cont); 2893 // flag == EQ indicates success 2894 // flag == NE indicates failure 2895 } 2896 2897 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box, 2898 Register temp, Register displaced_header, Register current_header, 2899 bool try_bias, bool use_rtm) { 2900 assert_different_registers(oop, box, temp, displaced_header, current_header); 2901 assert(flag != CCR0, "bad condition register"); 2902 Label cont; 2903 Label object_has_monitor; 2904 2905 // Always do locking in runtime. 2906 if (EmitSync & 0x01) { 2907 cmpdi(flag, oop, 0); // Oop can't be 0 here => always false. 2908 return; 2909 } 2910 2911 if (try_bias) { 2912 biased_locking_exit(flag, oop, current_header, cont); 2913 } 2914 2915 #if INCLUDE_RTM_OPT 2916 if (UseRTMForStackLocks && use_rtm) { 2917 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); 2918 Label L_regular_unlock; 2919 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); // fetch markword 2920 andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits 2921 cmpwi(flag, R0, markOopDesc::unlocked_value); // bits = 001 unlocked 2922 bne(flag, L_regular_unlock); // else RegularLock 2923 tend_(); // otherwise end... 2924 b(cont); // ... and we're done 2925 bind(L_regular_unlock); 2926 } 2927 #endif 2928 2929 // Find the lock address and load the displaced header from the stack. 2930 ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2931 2932 // If the displaced header is 0, we have a recursive unlock. 2933 cmpdi(flag, displaced_header, 0); 2934 beq(flag, cont); 2935 2936 // Handle existing monitor. 2937 if ((EmitSync & 0x02) == 0) { 2938 // The object has an existing monitor iff (mark & monitor_value) != 0. 2939 RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done 2940 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); 2941 andi_(R0, current_header, markOopDesc::monitor_value); 2942 bne(CCR0, object_has_monitor); 2943 } 2944 2945 // Check if it is still a lightweight lock; this is true if we see 2946 // the stack address of the BasicLock in the markOop of the object. 2947 // Cmpxchg sets flag to cmpd(current_header, box).
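// (C-like sketch of the lightweight unlock below:
//    if (CAS(&obj->mark, box, displaced_header) == box) goto unlocked;   // flag == EQ
//  i.e. the header is swung back from the BasicLock address to the displaced mark word.)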
2948 cmpxchgd(/*flag=*/flag, 2949 /*current_value=*/current_header, 2950 /*compare_value=*/box, 2951 /*exchange_value=*/displaced_header, 2952 /*where=*/oop, 2953 MacroAssembler::MemBarRel, 2954 MacroAssembler::cmpxchgx_hint_release_lock(), 2955 noreg, 2956 &cont); 2957 2958 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2959 2960 // Handle existing monitor. 2961 if ((EmitSync & 0x02) == 0) { 2962 b(cont); 2963 2964 bind(object_has_monitor); 2965 addi(current_header, current_header, -markOopDesc::monitor_value); // monitor 2966 ld(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); 2967 2968 // It's inflated. 2969 #if INCLUDE_RTM_OPT 2970 if (use_rtm) { 2971 Label L_regular_inflated_unlock; 2972 // Clean monitor_value bit to get valid pointer 2973 cmpdi(flag, temp, 0); 2974 bne(flag, L_regular_inflated_unlock); 2975 tend_(); 2976 b(cont); 2977 bind(L_regular_inflated_unlock); 2978 } 2979 #endif 2980 2981 ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header); 2982 xorr(temp, R16_thread, temp); // Will be 0 if we are the owner. 2983 orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions. 2984 cmpdi(flag, temp, 0); 2985 bne(flag, cont); 2986 2987 ld(temp, ObjectMonitor::EntryList_offset_in_bytes(), current_header); 2988 ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header); 2989 orr(temp, temp, displaced_header); // Will be 0 if both are 0. 2990 cmpdi(flag, temp, 0); 2991 bne(flag, cont); 2992 release(); 2993 std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); 2994 } 2995 2996 bind(cont); 2997 // flag == EQ indicates success 2998 // flag == NE indicates failure 2999 } 3000 3001 // Write serialization page so VM thread can do a pseudo remote membar. 3002 // We use the current thread pointer to calculate a thread specific 3003 // offset to write to within the page. This minimizes bus traffic 3004 // due to cache line collision. 3005 void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) { 3006 srdi(tmp2, thread, os::get_serialize_page_shift_count()); 3007 3008 int mask = os::vm_page_size() - sizeof(int); 3009 if (Assembler::is_simm(mask, 16)) { 3010 andi(tmp2, tmp2, mask); 3011 } else { 3012 lis(tmp1, (int)((signed short) (mask >> 16))); 3013 ori(tmp1, tmp1, mask & 0x0000ffff); 3014 andr(tmp2, tmp2, tmp1); 3015 } 3016 3017 load_const(tmp1, (long) os::get_memory_serialize_page()); 3018 release(); 3019 stwx(R0, tmp1, tmp2); 3020 } 3021 3022 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp_reg) { 3023 if (SafepointMechanism::uses_thread_local_poll()) { 3024 ld(temp_reg, in_bytes(Thread::polling_page_offset()), R16_thread); 3025 // Armed page has poll_bit set. 3026 andi_(temp_reg, temp_reg, SafepointMechanism::poll_bit()); 3027 } else { 3028 lwz(temp_reg, (RegisterOrConstant)(intptr_t)SafepointSynchronize::address_of_state()); 3029 cmpwi(CCR0, temp_reg, SafepointSynchronize::_not_synchronized); 3030 } 3031 bne(CCR0, slow_path); 3032 } 3033 3034 3035 // GC barrier helper macros 3036 3037 // Write the card table byte if needed. 
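// (Card marking in C-like form, assuming the usual card table scheme:
//    byte_map_base[(uintptr_t)store_addr >> card_shift] = 0;   // 0 == dirty here
//  note that the store address, not the new value, selects the card.)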
3038 void MacroAssembler::card_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp) { 3039 CardTableBarrierSet* bs = 3040 barrier_set_cast<CardTableBarrierSet>(Universe::heap()->barrier_set()); 3041 assert(bs->kind() == BarrierSet::CardTableBarrierSet, "wrong barrier"); 3042 CardTable* ct = bs->card_table(); 3043 #ifdef ASSERT 3044 cmpdi(CCR0, Rnew_val, 0); 3045 asm_assert_ne("null oop not allowed", 0x321); 3046 #endif 3047 card_table_write(ct->byte_map_base(), Rtmp, Rstore_addr); 3048 } 3049 3050 // Write the card table byte. 3051 void MacroAssembler::card_table_write(jbyte* byte_map_base, Register Rtmp, Register Robj) { 3052 assert_different_registers(Robj, Rtmp, R0); 3053 load_const_optimized(Rtmp, (address)byte_map_base, R0); 3054 srdi(Robj, Robj, CardTable::card_shift); 3055 li(R0, 0); // dirty 3056 if (UseConcMarkSweepGC) membar(Assembler::StoreStore); 3057 stbx(R0, Rtmp, Robj); 3058 } 3059 3060 // Kills R31 if value is a volatile register. 3061 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, bool needs_frame) { 3062 Label done; 3063 cmpdi(CCR0, value, 0); 3064 beq(CCR0, done); // Use NULL as-is. 3065 3066 clrrdi(tmp1, value, JNIHandles::weak_tag_size); 3067 #if INCLUDE_ALL_GCS 3068 if (UseG1GC) { andi_(tmp2, value, JNIHandles::weak_tag_mask); } 3069 #endif 3070 ld(value, 0, tmp1); // Resolve (untagged) jobject. 3071 3072 #if INCLUDE_ALL_GCS 3073 if (UseG1GC) { 3074 Label not_weak; 3075 beq(CCR0, not_weak); // Test for jweak tag. 3076 verify_oop(value); 3077 g1_write_barrier_pre(noreg, // obj 3078 noreg, // offset 3079 value, // pre_val 3080 tmp1, tmp2, needs_frame); 3081 bind(not_weak); 3082 } 3083 #endif // INCLUDE_ALL_GCS 3084 verify_oop(value); 3085 bind(done); 3086 } 3087 3088 #if INCLUDE_ALL_GCS 3089 // General G1 pre-barrier generator. 3090 // Goal: record the previous value if it is not null. 3091 void MacroAssembler::g1_write_barrier_pre(Register Robj, RegisterOrConstant offset, Register Rpre_val, 3092 Register Rtmp1, Register Rtmp2, bool needs_frame) { 3093 Label runtime, filtered; 3094 3095 // Is marking active? 3096 if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) { 3097 lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread); 3098 } else { 3099 guarantee(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption"); 3100 lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread); 3101 } 3102 cmpdi(CCR0, Rtmp1, 0); 3103 beq(CCR0, filtered); 3104 3105 // Do we need to load the previous value? 3106 if (Robj != noreg) { 3107 // Load the previous value... 3108 if (UseCompressedOops) { 3109 lwz(Rpre_val, offset, Robj); 3110 } else { 3111 ld(Rpre_val, offset, Robj); 3112 } 3113 // Previous value has been loaded into Rpre_val. 3114 } 3115 assert(Rpre_val != noreg, "must have a real register"); 3116 3117 // Is the previous value null? 3118 cmpdi(CCR0, Rpre_val, 0); 3119 beq(CCR0, filtered); 3120 3121 if (Robj != noreg && UseCompressedOops) { 3122 decode_heap_oop_not_null(Rpre_val); 3123 } 3124 3125 // OK, it's not filtered, so we'll need to call enqueue. In the normal 3126 // case, pre_val will be a scratch G-reg, but there are some cases in 3127 // which it's an O-reg. In the first case, do a normal call. In the 3128 // latter, do a save here and call the frameless version. 3129 3130 // Can we store original value in the thread's buffer? 3131 // Is index == 0? 
3132 // (The index field is typed as size_t.) 3133 const Register Rbuffer = Rtmp1, Rindex = Rtmp2; 3134 3135 ld(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread); 3136 cmpdi(CCR0, Rindex, 0); 3137 beq(CCR0, runtime); // If index == 0, goto runtime. 3138 ld(Rbuffer, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_buf()), R16_thread); 3139 3140 addi(Rindex, Rindex, -wordSize); // Decrement index. 3141 std(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread); 3142 3143 // Record the previous value. 3144 stdx(Rpre_val, Rbuffer, Rindex); 3145 b(filtered); 3146 3147 bind(runtime); 3148 3149 // May need to preserve LR. Also needed if current frame is not compatible with C calling convention. 3150 if (needs_frame) { 3151 save_LR_CR(Rtmp1); 3152 push_frame_reg_args(0, Rtmp2); 3153 } 3154 3155 if (Rpre_val->is_volatile() && Robj == noreg) mr(R31, Rpre_val); // Save pre_val across C call if it was preloaded. 3156 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), Rpre_val, R16_thread); 3157 if (Rpre_val->is_volatile() && Robj == noreg) mr(Rpre_val, R31); // restore 3158 3159 if (needs_frame) { 3160 pop_frame(); 3161 restore_LR_CR(Rtmp1); 3162 } 3163 3164 bind(filtered); 3165 } 3166 3167 // General G1 post-barrier generator 3168 // Store cross-region card. 3169 void MacroAssembler::g1_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp1, Register Rtmp2, Register Rtmp3, Label *filtered_ext) { 3170 Label runtime, filtered_int; 3171 Label& filtered = (filtered_ext != NULL) ? *filtered_ext : filtered_int; 3172 assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2); 3173 3174 G1BarrierSet* bs = 3175 barrier_set_cast<G1BarrierSet>(Universe::heap()->barrier_set()); 3176 CardTable* ct = bs->card_table(); 3177 3178 // Does store cross heap regions? 3179 if (G1RSBarrierRegionFilter) { 3180 xorr(Rtmp1, Rstore_addr, Rnew_val); 3181 srdi_(Rtmp1, Rtmp1, HeapRegion::LogOfHRGrainBytes); 3182 beq(CCR0, filtered); 3183 } 3184 3185 // Crosses regions, storing NULL? 3186 #ifdef ASSERT 3187 cmpdi(CCR0, Rnew_val, 0); 3188 asm_assert_ne("null oop not allowed (G1)", 0x322); // Checked by caller on PPC64, so following branch is obsolete: 3189 //beq(CCR0, filtered); 3190 #endif 3191 3192 // Storing region crossing non-NULL, is card already dirty? 3193 assert(sizeof(*ct->byte_map_base()) == sizeof(jbyte), "adjust this code"); 3194 const Register Rcard_addr = Rtmp1; 3195 Register Rbase = Rtmp2; 3196 load_const_optimized(Rbase, (address)ct->byte_map_base(), /*temp*/ Rtmp3); 3197 3198 srdi(Rcard_addr, Rstore_addr, CardTable::card_shift); 3199 3200 // Get the address of the card. 3201 lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr); 3202 cmpwi(CCR0, Rtmp3, (int)G1CardTable::g1_young_card_val()); 3203 beq(CCR0, filtered); 3204 3205 membar(Assembler::StoreLoad); 3206 lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr); // Reload after membar. 3207 cmpwi(CCR0, Rtmp3 /* card value */, CardTable::dirty_card_val()); 3208 beq(CCR0, filtered); 3209 3210 // Storing a region crossing, non-NULL oop, card is clean. 3211 // Dirty card and log. 3212 li(Rtmp3, CardTable::dirty_card_val()); 3213 //release(); // G1: oops are allowed to get visible after dirty marking. 3214 stbx(Rtmp3, Rbase, Rcard_addr); 3215 3216 add(Rcard_addr, Rbase, Rcard_addr); // This is the address which needs to get enqueued. 
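// The enqueue that follows is the usual queue fast path; as an
// illustrative C sketch (descriptive names only, not VM identifiers):
//   if (index == 0) goto runtime;           // buffer full, call into VM
//   index -= wordSize;                      // index is a byte offset
//   *(intptr_t*)(buf + index) = card_addr;  // log the card address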
3217 Rbase = noreg; // end of lifetime 3218 3219 const Register Rqueue_index = Rtmp2, 3220 Rqueue_buf = Rtmp3; 3221 ld(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread); 3222 cmpdi(CCR0, Rqueue_index, 0); 3223 beq(CCR0, runtime); // index == 0 then jump to runtime 3224 ld(Rqueue_buf, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_buf()), R16_thread); 3225 3226 addi(Rqueue_index, Rqueue_index, -wordSize); // decrement index 3227 std(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread); 3228 3229 stdx(Rcard_addr, Rqueue_buf, Rqueue_index); // store card 3230 b(filtered); 3231 3232 bind(runtime); 3233 3234 // Save the live input values. 3235 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), Rcard_addr, R16_thread); 3236 3237 bind(filtered_int); 3238 } 3239 #endif // INCLUDE_ALL_GCS 3240 3241 // Values for last_Java_pc, and last_Java_sp must comply to the rules 3242 // in frame_ppc.hpp. 3243 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) { 3244 // Always set last_Java_pc and flags first because once last_Java_sp 3245 // is visible has_last_Java_frame is true and users will look at the 3246 // rest of the fields. (Note: flags should always be zero before we 3247 // get here so doesn't need to be set.) 3248 3249 // Verify that last_Java_pc was zeroed on return to Java 3250 asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread, 3251 "last_Java_pc not zeroed before leaving Java", 0x200); 3252 3253 // When returning from calling out from Java mode the frame anchor's 3254 // last_Java_pc will always be set to NULL. It is set here so that 3255 // if we are doing a call to native (not VM) that we capture the 3256 // known pc and don't have to rely on the native call having a 3257 // standard frame linkage where we can find the pc. 3258 if (last_Java_pc != noreg) 3259 std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 3260 3261 // Set last_Java_sp last. 3262 std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 3263 } 3264 3265 void MacroAssembler::reset_last_Java_frame(void) { 3266 asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()), 3267 R16_thread, "SP was not set, still zero", 0x202); 3268 3269 BLOCK_COMMENT("reset_last_Java_frame {"); 3270 li(R0, 0); 3271 3272 // _last_Java_sp = 0 3273 std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 3274 3275 // _last_Java_pc = 0 3276 std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 3277 BLOCK_COMMENT("} reset_last_Java_frame"); 3278 } 3279 3280 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) { 3281 assert_different_registers(sp, tmp1); 3282 3283 // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via 3284 // TOP_IJAVA_FRAME_ABI. 3285 // FIXME: assert that we really have a TOP_IJAVA_FRAME here! 
3286 address entry = pc(); 3287 load_const_optimized(tmp1, entry); 3288 3289 set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1); 3290 } 3291 3292 void MacroAssembler::get_vm_result(Register oop_result) { 3293 // Read: 3294 // R16_thread 3295 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3296 // 3297 // Updated: 3298 // oop_result 3299 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3300 3301 verify_thread(); 3302 3303 ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3304 li(R0, 0); 3305 std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3306 3307 verify_oop(oop_result); 3308 } 3309 3310 void MacroAssembler::get_vm_result_2(Register metadata_result) { 3311 // Read: 3312 // R16_thread 3313 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3314 // 3315 // Updated: 3316 // metadata_result 3317 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3318 3319 ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3320 li(R0, 0); 3321 std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3322 } 3323 3324 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) { 3325 Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided. 3326 if (Universe::narrow_klass_base() != 0) { 3327 // Use dst as temp if it is free. 3328 sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0); 3329 current = dst; 3330 } 3331 if (Universe::narrow_klass_shift() != 0) { 3332 srdi(dst, current, Universe::narrow_klass_shift()); 3333 current = dst; 3334 } 3335 return current; 3336 } 3337 3338 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) { 3339 if (UseCompressedClassPointers) { 3340 Register compressedKlass = encode_klass_not_null(ck, klass); 3341 stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop); 3342 } else { 3343 std(klass, oopDesc::klass_offset_in_bytes(), dst_oop); 3344 } 3345 } 3346 3347 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) { 3348 if (UseCompressedClassPointers) { 3349 if (val == noreg) { 3350 val = R0; 3351 li(val, 0); 3352 } 3353 stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed 3354 } 3355 } 3356 3357 int MacroAssembler::instr_size_for_decode_klass_not_null() { 3358 if (!UseCompressedClassPointers) return 0; 3359 int num_instrs = 1; // shift or move 3360 if (Universe::narrow_klass_base() != 0) num_instrs = 7; // shift + load const + add 3361 return num_instrs * BytesPerInstWord; 3362 } 3363 3364 void MacroAssembler::decode_klass_not_null(Register dst, Register src) { 3365 assert(dst != R0, "Dst reg may not be R0, as R0 is used here."); 3366 if (src == noreg) src = dst; 3367 Register shifted_src = src; 3368 if (Universe::narrow_klass_shift() != 0 || 3369 Universe::narrow_klass_base() == 0 && src != dst) { // Move required. 3370 shifted_src = dst; 3371 sldi(shifted_src, src, Universe::narrow_klass_shift()); 3372 } 3373 if (Universe::narrow_klass_base() != 0) { 3374 add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0); 3375 } 3376 } 3377 3378 void MacroAssembler::load_klass(Register dst, Register src) { 3379 if (UseCompressedClassPointers) { 3380 lwz(dst, oopDesc::klass_offset_in_bytes(), src); 3381 // Attention: no null check here! 
3382 decode_klass_not_null(dst, dst); 3383 } else { 3384 ld(dst, oopDesc::klass_offset_in_bytes(), src); 3385 } 3386 } 3387 3388 // ((OopHandle)result).resolve(); 3389 void MacroAssembler::resolve_oop_handle(Register result) { 3390 // OopHandle::resolve is an indirection. 3391 ld(result, 0, result); 3392 } 3393 3394 void MacroAssembler::load_mirror_from_const_method(Register mirror, Register const_method) { 3395 ld(mirror, in_bytes(ConstMethod::constants_offset()), const_method); 3396 ld(mirror, ConstantPool::pool_holder_offset_in_bytes(), mirror); 3397 ld(mirror, in_bytes(Klass::java_mirror_offset()), mirror); 3398 resolve_oop_handle(mirror); 3399 } 3400 3401 // Clear Array 3402 // For very short arrays. tmp == R0 is allowed. 3403 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) { 3404 if (cnt_dwords > 0) { li(tmp, 0); } 3405 for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); } 3406 } 3407 3408 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed. 3409 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) { 3410 if (cnt_dwords < 8) { 3411 clear_memory_unrolled(base_ptr, cnt_dwords, tmp); 3412 return; 3413 } 3414 3415 Label loop; 3416 const long loopcnt = cnt_dwords >> 1, 3417 remainder = cnt_dwords & 1; 3418 3419 li(tmp, loopcnt); 3420 mtctr(tmp); 3421 li(tmp, 0); 3422 bind(loop); 3423 std(tmp, 0, base_ptr); 3424 std(tmp, 8, base_ptr); 3425 addi(base_ptr, base_ptr, 16); 3426 bdnz(loop); 3427 if (remainder) { std(tmp, 0, base_ptr); } 3428 } 3429 3430 // Kills both input registers. tmp == R0 is allowed. 3431 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) { 3432 // Procedure for large arrays (uses data cache block zero instruction). 3433 Label startloop, fast, fastloop, small_rest, restloop, done; 3434 const int cl_size = VM_Version::L1_data_cache_line_size(), 3435 cl_dwords = cl_size >> 3, 3436 cl_dw_addr_bits = exact_log2(cl_dwords), 3437 dcbz_min = 1, // Min count of dcbz executions, needs to be >0. 3438 min_cnt = ((dcbz_min + 1) << cl_dw_addr_bits) - 1; 3439 3440 if (const_cnt >= 0) { 3441 // Constant case. 3442 if (const_cnt < min_cnt) { 3443 clear_memory_constlen(base_ptr, const_cnt, tmp); 3444 return; 3445 } 3446 load_const_optimized(cnt_dwords, const_cnt, tmp); 3447 } else { 3448 // cnt_dwords already loaded in register. Need to check size. 3449 cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included). 3450 blt(CCR1, small_rest); 3451 } 3452 rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line. 3453 beq(CCR0, fast); // Already 128byte aligned. 3454 3455 subfic(tmp, tmp, cl_dwords); 3456 mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords). 3457 subf(cnt_dwords, tmp, cnt_dwords); // rest. 3458 li(tmp, 0); 3459 3460 bind(startloop); // Clear at the beginning to reach 128byte boundary. 3461 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3462 addi(base_ptr, base_ptr, 8); 3463 bdnz(startloop); 3464 3465 bind(fast); // Clear 128byte blocks. 3466 srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0). 3467 andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords. 3468 mtctr(tmp); // Load counter. 3469 3470 bind(fastloop); 3471 dcbz(base_ptr); // Clear 128byte aligned block. 
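// Note: dcbz zeroes one whole, naturally aligned data cache block
// (cl_size bytes) without fetching it from memory first.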
3472 addi(base_ptr, base_ptr, cl_size); 3473 bdnz(fastloop); 3474 3475 bind(small_rest); 3476 cmpdi(CCR0, cnt_dwords, 0); // size 0? 3477 beq(CCR0, done); // rest == 0 3478 li(tmp, 0); 3479 mtctr(cnt_dwords); // Load counter. 3480 3481 bind(restloop); // Clear rest. 3482 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3483 addi(base_ptr, base_ptr, 8); 3484 bdnz(restloop); 3485 3486 bind(done); 3487 } 3488 3489 /////////////////////////////////////////// String intrinsics //////////////////////////////////////////// 3490 3491 #ifdef COMPILER2 3492 // Intrinsics for CompactStrings 3493 3494 // Compress char[] to byte[] by compressing 16 bytes at once. 3495 void MacroAssembler::string_compress_16(Register src, Register dst, Register cnt, 3496 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, 3497 Label& Lfailure) { 3498 3499 const Register tmp0 = R0; 3500 assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5); 3501 Label Lloop, Lslow; 3502 3503 // Check if cnt >= 8 (= 16 bytes) 3504 lis(tmp1, 0xFF); // tmp1 = 0x00FF00FF00FF00FF 3505 srwi_(tmp2, cnt, 3); 3506 beq(CCR0, Lslow); 3507 ori(tmp1, tmp1, 0xFF); 3508 rldimi(tmp1, tmp1, 32, 0); 3509 mtctr(tmp2); 3510 3511 // 2x unrolled loop 3512 bind(Lloop); 3513 ld(tmp2, 0, src); // _0_1_2_3 (Big Endian) 3514 ld(tmp4, 8, src); // _4_5_6_7 3515 3516 orr(tmp0, tmp2, tmp4); 3517 rldicl(tmp3, tmp2, 6*8, 64-24); // _____1_2 3518 rldimi(tmp2, tmp2, 2*8, 2*8); // _0_2_3_3 3519 rldicl(tmp5, tmp4, 6*8, 64-24); // _____5_6 3520 rldimi(tmp4, tmp4, 2*8, 2*8); // _4_6_7_7 3521 3522 andc_(tmp0, tmp0, tmp1); 3523 bne(CCR0, Lfailure); // Not latin1. 3524 addi(src, src, 16); 3525 3526 rlwimi(tmp3, tmp2, 0*8, 24, 31);// _____1_3 3527 srdi(tmp2, tmp2, 3*8); // ____0_2_ 3528 rlwimi(tmp5, tmp4, 0*8, 24, 31);// _____5_7 3529 srdi(tmp4, tmp4, 3*8); // ____4_6_ 3530 3531 orr(tmp2, tmp2, tmp3); // ____0123 3532 orr(tmp4, tmp4, tmp5); // ____4567 3533 3534 stw(tmp2, 0, dst); 3535 stw(tmp4, 4, dst); 3536 addi(dst, dst, 8); 3537 bdnz(Lloop); 3538 3539 bind(Lslow); // Fallback to slow version 3540 } 3541 3542 // Compress char[] to byte[]. cnt must be positive int. 3543 void MacroAssembler::string_compress(Register src, Register dst, Register cnt, Register tmp, Label& Lfailure) { 3544 Label Lloop; 3545 mtctr(cnt); 3546 3547 bind(Lloop); 3548 lhz(tmp, 0, src); 3549 cmplwi(CCR0, tmp, 0xff); 3550 bgt(CCR0, Lfailure); // Not latin1. 3551 addi(src, src, 2); 3552 stb(tmp, 0, dst); 3553 addi(dst, dst, 1); 3554 bdnz(Lloop); 3555 } 3556 3557 // Inflate byte[] to char[] by inflating 16 bytes at once. 
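// Scalar equivalent of the inflate intrinsics below, as an illustrative
// C sketch (descriptive names only, not VM identifiers):
//   for (int i = 0; i < cnt; ++i) {
//     dst[i] = (jchar)(src[i] & 0xFF);  // zero-extend each byte
//   }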
3558 void MacroAssembler::string_inflate_16(Register src, Register dst, Register cnt, 3559 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) { 3560 const Register tmp0 = R0; 3561 assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5); 3562 Label Lloop, Lslow; 3563 3564 // Check if cnt >= 8 3565 srwi_(tmp2, cnt, 3); 3566 beq(CCR0, Lslow); 3567 lis(tmp1, 0xFF); // tmp1 = 0x00FF00FF 3568 ori(tmp1, tmp1, 0xFF); 3569 mtctr(tmp2); 3570 3571 // 2x unrolled loop 3572 bind(Lloop); 3573 lwz(tmp2, 0, src); // ____0123 (Big Endian) 3574 lwz(tmp4, 4, src); // ____4567 3575 addi(src, src, 8); 3576 3577 rldicl(tmp3, tmp2, 7*8, 64-8); // _______2 3578 rlwimi(tmp2, tmp2, 3*8, 16, 23);// ____0113 3579 rldicl(tmp5, tmp4, 7*8, 64-8); // _______6 3580 rlwimi(tmp4, tmp4, 3*8, 16, 23);// ____4557 3581 3582 andc(tmp0, tmp2, tmp1); // ____0_1_ 3583 rlwimi(tmp2, tmp3, 2*8, 0, 23); // _____2_3 3584 andc(tmp3, tmp4, tmp1); // ____4_5_ 3585 rlwimi(tmp4, tmp5, 2*8, 0, 23); // _____6_7 3586 3587 rldimi(tmp2, tmp0, 3*8, 0*8); // _0_1_2_3 3588 rldimi(tmp4, tmp3, 3*8, 0*8); // _4_5_6_7 3589 3590 std(tmp2, 0, dst); 3591 std(tmp4, 8, dst); 3592 addi(dst, dst, 16); 3593 bdnz(Lloop); 3594 3595 bind(Lslow); // Fallback to slow version 3596 } 3597 3598 // Inflate byte[] to char[]. cnt must be positive int. 3599 void MacroAssembler::string_inflate(Register src, Register dst, Register cnt, Register tmp) { 3600 Label Lloop; 3601 mtctr(cnt); 3602 3603 bind(Lloop); 3604 lbz(tmp, 0, src); 3605 addi(src, src, 1); 3606 sth(tmp, 0, dst); 3607 addi(dst, dst, 2); 3608 bdnz(Lloop); 3609 } 3610 3611 void MacroAssembler::string_compare(Register str1, Register str2, 3612 Register cnt1, Register cnt2, 3613 Register tmp1, Register result, int ae) { 3614 const Register tmp0 = R0, 3615 diff = tmp1; 3616 3617 assert_different_registers(str1, str2, cnt1, cnt2, tmp0, tmp1, result); 3618 Label Ldone, Lslow, Lloop, Lreturn_diff; 3619 3620 // Note: Making use of the fact that compareTo(a, b) == -compareTo(b, a) 3621 // we interchange str1 and str2 in the UL case and negate the result. 3622 // Like this, str1 is always latin1 encoded, except for the UU case. 3623 // In addition, we need 0 (or sign which is 0) extend. 3624 3625 if (ae == StrIntrinsicNode::UU) { 3626 srwi(cnt1, cnt1, 1); 3627 } else { 3628 clrldi(cnt1, cnt1, 32); 3629 } 3630 3631 if (ae != StrIntrinsicNode::LL) { 3632 srwi(cnt2, cnt2, 1); 3633 } else { 3634 clrldi(cnt2, cnt2, 32); 3635 } 3636 3637 // See if the lengths are different, and calculate min in cnt1. 3638 // Save diff in case we need it for a tie-breaker. 3639 subf_(diff, cnt2, cnt1); // diff = cnt1 - cnt2 3640 // if (diff > 0) { cnt1 = cnt2; } 3641 if (VM_Version::has_isel()) { 3642 isel(cnt1, CCR0, Assembler::greater, /*invert*/ false, cnt2); 3643 } else { 3644 Label Lskip; 3645 blt(CCR0, Lskip); 3646 mr(cnt1, cnt2); 3647 bind(Lskip); 3648 } 3649 3650 // Rename registers 3651 Register chr1 = result; 3652 Register chr2 = tmp0; 3653 3654 // Compare multiple characters in fast loop (only implemented for same encoding). 3655 int stride1 = 8, stride2 = 8; 3656 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3657 int log2_chars_per_iter = (ae == StrIntrinsicNode::LL) ? 3 : 2; 3658 Label Lfastloop, Lskipfast; 3659 3660 srwi_(tmp0, cnt1, log2_chars_per_iter); 3661 beq(CCR0, Lskipfast); 3662 rldicl(cnt2, cnt1, 0, 64 - log2_chars_per_iter); // Remaining characters. 
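// I.e. cnt2 = cnt1 % chars_per_iter: the rldicl above keeps only the low
// log2_chars_per_iter bits of cnt1.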
3663 li(cnt1, 1 << log2_chars_per_iter); // Initialize for failure case: Rescan characters from current iteration. 3664 mtctr(tmp0); 3665 3666 bind(Lfastloop); 3667 ld(chr1, 0, str1); 3668 ld(chr2, 0, str2); 3669 cmpd(CCR0, chr1, chr2); 3670 bne(CCR0, Lslow); 3671 addi(str1, str1, stride1); 3672 addi(str2, str2, stride2); 3673 bdnz(Lfastloop); 3674 mr(cnt1, cnt2); // Remaining characters. 3675 bind(Lskipfast); 3676 } 3677 3678 // Loop which searches the first difference character by character. 3679 cmpwi(CCR0, cnt1, 0); 3680 beq(CCR0, Lreturn_diff); 3681 bind(Lslow); 3682 mtctr(cnt1); 3683 3684 switch (ae) { 3685 case StrIntrinsicNode::LL: stride1 = 1; stride2 = 1; break; 3686 case StrIntrinsicNode::UL: // fallthru (see comment above) 3687 case StrIntrinsicNode::LU: stride1 = 1; stride2 = 2; break; 3688 case StrIntrinsicNode::UU: stride1 = 2; stride2 = 2; break; 3689 default: ShouldNotReachHere(); break; 3690 } 3691 3692 bind(Lloop); 3693 if (stride1 == 1) { lbz(chr1, 0, str1); } else { lhz(chr1, 0, str1); } 3694 if (stride2 == 1) { lbz(chr2, 0, str2); } else { lhz(chr2, 0, str2); } 3695 subf_(result, chr2, chr1); // result = chr1 - chr2 3696 bne(CCR0, Ldone); 3697 addi(str1, str1, stride1); 3698 addi(str2, str2, stride2); 3699 bdnz(Lloop); 3700 3701 // If strings are equal up to min length, return the length difference. 3702 bind(Lreturn_diff); 3703 mr(result, diff); 3704 3705 // Otherwise, return the difference between the first mismatched chars. 3706 bind(Ldone); 3707 if (ae == StrIntrinsicNode::UL) { 3708 neg(result, result); // Negate result (see note above). 3709 } 3710 } 3711 3712 void MacroAssembler::array_equals(bool is_array_equ, Register ary1, Register ary2, 3713 Register limit, Register tmp1, Register result, bool is_byte) { 3714 const Register tmp0 = R0; 3715 assert_different_registers(ary1, ary2, limit, tmp0, tmp1, result); 3716 Label Ldone, Lskiploop, Lloop, Lfastloop, Lskipfast; 3717 bool limit_needs_shift = false; 3718 3719 if (is_array_equ) { 3720 const int length_offset = arrayOopDesc::length_offset_in_bytes(); 3721 const int base_offset = arrayOopDesc::base_offset_in_bytes(is_byte ? T_BYTE : T_CHAR); 3722 3723 // Return true if the same array. 3724 cmpd(CCR0, ary1, ary2); 3725 beq(CCR0, Lskiploop); 3726 3727 // Return false if one of them is NULL. 3728 cmpdi(CCR0, ary1, 0); 3729 cmpdi(CCR1, ary2, 0); 3730 li(result, 0); 3731 cror(CCR0, Assembler::equal, CCR1, Assembler::equal); 3732 beq(CCR0, Ldone); 3733 3734 // Load the lengths of arrays. 3735 lwz(limit, length_offset, ary1); 3736 lwz(tmp0, length_offset, ary2); 3737 3738 // Return false if the two arrays are not equal length. 3739 cmpw(CCR0, limit, tmp0); 3740 bne(CCR0, Ldone); 3741 3742 // Load array addresses. 3743 addi(ary1, ary1, base_offset); 3744 addi(ary2, ary2, base_offset); 3745 } else { 3746 limit_needs_shift = !is_byte; 3747 li(result, 0); // Assume not equal. 3748 } 3749 3750 // Rename registers 3751 Register chr1 = tmp0; 3752 Register chr2 = tmp1; 3753 3754 // Compare 8 bytes per iteration in fast loop. 3755 const int log2_chars_per_iter = is_byte ? 3 : 2; 3756 3757 srwi_(tmp0, limit, log2_chars_per_iter + (limit_needs_shift ? 1 : 0)); 3758 beq(CCR0, Lskipfast); 3759 mtctr(tmp0); 3760 3761 bind(Lfastloop); 3762 ld(chr1, 0, ary1); 3763 ld(chr2, 0, ary2); 3764 addi(ary1, ary1, 8); 3765 addi(ary2, ary2, 8); 3766 cmpd(CCR0, chr1, chr2); 3767 bne(CCR0, Ldone); 3768 bdnz(Lfastloop); 3769 3770 bind(Lskipfast); 3771 rldicl_(limit, limit, limit_needs_shift ? 
64 - 1 : 0, 64 - log2_chars_per_iter); // Remaining characters. 3772 beq(CCR0, Lskiploop); 3773 mtctr(limit); 3774 3775 // Character by character. 3776 bind(Lloop); 3777 if (is_byte) { 3778 lbz(chr1, 0, ary1); 3779 lbz(chr2, 0, ary2); 3780 addi(ary1, ary1, 1); 3781 addi(ary2, ary2, 1); 3782 } else { 3783 lhz(chr1, 0, ary1); 3784 lhz(chr2, 0, ary2); 3785 addi(ary1, ary1, 2); 3786 addi(ary2, ary2, 2); 3787 } 3788 cmpw(CCR0, chr1, chr2); 3789 bne(CCR0, Ldone); 3790 bdnz(Lloop); 3791 3792 bind(Lskiploop); 3793 li(result, 1); // All characters are equal. 3794 bind(Ldone); 3795 } 3796 3797 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt, 3798 Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval, 3799 Register tmp1, Register tmp2, Register tmp3, Register tmp4, int ae) { 3800 3801 // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite! 3802 Label L_TooShort, L_Found, L_NotFound, L_End; 3803 Register last_addr = haycnt, // Kill haycnt at the beginning. 3804 addr = tmp1, 3805 n_start = tmp2, 3806 ch1 = tmp3, 3807 ch2 = R0; 3808 3809 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 3810 const int h_csize = (ae == StrIntrinsicNode::LL) ? 1 : 2; 3811 const int n_csize = (ae == StrIntrinsicNode::UU) ? 2 : 1; 3812 3813 // ************************************************************************************************** 3814 // Prepare for main loop: optimized for needle count >=2, bail out otherwise. 3815 // ************************************************************************************************** 3816 3817 // Compute last haystack addr to use if no match gets found. 3818 clrldi(haycnt, haycnt, 32); // Ensure positive int is valid as 64 bit value. 3819 addi(addr, haystack, -h_csize); // Accesses use pre-increment. 3820 if (needlecntval == 0) { // variable needlecnt 3821 cmpwi(CCR6, needlecnt, 2); 3822 clrldi(needlecnt, needlecnt, 32); // Ensure positive int is valid as 64 bit value. 3823 blt(CCR6, L_TooShort); // Variable needlecnt: handle short needle separately. 3824 } 3825 3826 if (n_csize == 2) { lwz(n_start, 0, needle); } else { lhz(n_start, 0, needle); } // Load first 2 characters of needle. 3827 3828 if (needlecntval == 0) { // variable needlecnt 3829 subf(ch1, needlecnt, haycnt); // Last character index to compare is haycnt-needlecnt. 3830 addi(needlecnt, needlecnt, -2); // Rest of needle. 3831 } else { // constant needlecnt 3832 guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately"); 3833 assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate"); 3834 addi(ch1, haycnt, -needlecntval); // Last character index to compare is haycnt-needlecnt. 3835 if (needlecntval > 3) { li(needlecnt, needlecntval - 2); } // Rest of needle. 3836 } 3837 3838 if (h_csize == 2) { slwi(ch1, ch1, 1); } // Scale to number of bytes. 3839 3840 if (ae ==StrIntrinsicNode::UL) { 3841 srwi(tmp4, n_start, 1*8); // ___0 3842 rlwimi(n_start, tmp4, 2*8, 0, 23); // _0_1 3843 } 3844 3845 add(last_addr, haystack, ch1); // Point to last address to compare (haystack+2*(haycnt-needlecnt)). 3846 3847 // Main Loop (now we have at least 2 characters). 3848 Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2; 3849 bind(L_OuterLoop); // Search for 1st 2 characters. 3850 Register addr_diff = tmp4; 3851 subf(addr_diff, addr, last_addr); // Difference between already checked address and last address to check. 3852 addi(addr, addr, h_csize); // This is the new address we want to use for comparing. 
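// ch2 = remaining characters / 2: addr_diff counts bytes, so shifting
// right by h_csize (1 or 2) divides by 2 or 4, i.e. by 2 characters in
// either encoding. The record form sets CR0 for the check below.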
3853 srdi_(ch2, addr_diff, h_csize); 3854 beq(CCR0, L_FinalCheck); // 2 characters left? 3855 mtctr(ch2); // num of characters / 2 3856 bind(L_InnerLoop); // Main work horse (2x unrolled search loop) 3857 if (h_csize == 2) { // Load 2 characters of haystack (ignore alignment). 3858 lwz(ch1, 0, addr); 3859 lwz(ch2, 2, addr); 3860 } else { 3861 lhz(ch1, 0, addr); 3862 lhz(ch2, 1, addr); 3863 } 3864 cmpw(CCR0, ch1, n_start); // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop). 3865 cmpw(CCR1, ch2, n_start); 3866 beq(CCR0, L_Comp1); // Did we find the needle start? 3867 beq(CCR1, L_Comp2); 3868 addi(addr, addr, 2 * h_csize); 3869 bdnz(L_InnerLoop); 3870 bind(L_FinalCheck); 3871 andi_(addr_diff, addr_diff, h_csize); // Remaining characters not covered by InnerLoop: (num of characters) & 1. 3872 beq(CCR0, L_NotFound); 3873 if (h_csize == 2) { lwz(ch1, 0, addr); } else { lhz(ch1, 0, addr); } // One position left at which we have to compare. 3874 cmpw(CCR1, ch1, n_start); 3875 beq(CCR1, L_Comp1); 3876 bind(L_NotFound); 3877 li(result, -1); // not found 3878 b(L_End); 3879 3880 // ************************************************************************************************** 3881 // Special Case: unfortunately, the variable needle case can be called with needlecnt<2 3882 // ************************************************************************************************** 3883 if (needlecntval == 0) { // We have to handle these cases separately. 3884 Label L_OneCharLoop; 3885 bind(L_TooShort); 3886 mtctr(haycnt); 3887 if (n_csize == 2) { lhz(n_start, 0, needle); } else { lbz(n_start, 0, needle); } // First character of needle 3888 bind(L_OneCharLoop); 3889 if (h_csize == 2) { lhzu(ch1, 2, addr); } else { lbzu(ch1, 1, addr); } 3890 cmpw(CCR1, ch1, n_start); 3891 beq(CCR1, L_Found); // Did we find the one character needle? 3892 bdnz(L_OneCharLoop); 3893 li(result, -1); // Not found. 3894 b(L_End); 3895 } 3896 3897 // ************************************************************************************************** 3898 // Regular Case Part II: compare rest of needle (first 2 characters have been compared already) 3899 // ************************************************************************************************** 3900 3901 // Compare the rest 3902 bind(L_Comp2); 3903 addi(addr, addr, h_csize); // First comparison has failed, 2nd one hit. 3904 bind(L_Comp1); // Addr points to possible needle start. 3905 if (needlecntval != 2) { // Const needlecnt==2? 3906 if (needlecntval != 3) { 3907 if (needlecntval == 0) { beq(CCR6, L_Found); } // Variable needlecnt==2? 3908 Register n_ind = tmp4, 3909 h_ind = n_ind; 3910 li(n_ind, 2 * n_csize); // First 2 characters are already compared, use index 2. 3911 mtctr(needlecnt); // Decremented by 2, still > 0. 3912 Label L_CompLoop; 3913 bind(L_CompLoop); 3914 if (ae ==StrIntrinsicNode::UL) { 3915 h_ind = ch1; 3916 sldi(h_ind, n_ind, 1); 3917 } 3918 if (n_csize == 2) { lhzx(ch2, needle, n_ind); } else { lbzx(ch2, needle, n_ind); } 3919 if (h_csize == 2) { lhzx(ch1, addr, h_ind); } else { lbzx(ch1, addr, h_ind); } 3920 cmpw(CCR1, ch1, ch2); 3921 bne(CCR1, L_OuterLoop); 3922 addi(n_ind, n_ind, n_csize); 3923 bdnz(L_CompLoop); 3924 } else { // No loop required if there's only one needle character left. 
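// With a constant needlecnt of 3, only the character at index 2 remains.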
3925 if (n_csize == 2) { lhz(ch2, 2 * 2, needle); } else { lbz(ch2, 2 * 1, needle); } 3926 if (h_csize == 2) { lhz(ch1, 2 * 2, addr); } else { lbz(ch1, 2 * 1, addr); } 3927 cmpw(CCR1, ch1, ch2); 3928 bne(CCR1, L_OuterLoop); 3929 } 3930 } 3931 // Return index ... 3932 bind(L_Found); 3933 subf(result, haystack, addr); // relative to haystack, ... 3934 if (h_csize == 2) { srdi(result, result, 1); } // in characters. 3935 bind(L_End); 3936 } // string_indexof 3937 3938 void MacroAssembler::string_indexof_char(Register result, Register haystack, Register haycnt, 3939 Register needle, jchar needleChar, Register tmp1, Register tmp2, bool is_byte) { 3940 assert_different_registers(haystack, haycnt, needle, tmp1, tmp2); 3941 3942 Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_NotFound, L_End; 3943 Register addr = tmp1, 3944 ch1 = tmp2, 3945 ch2 = R0; 3946 3947 const int h_csize = is_byte ? 1 : 2; 3948 3949 //4: 3950 srwi_(tmp2, haycnt, 1); // Shift right by exact_log2(UNROLL_FACTOR). 3951 mr(addr, haystack); 3952 beq(CCR0, L_FinalCheck); 3953 mtctr(tmp2); // Move to count register. 3954 //8: 3955 bind(L_InnerLoop); // Main work horse (2x unrolled search loop). 3956 if (!is_byte) { 3957 lhz(ch1, 0, addr); 3958 lhz(ch2, 2, addr); 3959 } else { 3960 lbz(ch1, 0, addr); 3961 lbz(ch2, 1, addr); 3962 } 3963 (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, (unsigned int)needleChar); 3964 (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, (unsigned int)needleChar); 3965 beq(CCR0, L_Found1); // Did we find the needle? 3966 beq(CCR1, L_Found2); 3967 addi(addr, addr, 2 * h_csize); 3968 bdnz(L_InnerLoop); 3969 //16: 3970 bind(L_FinalCheck); 3971 andi_(R0, haycnt, 1); 3972 beq(CCR0, L_NotFound); 3973 if (!is_byte) { lhz(ch1, 0, addr); } else { lbz(ch1, 0, addr); } // One position left at which we have to compare. 3974 (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, (unsigned int)needleChar); 3975 beq(CCR1, L_Found1); 3976 //21: 3977 bind(L_NotFound); 3978 li(result, -1); // Not found. 3979 b(L_End); 3980 3981 bind(L_Found2); 3982 addi(addr, addr, h_csize); 3983 //24: 3984 bind(L_Found1); // Return index ... 3985 subf(result, haystack, addr); // relative to haystack, ... 3986 if (!is_byte) { srdi(result, result, 1); } // in characters. 3987 bind(L_End); 3988 } // string_indexof_char 3989 3990 3991 void MacroAssembler::has_negatives(Register src, Register cnt, Register result, 3992 Register tmp1, Register tmp2) { 3993 const Register tmp0 = R0; 3994 assert_different_registers(src, result, cnt, tmp0, tmp1, tmp2); 3995 Label Lfastloop, Lslow, Lloop, Lnoneg, Ldone; 3996 3997 // Check if cnt >= 8 (= 16 bytes) 3998 lis(tmp1, (int)(short)0x8080); // tmp1 = 0x8080808080808080 3999 srwi_(tmp2, cnt, 4); 4000 li(result, 1); // Assume there's a negative byte. 4001 beq(CCR0, Lslow); 4002 ori(tmp1, tmp1, 0x8080); 4003 rldimi(tmp1, tmp1, 32, 0); 4004 mtctr(tmp2); 4005 4006 // 2x unrolled loop 4007 bind(Lfastloop); 4008 ld(tmp2, 0, src); 4009 ld(tmp0, 8, src); 4010 4011 orr(tmp0, tmp2, tmp0); 4012 4013 and_(tmp0, tmp0, tmp1); 4014 bne(CCR0, Ldone); // Found negative byte. 4015 addi(src, src, 16); 4016 4017 bdnz(Lfastloop); 4018 4019 bind(Lslow); // Fallback to slow version 4020 rldicl_(tmp0, cnt, 0, 64-4); 4021 beq(CCR0, Lnoneg); 4022 mtctr(tmp0); 4023 bind(Lloop); 4024 lbz(tmp0, 0, src); 4025 addi(src, src, 1); 4026 andi_(tmp0, tmp0, 0x80); 4027 bne(CCR0, Ldone); // Found negative byte. 
4028 bdnz(Lloop); 4029 bind(Lnoneg); 4030 li(result, 0); 4031 4032 bind(Ldone); 4033 } 4034 4035 #endif // Compiler2 4036 4037 // Helpers for Intrinsic Emitters 4038 // 4039 // Revert the byte order of a 32bit value in a register 4040 // src: 0x44556677 4041 // dst: 0x77665544 4042 // Three steps to obtain the result: 4043 // 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word 4044 // into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared. 4045 // This value initializes dst. 4046 // 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost 4047 // byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go. 4048 // This value is mask inserted into dst with a [0..23] mask of 1s. 4049 // 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position. 4050 // This value is mask inserted into dst with a [8..15] mask of 1s. 4051 void MacroAssembler::load_reverse_32(Register dst, Register src) { 4052 assert_different_registers(dst, src); 4053 4054 rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left. 4055 rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone. 4056 rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone. 4057 } 4058 4059 // Calculate the column addresses of the crc32 lookup table into distinct registers. 4060 // This loop-invariant calculation is moved out of the loop body, reducing the loop 4061 // body size from 20 to 16 instructions. 4062 // Returns the offset that was used to calculate the address of column tc3. 4063 // Due to register shortage, setting tc3 may overwrite table. With the return offset 4064 // at hand, the original table address can be easily reconstructed. 
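// Illustrative use (sketch): on return, tc0..tc3 hold the column base
// addresses; since tc3 may alias 'table', the original table address can
// be recovered as table = tc3 - returned_offset.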
4065 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) { 4066 4067 #ifdef VM_LITTLE_ENDIAN 4068 // This is what we implement (the DOLIT4 part): 4069 // ========================================================================= */ 4070 // #define DOLIT4 c ^= *buf4++; \ 4071 // c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \ 4072 // crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24] 4073 // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4 4074 // ========================================================================= */ 4075 const int ix0 = 3*(4*CRC32_COLUMN_SIZE); 4076 const int ix1 = 2*(4*CRC32_COLUMN_SIZE); 4077 const int ix2 = 1*(4*CRC32_COLUMN_SIZE); 4078 const int ix3 = 0*(4*CRC32_COLUMN_SIZE); 4079 #else 4080 // This is what we implement (the DOBIG4 part): 4081 // ========================================================================= 4082 // #define DOBIG4 c ^= *++buf4; \ 4083 // c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \ 4084 // crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24] 4085 // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4 4086 // ========================================================================= 4087 const int ix0 = 4*(4*CRC32_COLUMN_SIZE); 4088 const int ix1 = 5*(4*CRC32_COLUMN_SIZE); 4089 const int ix2 = 6*(4*CRC32_COLUMN_SIZE); 4090 const int ix3 = 7*(4*CRC32_COLUMN_SIZE); 4091 #endif 4092 assert_different_registers(table, tc0, tc1, tc2); 4093 assert(table == tc3, "must be!"); 4094 4095 addi(tc0, table, ix0); 4096 addi(tc1, table, ix1); 4097 addi(tc2, table, ix2); 4098 if (ix3 != 0) addi(tc3, table, ix3); 4099 4100 return ix3; 4101 } 4102 4103 /** 4104 * uint32_t crc; 4105 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8); 4106 */ 4107 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) { 4108 assert_different_registers(crc, table, tmp); 4109 assert_different_registers(val, table); 4110 4111 if (crc == val) { // Must rotate first to use the unmodified value. 4112 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 4113 // As we use a word (4-byte) instruction, we have to adapt the mask bit positions. 4114 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 4115 } else { 4116 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 4117 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 4118 } 4119 lwzx(tmp, table, tmp); 4120 xorr(crc, crc, tmp); 4121 } 4122 4123 /** 4124 * uint32_t crc; 4125 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8); 4126 */ 4127 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) { 4128 fold_byte_crc32(crc, crc, table, tmp); 4129 } 4130 4131 /** 4132 * Emits code to update CRC-32 with a byte value according to constants in table. 4133 * 4134 * @param [in,out]crc Register containing the crc. 4135 * @param [in]val Register containing the byte to fold into the CRC. 4136 * @param [in]table Register containing the table of crc constants. 
4137  *
4138  * uint32_t crc;
4139  * val = crc_table[(val ^ crc) & 0xFF];
4140  * crc = val ^ (crc >> 8);
4141  */
4142 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
4143   BLOCK_COMMENT("update_byte_crc32:");
4144   xorr(val, val, crc);
4145   fold_byte_crc32(crc, val, table, val);
4146 }
4147 
4148 /**
4149  * @param crc register containing existing CRC (32-bit)
4150  * @param buf register pointing to input byte buffer (byte*)
4151  * @param len register containing number of bytes
4152  * @param table register pointing to CRC table
4153  */
4154 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
4155                                            Register data, bool loopAlignment) {
4156   assert_different_registers(crc, buf, len, table, data);
4157 
4158   Label L_mainLoop, L_done;
4159   const int mainLoop_stepping  = 1;
4160   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
4161 
4162   // Process all bytes in a single-byte loop.
4163   clrldi_(len, len, 32); // Enforce 32 bit. Anything to do?
4164   beq(CCR0, L_done);
4165 
4166   mtctr(len);
4167   align(mainLoop_alignment);
4168   BIND(L_mainLoop);
4169     lbz(data, 0, buf);                 // Byte from buffer, zero-extended.
4170     addi(buf, buf, mainLoop_stepping); // Advance buffer position.
4171     update_byte_crc32(crc, data, table);
4172     bdnz(L_mainLoop);                  // Iterate.
4173 
4174   bind(L_done);
4175 }
4176 
4177 /**
4178  * Emits code to update CRC-32 with a 4-byte value according to constants in table
4179  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
4180  */
4181 // A note on the lookup table address(es):
4182 // The lookup table consists of two sets of four columns each.
4183 // The columns {0..3} are used for little-endian machines.
4184 // The columns {4..7} are used for big-endian machines.
4185 // To save the effort of adding the column offset to the table address each time
4186 // a table element is looked up, it is possible to pass the pre-calculated
4187 // column addresses.
4188 // Uses R9..R12 as work registers. Must be saved/restored by caller, if necessary.
4189 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
4190                                         Register t0, Register t1, Register t2, Register t3,
4191                                         Register tc0, Register tc1, Register tc2, Register tc3) {
4192   assert_different_registers(crc, t3);
4193 
4194   // XOR crc with next four bytes of buffer.
4195   lwz(t3, bufDisp, buf);
4196   if (bufInc != 0) {
4197     addi(buf, buf, bufInc);
4198   }
4199   xorr(t3, t3, crc);
4200 
4201   // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
4202   rlwinm(t0, t3, 2,         24-2, 31-2); // ((t1 >>  0) & 0xff) << 2
4203   rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t1 >>  8) & 0xff) << 2
4204   rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t1 >> 16) & 0xff) << 2
4205   rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t1 >> 24) & 0xff) << 2
4206 
4207   // Use the pre-calculated column addresses.
4208   // Load pre-calculated table values.
4209   lwzx(t0, tc0, t0);
4210   lwzx(t1, tc1, t1);
4211   lwzx(t2, tc2, t2);
4212   lwzx(t3, tc3, t3);
4213 
4214   // Calculate new crc from table values.
4215   xorr(t0,  t0, t1);
4216   xorr(t2,  t2, t3);
4217   xorr(crc, t0, t2); // Now crc contains the final checksum value.
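// As an illustrative C sketch (descriptive names only): the above computes
//   crc = tc0[b0] ^ tc1[b1] ^ tc2[b2] ^ tc3[b3]
// where b0..b3 are the four bytes of (crc ^ buf_word) and the rlwinm's
// pre-scaled each byte by 4 to index the 32-bit table columns.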
4218 } 4219 4220 /** 4221 * @param crc register containing existing CRC (32-bit) 4222 * @param buf register pointing to input byte buffer (byte*) 4223 * @param len register containing number of bytes 4224 * @param table register pointing to CRC table 4225 * 4226 * Uses R9..R12 as work register. Must be saved/restored by caller! 4227 */ 4228 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table, 4229 Register t0, Register t1, Register t2, Register t3, 4230 Register tc0, Register tc1, Register tc2, Register tc3, 4231 bool invertCRC) { 4232 assert_different_registers(crc, buf, len, table); 4233 4234 Label L_mainLoop, L_tail; 4235 Register tmp = t0; 4236 Register data = t0; 4237 Register tmp2 = t1; 4238 const int mainLoop_stepping = 8; 4239 const int tailLoop_stepping = 1; 4240 const int log_stepping = exact_log2(mainLoop_stepping); 4241 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32; 4242 const int complexThreshold = 2*mainLoop_stepping; 4243 4244 // Don't test for len <= 0 here. This pathological case should not occur anyway. 4245 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles 4246 // for all well-behaved cases. The situation itself is detected and handled correctly 4247 // within update_byteLoop_crc32. 4248 assert(tailLoop_stepping == 1, "check tailLoop_stepping!"); 4249 4250 BLOCK_COMMENT("kernel_crc32_2word {"); 4251 4252 if (invertCRC) { 4253 nand(crc, crc, crc); // 1s complement of crc 4254 } 4255 4256 // Check for short (<mainLoop_stepping) buffer. 4257 cmpdi(CCR0, len, complexThreshold); 4258 blt(CCR0, L_tail); 4259 4260 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance. 4261 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions. 4262 { 4263 // Align buf addr to mainLoop_stepping boundary. 4264 neg(tmp2, buf); // Calculate # preLoop iterations for alignment. 4265 rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63. 4266 4267 if (complexThreshold > mainLoop_stepping) { 4268 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 4269 } else { 4270 sub(tmp, len, tmp2); // Remaining bytes for main loop. 4271 cmpdi(CCR0, tmp, mainLoop_stepping); 4272 blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing 4273 mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 4274 } 4275 update_byteLoop_crc32(crc, buf, tmp2, table, data, false); 4276 } 4277 4278 srdi(tmp2, len, log_stepping); // #iterations for mainLoop 4279 andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop 4280 mtctr(tmp2); 4281 4282 #ifdef VM_LITTLE_ENDIAN 4283 Register crc_rv = crc; 4284 #else 4285 Register crc_rv = tmp; // Load_reverse needs separate registers to work on. 4286 // Occupies tmp, but frees up crc. 4287 load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data. 4288 tmp = crc; 4289 #endif 4290 4291 int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3); 4292 4293 align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement. 
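// Main loop sketch: two chained one-word updates per iteration
// (mainLoop_stepping == 8 bytes):
//   while (ctr-- > 0) {
//     crc = word_update(crc, buf, 0); // displacement 0, no increment
//     crc = word_update(crc, buf, 4); // displacement 4, then buf += 8
//   }
// word_update is a descriptive name for update_1word_crc32 above.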
4294   BIND(L_mainLoop);
4295     update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4296     update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4297     bdnz(L_mainLoop);
4298 
4299 #ifndef VM_LITTLE_ENDIAN
4300   load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data.
4301   tmp = crc_rv;                 // Tmp uses its original register again.
4302 #endif
4303 
4304   // Restore original table address for tailLoop.
4305   if (reconstructTableOffset != 0) {
4306     addi(table, table, -reconstructTableOffset);
4307   }
4308 
4309   // Process last few (<complexThreshold) bytes of buffer.
4310   BIND(L_tail);
4311   update_byteLoop_crc32(crc, buf, len, table, data, false);
4312 
4313   if (invertCRC) {
4314     nand(crc, crc, crc); // 1s complement of crc
4315   }
4316   BLOCK_COMMENT("} kernel_crc32_2word");
4317 }
4318 
4319 /**
4320  * @param crc register containing existing CRC (32-bit)
4321  * @param buf register pointing to input byte buffer (byte*)
4322  * @param len register containing number of bytes
4323  * @param table register pointing to CRC table
4324  *
4325  * Uses R9..R12 as work registers. Must be saved/restored by caller!
4326  */
4327 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
4328                                         Register t0, Register t1, Register t2, Register t3,
4329                                         Register tc0, Register tc1, Register tc2, Register tc3,
4330                                         bool invertCRC) {
4331   assert_different_registers(crc, buf, len, table);
4332 
4333   Label L_mainLoop, L_tail;
4334   Register tmp = t0;
4335   Register data = t0;
4336   Register tmp2 = t1;
4337   const int mainLoop_stepping  = 4;
4338   const int tailLoop_stepping  = 1;
4339   const int log_stepping       = exact_log2(mainLoop_stepping);
4340   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4341   const int complexThreshold   = 2*mainLoop_stepping;
4342 
4343   // Don't test for len <= 0 here. This pathological case should not occur anyway.
4344   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
4345   // for all well-behaved cases. The situation itself is detected and handled correctly
4346   // within update_byteLoop_crc32.
4347   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4348 
4349   BLOCK_COMMENT("kernel_crc32_1word {");
4350 
4351   if (invertCRC) {
4352     nand(crc, crc, crc); // 1s complement of crc
4353   }
4354 
4355   // Check for short (<mainLoop_stepping) buffer.
4356   cmpdi(CCR0, len, complexThreshold);
4357   blt(CCR0, L_tail);
4358 
4359   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4360   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4361   {
4362     // Align buf addr to mainLoop_stepping boundary.
4363     neg(tmp2, buf); // Calculate # preLoop iterations for alignment.
4364     rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.
4365 
4366     if (complexThreshold > mainLoop_stepping) {
4367       sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4368     } else {
4369       sub(tmp, len, tmp2); // Remaining bytes for main loop.
4370       cmpdi(CCR0, tmp, mainLoop_stepping);
4371       blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing
4372       mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4373     }
4374     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
4375   }
4376 
4377   srdi(tmp2, len, log_stepping); // #iterations for mainLoop
4378   andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop
4379   mtctr(tmp2);
4380 
4381 #ifdef VM_LITTLE_ENDIAN
4382   Register crc_rv = crc;
4383 #else
4384   Register crc_rv = tmp;        // Load_reverse needs separate registers to work on.
4385                                 // Occupies tmp, but frees up crc.
4386   load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data.
4387   tmp = crc;
4388 #endif
4389 
4390   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4391 
4392   align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement.
4393   BIND(L_mainLoop);
4394     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4395     bdnz(L_mainLoop);
4396 
4397 #ifndef VM_LITTLE_ENDIAN
4398   load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data.
4399   tmp = crc_rv;                 // Tmp uses its original register again.
4400 #endif
4401 
4402   // Restore original table address for tailLoop.
4403   if (reconstructTableOffset != 0) {
4404     addi(table, table, -reconstructTableOffset);
4405   }
4406 
4407   // Process last few (<complexThreshold) bytes of buffer.
4408   BIND(L_tail);
4409   update_byteLoop_crc32(crc, buf, len, table, data, false);
4410 
4411   if (invertCRC) {
4412     nand(crc, crc, crc); // 1s complement of crc
4413   }
4414   BLOCK_COMMENT("} kernel_crc32_1word");
4415 }
4416 
4417 /**
4418  * @param crc register containing existing CRC (32-bit)
4419  * @param buf register pointing to input byte buffer (byte*)
4420  * @param len register containing number of bytes
4421  * @param table register pointing to CRC table
4422  *
4423  * Uses R7_ARG5, R8_ARG6 as work registers.
4424  */
4425 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
4426                                         Register t0, Register t1, Register t2, Register t3,
4427                                         bool invertCRC) {
4428   assert_different_registers(crc, buf, len, table);
4429 
4430   Register data = t0; // Holds the current byte to be folded into crc.
4431 
4432   BLOCK_COMMENT("kernel_crc32_1byte {");
4433 
4434   if (invertCRC) {
4435     nand(crc, crc, crc); // 1s complement of crc
4436   }
4437 
4438   // Process all bytes in a single-byte loop.
4439   update_byteLoop_crc32(crc, buf, len, table, data, true);
4440 
4441   if (invertCRC) {
4442     nand(crc, crc, crc); // 1s complement of crc
4443   }
4444   BLOCK_COMMENT("} kernel_crc32_1byte");
4445 }
4446 
4447 /**
4448  * @param crc register containing existing CRC (32-bit)
4449  * @param buf register pointing to input byte buffer (byte*)
4450  * @param len register containing number of bytes
4451  * @param table register pointing to CRC table
4452  * @param constants register pointing to CRC table for 128-bit aligned memory
4453  * @param barretConstants register pointing to table for barrett reduction
4454  * @param t0-t4 temp registers
4455  */
4456 void MacroAssembler::kernel_crc32_1word_vpmsum(Register crc, Register buf, Register len, Register table,
4457                                                Register constants, Register barretConstants,
4458                                                Register t0, Register t1, Register t2, Register t3, Register t4,
4459                                                bool invertCRC) {
4460   assert_different_registers(crc, buf, len, table);
4461 
4462   Label L_alignedHead, L_tail;
4463 
4464   BLOCK_COMMENT("kernel_crc32_1word_vpmsum {");
4465 
4466   // 1. ~c
4467   if (invertCRC) {
4468     nand(crc, crc, crc); // 1s complement of crc
4469   }
4470 
4471   // 2.
use kernel_crc32_1word for short len 4472 clrldi(len, len, 32); 4473 cmpdi(CCR0, len, 512); 4474 blt(CCR0, L_tail); 4475 4476 // 3. calculate from 0 to first aligned address 4477 const int alignment = 16; 4478 Register prealign = t0; 4479 4480 andi_(prealign, buf, alignment - 1); 4481 beq(CCR0, L_alignedHead); 4482 subfic(prealign, prealign, alignment); 4483 4484 subf(len, prealign, len); 4485 update_byteLoop_crc32(crc, buf, prealign, table, t2, false); 4486 4487 // 4. calculate from first aligned address as far as possible 4488 BIND(L_alignedHead); 4489 kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t0, t1, t2, t3, t4); 4490 4491 // 5. remaining bytes 4492 BIND(L_tail); 4493 Register tc0 = t4; 4494 Register tc1 = constants; 4495 Register tc2 = barretConstants; 4496 kernel_crc32_1word(crc, buf, len, table, t0, t1, t2, t3, tc0, tc1, tc2, table, false); 4497 4498 // 6. ~c 4499 if (invertCRC) { 4500 nand(crc, crc, crc); // 1s complement of crc 4501 } 4502 4503 BLOCK_COMMENT("} kernel_crc32_1word_vpmsum"); 4504 } 4505 4506 /** 4507 * @param crc register containing existing CRC (32-bit) 4508 * @param buf register pointing to input byte buffer (byte*) 4509 * @param len register containing number of bytes (will get updated to remaining bytes) 4510 * @param constants register pointing to CRC table for 128-bit aligned memory 4511 * @param barretConstants register pointing to table for barrett reduction 4512 * @param t0-t4 temp registers 4513 * Precondition: len should be >= 512. Otherwise, nothing will be done. 4514 */ 4515 void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len, 4516 Register constants, Register barretConstants, 4517 Register t0, Register t1, Register t2, Register t3, Register t4) { 4518 4519 // Save non-volatile vector registers (frameless). 4520 Register offset = t1; 4521 int offsetInt = 0; 4522 offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP); 4523 offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP); 4524 offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP); 4525 offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP); 4526 offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP); 4527 offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP); 4528 #ifndef VM_LITTLE_ENDIAN 4529 offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP); 4530 #endif 4531 offsetInt -= 8; std(R14, offsetInt, R1_SP); 4532 offsetInt -= 8; std(R15, offsetInt, R1_SP); 4533 offsetInt -= 8; std(R16, offsetInt, R1_SP); 4534 offsetInt -= 8; std(R17, offsetInt, R1_SP); 4535 4536 // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor 4537 // bytes per iteration. The basic scheme is: 4538 // lvx: load vector (Big Endian needs reversal) 4539 // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift 4540 // vxor: xor partial results together to get unroll_factor2 vectors 4541 4542 // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors. 4543 4544 // Using 16 * unroll_factor / unroll_factor_2 bytes for constants. 4545 const int unroll_factor = 2048; 4546 const int unroll_factor2 = 8; 4547 4548 // Support registers. 
4549   Register offs[] = { noreg, t0, t1, t2, t3, t4, crc /* will live in VCRC */, R14 };
4550   Register num_bytes = R15,
4551            loop_count = R16,
4552            cur_const = R17;
4553   // Constant array for outer loop: unroll_factor2 - 1 registers,
4554   // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
4555   VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
4556                  consts1[] = { VR23, VR24 };
4557   // Data register arrays: 2 arrays with unroll_factor2 registers.
4558   VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
4559                  data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
4560 
4561   VectorRegister VCRC = data0[0];
4562   VectorRegister Vc = VR25;
4563   VectorRegister swap_bytes = VR26; // Only for Big Endian.
4564 
4565   // We have at least 1 iteration (ensured by caller).
4566   Label L_outer_loop, L_inner_loop, L_last;
4567 
4568   // If supported, set DSCR pre-fetch to deepest.
4569   if (VM_Version::has_mfdscr()) {
4570     load_const_optimized(t0, VM_Version::_dscr_val | 7);
4571     mtdscr(t0);
4572   }
4573 
4574   mtvrwz(VCRC, crc); // crc lives in VCRC, now
4575 
4576   for (int i = 1; i < unroll_factor2; ++i) {
4577     li(offs[i], 16 * i);
4578   }
4579 
4580   // Load consts for outer loop
4581   lvx(consts0[0], constants);
4582   for (int i = 1; i < unroll_factor2 - 1; ++i) {
4583     lvx(consts0[i], offs[i], constants);
4584   }
4585   addi(constants, constants, (unroll_factor2 - 1) * 16);
4586 
4587   load_const_optimized(num_bytes, 16 * unroll_factor);
4588   load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
4589 
4590   // Reuse data registers outside of the loop.
4591   VectorRegister Vtmp = data1[0];
4592   VectorRegister Vtmp2 = data1[1];
4593   VectorRegister zeroes = data1[2];
4594 
4595   vspltisb(Vtmp, 0);
4596   vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.
4597 
4598   // Load vector for vpermxor (to xor both 64 bit parts together)
4599   lvsl(Vtmp, buf);   // 000102030405060708090a0b0c0d0e0f
4600   vspltisb(Vc, 4);
4601   vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
4602   xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
4603   vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f
4604 
4605 #ifdef VM_LITTLE_ENDIAN
4606 #define BE_swap_bytes(x)
4607 #else
4608   vspltisb(Vtmp2, 0xf);
4609   vxor(swap_bytes, Vtmp, Vtmp2);
4610 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
4611 #endif
4612 
4613   cmpd(CCR0, len, num_bytes);
4614   blt(CCR0, L_last);
4615 
4616   // ********** Main loop start **********
4617   align(32);
4618   bind(L_outer_loop);
4619 
4620   // Begin of unrolled first iteration (no xor).
4621   lvx(data1[0], buf);
4622   mr(cur_const, constants);
4623   for (int i = 1; i < unroll_factor2 / 2; ++i) {
4624     lvx(data1[i], offs[i], buf);
4625   }
4626   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
4627   lvx(consts1[0], cur_const);
4628   mtctr(loop_count);
4629   for (int i = 0; i < unroll_factor2 / 2; ++i) {
4630     BE_swap_bytes(data1[i]);
4631     if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
4632     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
4633     vpmsumw(data0[i], data1[i], consts1[0]);
4634   }
4635   addi(buf, buf, 16 * unroll_factor2);
4636   subf(len, num_bytes, len);
4637   lvx(consts1[1], offs[1], cur_const);
4638   addi(cur_const, cur_const, 32);
4639   // Begin of unrolled second iteration (head).
  for (int i = 0; i < unroll_factor2 / 2; ++i) {
    BE_swap_bytes(data1[i + unroll_factor2 / 2]);
    if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
    vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);
  }
  for (int i = 0; i < unroll_factor2 / 2; ++i) {
    BE_swap_bytes(data1[i]);
    lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
    vpmsumw(data1[i], data1[i], consts1[1]);
  }
  addi(buf, buf, 16 * unroll_factor2);

  // Generate the most performance-relevant code: the loads and half of the
  // vpmsumws have already been emitted. The double-iteration allows using the
  // 2 constant registers alternately.
  align(32);
  bind(L_inner_loop);
  for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.
    if (j & 1) {
      lvx(consts1[0], cur_const);
    } else {
      lvx(consts1[1], offs[1], cur_const);
      addi(cur_const, cur_const, 32);
    }
    for (int i = 0; i < unroll_factor2; ++i) {
      int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.
      if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }
      BE_swap_bytes(data1[idx]);
      vxor(data0[i], data0[i], data1[i]);
      if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
      vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);
    }
    addi(buf, buf, 16 * unroll_factor2);
  }
  bdnz(L_inner_loop);

  // Tail of last iteration (no loads).
  for (int i = 0; i < unroll_factor2 / 2; ++i) {
    BE_swap_bytes(data1[i + unroll_factor2 / 2]);
    vxor(data0[i], data0[i], data1[i]);
    vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
  }
  for (int i = 0; i < unroll_factor2 / 2; ++i) {
    vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.
    vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);
  }

  // The last data register is ok, the other ones need a fixup shift.
  for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {
    vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);
  }

  // Combine to 128 bit result vector VCRC = data0[0].
  for (int i = 1; i < unroll_factor2; i <<= 1) {
    for (int j = 0; j <= unroll_factor2 - 2 * i; j += 2 * i) {
      vxor(data0[j], data0[j], data0[j + i]);
    }
  }
  cmpd(CCR0, len, num_bytes);
  bge(CCR0, L_outer_loop);

  // Last chance with lower num_bytes.
  bind(L_last);
  srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
  add_const_optimized(constants, constants, 16 * (unroll_factor / unroll_factor2)); // Point behind last one.
  sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
  clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
  subf(constants, R0, constants); // Point to constant to be used first.

  addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
  bgt(CCR0, L_outer_loop);
  // ********** Main loop end **********
#undef BE_swap_bytes

  // Restore DSCR pre-fetch value.
  if (VM_Version::has_mfdscr()) {
    load_const_optimized(t0, VM_Version::_dscr_val);
    mtdscr(t0);
  }

  vspltisb(zeroes, 0);

  // Combine to 64 bit result.
  vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
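
  // What the Barrett reduction below computes, in scalar terms (an illustrative
  // sketch, not generated code; P is the CRC-32 polynomial, v the 64 bit folded
  // value, and clmul() a carry-less multiply):
  //
  //   q = high32(clmul(high32(v), floor(x^64 / P)));  // estimate the quotient
  //   r = v ^ clmul(q, P);                            // v mod P, fits into 32 bits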

  // Reduce to 32 bit CRC: Remainder by multiply-high.
  lvx(Vtmp, barretConstants);
  vsldoi(Vtmp2, zeroes, VCRC, 12);  // Extract high 32 bit.
  vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply by inverse long poly.
  vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
  vsldoi(Vtmp, zeroes, Vtmp, 8);
  vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply quotient by long poly.
  vxor(VCRC, VCRC, Vtmp2);          // Remainder fits into 32 bit.

  // Move result. len is already updated.
  vsldoi(VCRC, VCRC, zeroes, 8);
  mfvrd(crc, VCRC);

  // Restore non-volatile Vector registers (frameless).
  offsetInt = 0;
  offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
  offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
  offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
  offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
  offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
  offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
#ifndef VM_LITTLE_ENDIAN
  offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
#endif
  offsetInt -= 8; ld(R14, offsetInt, R1_SP);
  offsetInt -= 8; ld(R15, offsetInt, R1_SP);
  offsetInt -= 8; ld(R16, offsetInt, R1_SP);
  offsetInt -= 8; ld(R17, offsetInt, R1_SP);
}

void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp, bool invertCRC) {
  assert_different_registers(crc, buf, /* len, not used!! */ table, tmp);

  BLOCK_COMMENT("kernel_crc32_singleByte:");
  if (invertCRC) {
    nand(crc, crc, crc); // 1s complement of crc
  }

  lbz(tmp, 0, buf); // Byte from buffer, zero-extended.
  update_byte_crc32(crc, tmp, table);

  if (invertCRC) {
    nand(crc, crc, crc); // 1s complement of crc
  }
}

void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
  assert_different_registers(crc, val, table);

  BLOCK_COMMENT("kernel_crc32_singleByteReg:");
  if (invertCRC) {
    nand(crc, crc, crc); // 1s complement of crc
  }

  update_byte_crc32(crc, val, table);

  if (invertCRC) {
    nand(crc, crc, crc); // 1s complement of crc
  }
}

// dest_lo += src1 + src2
// dest_hi += carry1 + carry2
void MacroAssembler::add2_with_carry(Register dest_hi,
                                     Register dest_lo,
                                     Register src1, Register src2) {
  li(R0, 0);
  addc(dest_lo, dest_lo, src1);
  adde(dest_hi, dest_hi, R0);
  addc(dest_lo, dest_lo, src2);
  adde(dest_hi, dest_hi, R0);
}

// Multiply 64 bit by 64 bit first loop.
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
                                           Register x_xstart,
                                           Register y, Register y_idx,
                                           Register z,
                                           Register carry,
                                           Register product_high, Register product,
                                           Register idx, Register kdx,
                                           Register tmp) {
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  addic_(xstart, xstart, -1);
  blt(CCR0, L_one_x); // Special case: length of x is 1.
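
  // Note: x[] and y[] hold 32-bit digits with the most significant digit at
  // index 0, so two adjacent digits can be consumed per iteration by loading
  // them as one 64-bit word; on little endian the register halves are swapped
  // afterwards (rldicl by 32) to restore the big-endian digit order.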

  // Load next two integers of x.
  sldi(tmp, xstart, LogBytesPerInt);
  ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(x_xstart, x_xstart, 32, 0);
#endif

  align(32, 16);
  bind(L_first_loop);

  cmpdi(CCR0, idx, 1);
  blt(CCR0, L_first_loop_exit);
  addi(idx, idx, -2);
  beq(CCR0, L_one_y);

  // Load next two integers of y.
  sldi(tmp, idx, LogBytesPerInt);
  ldx(y_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(y_idx, y_idx, 32, 0);
#endif

  bind(L_multiply);
  multiply64(product_high, product, x_xstart, y_idx);

  li(tmp, 0);
  addc(product, product, carry);         // Add carry to result.
  adde(product_high, product_high, tmp); // Add carry of the last addition.
  addi(kdx, kdx, -2);

  // Store result.
#ifdef VM_LITTLE_ENDIAN
  rldicl(product, product, 32, 0);
#endif
  sldi(tmp, kdx, LogBytesPerInt);
  stdx(product, z, tmp);
  mr_if_needed(carry, product_high);
  b(L_first_loop);

  bind(L_one_y); // Load one 32 bit portion of y as (0,value).
  lwz(y_idx, 0, y);
  b(L_multiply);

  bind(L_one_x); // Load one 32 bit portion of x as (0,value).
  lwz(x_xstart, 0, x);
  b(L_first_loop);

  bind(L_first_loop_exit);
}

// Multiply 64 bit by 64 bit and add 128 bit.
void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
                                            Register z, Register yz_idx,
                                            Register idx, Register carry,
                                            Register product_high, Register product,
                                            Register tmp, int offset) {
  //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
  //  z[kdx] = (jlong)product;

  sldi(tmp, idx, LogBytesPerInt);
  if (offset) {
    addi(tmp, tmp, offset);
  }
  ldx(yz_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(yz_idx, yz_idx, 32, 0);
#endif

  multiply64(product_high, product, x_xstart, yz_idx);
  ldx(yz_idx, z, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(yz_idx, yz_idx, 32, 0);
#endif

  add2_with_carry(product_high, product, carry, yz_idx);

  sldi(tmp, idx, LogBytesPerInt);
  if (offset) {
    addi(tmp, tmp, offset);
  }
#ifdef VM_LITTLE_ENDIAN
  rldicl(product, product, 32, 0);
#endif
  stdx(product, z, tmp);
}

// Multiply 128 bit by 128 bit. Unrolled inner loop.
void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
                                             Register y, Register z,
                                             Register yz_idx, Register idx, Register carry,
                                             Register product_high, Register product,
                                             Register carry2, Register tmp) {
  //  jlong carry, x[], y[], z[];
  //  int kdx = ystart+1;
  //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
  //    z[kdx+idx+1] = (jlong)product;
  //    jlong carry2  = (jlong)(product >>> 64);
  //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
  //    z[kdx+idx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  idx += 2;
  //  if (idx > 0) {
  //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
  //    z[kdx+idx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
  const Register jdx = R0;

  // Scale the index.
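  // jdx = idx / 4: each iteration of the unrolled loop below consumes four
  // 32-bit digits (two 64-bit multiply-accumulate steps), so the CTR-counted
  // loop runs idx / 4 times; the remaining 0 to 3 digits are handled after
  // L_third_loop_exit.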
  srdi_(jdx, idx, 2);
  beq(CCR0, L_third_loop_exit);
  mtctr(jdx);

  align(32, 16);
  bind(L_third_loop);

  addi(idx, idx, -4);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
  mr_if_needed(carry2, product_high);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
  mr_if_needed(carry, product_high);
  bdnz(L_third_loop);

  bind(L_third_loop_exit); // Handle any left-over operand parts.

  andi_(idx, idx, 0x3);
  beq(CCR0, L_post_third_loop_done);

  Label L_check_1;

  addic_(idx, idx, -2);
  blt(CCR0, L_check_1);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
  mr_if_needed(carry, product_high);

  bind(L_check_1);

  addi(idx, idx, 0x2);
  andi_(idx, idx, 0x1);
  addic_(idx, idx, -1);
  blt(CCR0, L_post_third_loop_done);

  sldi(tmp, idx, LogBytesPerInt);
  lwzx(yz_idx, y, tmp);
  multiply64(product_high, product, x_xstart, yz_idx);
  lwzx(yz_idx, z, tmp);

  add2_with_carry(product_high, product, yz_idx, carry);

  sldi(tmp, idx, LogBytesPerInt);
  stwx(product, z, tmp);
  srdi(product, product, 32);

  sldi(product_high, product_high, 32);
  orr(product, product, product_high);
  mr_if_needed(carry, product);

  bind(L_post_third_loop_done);
} // multiply_128_x_128_loop

// Multiply-add of 32-bit digits (cf. BigInteger.implMulAdd), processed from the
// highest index downwards:
//   carry = 0;
//   for (int j = len - 1; j >= 0; j--) {
//     long t = (in[j] & LONG_MASK) * (k & LONG_MASK) + (out[offset] & LONG_MASK) + carry;
//     out[offset--] = (int)t;
//     carry = t >>> 32;
//   }
// The final carry word is left in 'carry'.
void MacroAssembler::muladd(Register out, Register in,
                            Register offset, Register len, Register k,
                            Register tmp1, Register tmp2, Register carry) {
  // Labels
  Label LOOP, SKIP;

  // Make sure length is positive.
  cmpdi(CCR0, len, 0);

  // Prepare variables.
  subi(offset, offset, 4);
  li(carry, 0);
  ble(CCR0, SKIP);

  mtctr(len);
  subi(len, len, 1);
  sldi(len, len, 2);

  // Main loop.
  bind(LOOP);
  lwzx(tmp1, len, in);
  lwzx(tmp2, offset, out);
  mulld(tmp1, tmp1, k);
  add(tmp2, carry, tmp2);
  add(tmp2, tmp1, tmp2);
  stwx(tmp2, offset, out);
  srdi(carry, tmp2, 32);
  subi(offset, offset, 4);
  subi(len, len, 4);
  bdnz(LOOP);
  bind(SKIP);
}

void MacroAssembler::multiply_to_len(Register x, Register xlen,
                                     Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2,
                                     Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6,
                                     Register tmp7, Register tmp8,
                                     Register tmp9, Register tmp10,
                                     Register tmp11, Register tmp12,
                                     Register tmp13) {

  ShortBranchVerifier sbv(this);

  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product = tmp6;
  const Register product_high = tmp7;
  const Register x_xstart = tmp8;
  const Register tmp = tmp9;
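
  // Note on register discipline for the loops below: tmp10 to tmp13 preserve
  // z, x, xstart and ylen across multiply_128_x_128_loop, which clobbers its
  // index input and uses x as a scratch register (passed in as carry2).
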
  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;

  mr_if_needed(idx, ylen); // idx = ylen
  mr_if_needed(kdx, zlen); // kdx = xlen + ylen
  li(carry, 0);            // carry = 0

  Label L_done;

  addic_(xstart, xlen, -1);
  blt(CCR0, L_done);

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
                        carry, product_high, product, idx, kdx, tmp);

  Label L_second_loop;

  cmpdi(CCR0, kdx, 0);
  beq(CCR0, L_second_loop);

  Label L_carry;

  addic_(kdx, kdx, -1);
  beq(CCR0, L_carry);

  // Store lower 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);
  srdi(carry, carry, 32);
  addi(kdx, kdx, -1);

  bind(L_carry);

  // Store upper 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);

  // Second and third (nested) loops.
  //
  //  for (int i = xstart-1; i >= 0; i--) { // Second loop
  //    carry = 0;
  //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                     (z[k] & LONG_MASK) + carry;
  //      z[k] = (int)product;
  //      carry = product >>> 32;
  //    }
  //    z[i] = (int)carry;
  //  }
  //
  //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx

  bind(L_second_loop);

  li(carry, 0); // carry = 0;

  addic_(xstart, xstart, -1); // i = xstart-1;
  blt(CCR0, L_done);

  Register zsave = tmp10;

  mr(zsave, z);

  Label L_last_x;

  sldi(tmp, xstart, LogBytesPerInt);
  add(z, z, tmp); // z = z + k - j
  addi(z, z, 4);
  addic_(xstart, xstart, -1); // i = xstart-1;
  blt(CCR0, L_last_x);

  sldi(tmp, xstart, LogBytesPerInt);
  ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(x_xstart, x_xstart, 32, 0);
#endif

  Label L_third_loop_prologue;

  bind(L_third_loop_prologue);

  Register xsave = tmp11;
  Register xlensave = tmp12;
  Register ylensave = tmp13;

  mr(xsave, x);
  mr(xlensave, xstart);
  mr(ylensave, ylen);

  multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
                          carry, product_high, product, x, tmp);

  mr(z, zsave);
  mr(x, xsave);
  mr(xlen, xlensave); // This is the decrement of the loop counter!
  mr(ylen, ylensave);

  addi(tmp3, xlen, 1);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);
  addic_(tmp3, tmp3, -1);
  blt(CCR0, L_done);

  srdi(carry, carry, 32);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);
  b(L_second_loop);

  // Next infrequent code is moved outside loops.
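  // Keeping this rarely taken path (a single leftover 32-bit digit of x) out
  // of line keeps the loop bodies above dense; it branches straight back into
  // the main control flow when done.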
  bind(L_last_x);
  lwz(x_xstart, 0, x);
  b(L_third_loop_prologue);

  bind(L_done);
} // multiply_to_len

void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
#ifdef ASSERT
  Label ok;
  if (check_equal) {
    beq(CCR0, ok);
  } else {
    bne(CCR0, ok);
  }
  stop(msg, id);
  bind(ok);
#endif
}

void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
                                          Register mem_base, const char* msg, int id) {
#ifdef ASSERT
  switch (size) {
    case 4:
      lwz(R0, mem_offset, mem_base);
      cmpwi(CCR0, R0, 0);
      break;
    case 8:
      ld(R0, mem_offset, mem_base);
      cmpdi(CCR0, R0, 0);
      break;
    default:
      ShouldNotReachHere();
  }
  asm_assert(check_equal, msg, id);
#endif // ASSERT
}

void MacroAssembler::verify_thread() {
  if (VerifyThread) {
    unimplemented("'VerifyThread' currently not implemented on PPC");
  }
}

// READ: oop. KILL: R0. Volatile floats perhaps.
void MacroAssembler::verify_oop(Register oop, const char* msg) {
  if (!VerifyOops) {
    return;
  }

  address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
  const Register tmp = R11; // Will be preserved.
  const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
  save_volatile_gprs(R1_SP, -nbytes_save); // except R0

  mr_if_needed(R4_ARG2, oop);
  save_LR_CR(tmp); // save in old frame
  push_frame_reg_args(nbytes_save, tmp);
  // load FunctionDescriptor** / entry_address *
  load_const_optimized(tmp, fd, R0);
  // load FunctionDescriptor* / entry_address
  ld(tmp, 0, tmp);
  load_const_optimized(R3_ARG1, (address)msg, R0);
  // Call destination for its side effect.
  call_c(tmp);

  pop_frame();
  restore_LR_CR(tmp);
  restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
}

void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
  if (!VerifyOops) {
    return;
  }

  address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
  const Register tmp = R11; // Will be preserved.
  const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
  save_volatile_gprs(R1_SP, -nbytes_save); // except R0

  ld(R4_ARG2, offs, base);
  save_LR_CR(tmp); // save in old frame
  push_frame_reg_args(nbytes_save, tmp);
  // load FunctionDescriptor** / entry_address *
  load_const_optimized(tmp, fd, R0);
  // load FunctionDescriptor* / entry_address
  ld(tmp, 0, tmp);
  load_const_optimized(R3_ARG1, (address)msg, R0);
  // Call destination for its side effect.
  call_c(tmp);

  pop_frame();
  restore_LR_CR(tmp);
  restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
}

const char* stop_types[] = {
  "stop",
  "untested",
  "unimplemented",
  "shouldnotreachhere"
};

static void stop_on_request(int tp, const char* msg) {
  tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg);
  guarantee(false, "PPC assembly code requires stop: %s", msg);
}

// Call a C-function that prints output.
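// In brief: the generated code loads the stop type and message, calls
// stop_on_request() (which prints and then fails a guarantee), and finally
// emits an illtrap followed by the 32 bit id, so the stop site can still be
// identified from the instruction stream if execution ever reaches the trap.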
void MacroAssembler::stop(int type, const char* msg, int id) {
#ifndef PRODUCT
  block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));
#else
  block_comment("stop {");
#endif

  // setup arguments
  load_const_optimized(R3_ARG1, type);
  load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
  illtrap();
  emit_int32(id);
  block_comment("} stop;");
}

#ifndef PRODUCT
// Write pattern 0x0101010101010101 in memory region [low-before, high+after].
// Val, addr are temp registers.
// If low == addr, addr is killed.
// High is preserved.
void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
  if (!ZapMemory) return;

  assert_different_registers(low, val);

  BLOCK_COMMENT("zap memory region {");
  load_const_optimized(val, 0x0101010101010101);
  int size = before + after;
  if (low == high && size < 5 && size > 0) {
    int offset = -before * BytesPerWord;
    for (int i = 0; i < size; ++i) {
      std(val, offset, low);
      offset += (1 * BytesPerWord);
    }
  } else {
    addi(addr, low, -before * BytesPerWord);
    assert_different_registers(high, val);
    if (after) addi(high, high, after * BytesPerWord);
    Label loop;
    bind(loop);
    std(val, 0, addr);
    addi(addr, addr, 8);
    cmpd(CCR6, addr, high);
    ble(CCR6, loop);
    if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value.
  }
  BLOCK_COMMENT("} zap memory region");
}
#endif // !PRODUCT

void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,
                                                  const bool* flag_addr, Label& label) {
  int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
  assert(sizeof(bool) == 1, "PowerPC ABI");
  masm->lbz(temp, simm16_offset, temp);
  masm->cmpwi(CCR0, temp, 0);
  masm->beq(CCR0, label);
}

SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
  skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);
}

SkipIfEqualZero::~SkipIfEqualZero() {
  _masm->bind(_label);
}