1 /* 2 * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2012, 2019, SAP SE. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.inline.hpp" 28 #include "compiler/disassembler.hpp" 29 #include "gc/shared/collectedHeap.inline.hpp" 30 #include "gc/shared/barrierSet.hpp" 31 #include "gc/shared/barrierSetAssembler.hpp" 32 #include "interpreter/interpreter.hpp" 33 #include "memory/resourceArea.hpp" 34 #include "nativeInst_ppc.hpp" 35 #include "oops/klass.inline.hpp" 36 #include "prims/methodHandles.hpp" 37 #include "runtime/biasedLocking.hpp" 38 #include "runtime/icache.hpp" 39 #include "runtime/interfaceSupport.inline.hpp" 40 #include "runtime/objectMonitor.hpp" 41 #include "runtime/os.hpp" 42 #include "runtime/safepoint.hpp" 43 #include "runtime/safepointMechanism.hpp" 44 #include "runtime/sharedRuntime.hpp" 45 #include "runtime/stubRoutines.hpp" 46 #include "utilities/macros.hpp" 47 #ifdef COMPILER2 48 #include "opto/intrinsicnode.hpp" 49 #endif 50 51 #ifdef PRODUCT 52 #define BLOCK_COMMENT(str) // nothing 53 #else 54 #define BLOCK_COMMENT(str) block_comment(str) 55 #endif 56 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 57 58 #ifdef ASSERT 59 // On RISC, there's no benefit to verifying instruction boundaries. 60 bool AbstractAssembler::pd_check_instruction_mark() { return false; } 61 #endif 62 63 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) { 64 assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range"); 65 if (Assembler::is_simm(si31, 16)) { 66 ld(d, si31, a); 67 if (emit_filler_nop) nop(); 68 } else { 69 const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31); 70 const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31); 71 addis(d, a, hi); 72 ld(d, lo, d); 73 } 74 } 75 76 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) { 77 assert_different_registers(d, a); 78 ld_largeoffset_unchecked(d, si31, a, emit_filler_nop); 79 } 80 81 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base, 82 size_t size_in_bytes, bool is_signed) { 83 switch (size_in_bytes) { 84 case 8: ld(dst, offs, base); break; 85 case 4: is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break; 86 case 2: is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break; 87 case 1: lbz(dst, offs, base); if (is_signed) extsb(dst, dst); break; // lba doesn't exist :( 88 default: ShouldNotReachHere(); 89 } 90 } 91 92 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base, 93 size_t size_in_bytes) { 94 switch (size_in_bytes) { 95 case 8: std(dst, offs, base); break; 96 case 4: stw(dst, offs, base); break; 97 case 2: sth(dst, offs, base); break; 98 case 1: stb(dst, offs, base); break; 99 default: ShouldNotReachHere(); 100 } 101 } 102 103 void MacroAssembler::align(int modulus, int max, int rem) { 104 int padding = (rem + modulus - (offset() % modulus)) % modulus; 105 if (padding > max) return; 106 for (int c = (padding >> 2); c > 0; --c) { nop(); } 107 } 108 109 // Issue instructions that calculate given TOC from global TOC. 110 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16, 111 bool add_relocation, bool emit_dummy_addr) { 112 int offset = -1; 113 if (emit_dummy_addr) { 114 offset = -128; // dummy address 115 } else if (addr != (address)(intptr_t)-1) { 116 offset = MacroAssembler::offset_to_global_toc(addr); 117 } 118 119 if (hi16) { 120 addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset)); 121 } 122 if (lo16) { 123 if (add_relocation) { 124 // Relocate at the addi to avoid confusion with a load from the method's TOC. 125 relocate(internal_word_Relocation::spec(addr)); 126 } 127 addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset)); 128 } 129 } 130 131 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) { 132 const int offset = MacroAssembler::offset_to_global_toc(addr); 133 134 const address inst2_addr = a; 135 const int inst2 = *(int *)inst2_addr; 136 137 // The relocation points to the second instruction, the addi, 138 // and the addi reads and writes the same register dst. 139 const int dst = inv_rt_field(inst2); 140 assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst"); 141 142 // Now, find the preceding addis which writes to dst. 143 int inst1 = 0; 144 address inst1_addr = inst2_addr - BytesPerInstWord; 145 while (inst1_addr >= bound) { 146 inst1 = *(int *) inst1_addr; 147 if (is_addis(inst1) && inv_rt_field(inst1) == dst) { 148 // Stop, found the addis which writes dst. 149 break; 150 } 151 inst1_addr -= BytesPerInstWord; 152 } 153 154 assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC"); 155 set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset)); 156 set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset)); 157 return inst1_addr; 158 } 159 160 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) { 161 const address inst2_addr = a; 162 const int inst2 = *(int *)inst2_addr; 163 164 // The relocation points to the second instruction, the addi, 165 // and the addi reads and writes the same register dst. 166 const int dst = inv_rt_field(inst2); 167 assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst"); 168 169 // Now, find the preceding addis which writes to dst. 170 int inst1 = 0; 171 address inst1_addr = inst2_addr - BytesPerInstWord; 172 while (inst1_addr >= bound) { 173 inst1 = *(int *) inst1_addr; 174 if (is_addis(inst1) && inv_rt_field(inst1) == dst) { 175 // stop, found the addis which writes dst 176 break; 177 } 178 inst1_addr -= BytesPerInstWord; 179 } 180 181 assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC"); 182 183 int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0); 184 // -1 is a special case 185 if (offset == -1) { 186 return (address)(intptr_t)-1; 187 } else { 188 return global_toc() + offset; 189 } 190 } 191 192 #ifdef _LP64 193 // Patch compressed oops or klass constants. 194 // Assembler sequence is 195 // 1) compressed oops: 196 // lis rx = const.hi 197 // ori rx = rx | const.lo 198 // 2) compressed klass: 199 // lis rx = const.hi 200 // clrldi rx = rx & 0xFFFFffff // clearMS32b, optional 201 // ori rx = rx | const.lo 202 // Clrldi will be passed by. 203 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) { 204 assert(UseCompressedOops, "Should only patch compressed oops"); 205 206 const address inst2_addr = a; 207 const int inst2 = *(int *)inst2_addr; 208 209 // The relocation points to the second instruction, the ori, 210 // and the ori reads and writes the same register dst. 211 const int dst = inv_rta_field(inst2); 212 assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst"); 213 // Now, find the preceding addis which writes to dst. 214 int inst1 = 0; 215 address inst1_addr = inst2_addr - BytesPerInstWord; 216 bool inst1_found = false; 217 while (inst1_addr >= bound) { 218 inst1 = *(int *)inst1_addr; 219 if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; } 220 inst1_addr -= BytesPerInstWord; 221 } 222 assert(inst1_found, "inst is not lis"); 223 224 int xc = (data >> 16) & 0xffff; 225 int xd = (data >> 0) & 0xffff; 226 227 set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo 228 set_imm((int *)inst2_addr, (xd)); // unsigned int 229 return inst1_addr; 230 } 231 232 // Get compressed oop or klass constant. 233 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) { 234 assert(UseCompressedOops, "Should only patch compressed oops"); 235 236 const address inst2_addr = a; 237 const int inst2 = *(int *)inst2_addr; 238 239 // The relocation points to the second instruction, the ori, 240 // and the ori reads and writes the same register dst. 241 const int dst = inv_rta_field(inst2); 242 assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst"); 243 // Now, find the preceding lis which writes to dst. 244 int inst1 = 0; 245 address inst1_addr = inst2_addr - BytesPerInstWord; 246 bool inst1_found = false; 247 248 while (inst1_addr >= bound) { 249 inst1 = *(int *) inst1_addr; 250 if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;} 251 inst1_addr -= BytesPerInstWord; 252 } 253 assert(inst1_found, "inst is not lis"); 254 255 uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff)); 256 uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16); 257 258 return (int) (xl | xh); 259 } 260 #endif // _LP64 261 262 // Returns true if successful. 263 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a, 264 Register toc, bool fixed_size) { 265 int toc_offset = 0; 266 // Use RelocationHolder::none for the constant pool entry, otherwise 267 // we will end up with a failing NativeCall::verify(x) where x is 268 // the address of the constant pool entry. 269 // FIXME: We should insert relocation information for oops at the constant 270 // pool entries instead of inserting it at the loads; patching of a constant 271 // pool entry should be less expensive. 272 address const_address = address_constant((address)a.value(), RelocationHolder::none); 273 if (const_address == NULL) { return false; } // allocation failure 274 // Relocate at the pc of the load. 275 relocate(a.rspec()); 276 toc_offset = (int)(const_address - code()->consts()->start()); 277 ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size); 278 return true; 279 } 280 281 bool MacroAssembler::is_load_const_from_method_toc_at(address a) { 282 const address inst1_addr = a; 283 const int inst1 = *(int *)inst1_addr; 284 285 // The relocation points to the ld or the addis. 286 return (is_ld(inst1)) || 287 (is_addis(inst1) && inv_ra_field(inst1) != 0); 288 } 289 290 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) { 291 assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc"); 292 293 const address inst1_addr = a; 294 const int inst1 = *(int *)inst1_addr; 295 296 if (is_ld(inst1)) { 297 return inv_d1_field(inst1); 298 } else if (is_addis(inst1)) { 299 const int dst = inv_rt_field(inst1); 300 301 // Now, find the succeeding ld which reads and writes to dst. 302 address inst2_addr = inst1_addr + BytesPerInstWord; 303 int inst2 = 0; 304 while (true) { 305 inst2 = *(int *) inst2_addr; 306 if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) { 307 // Stop, found the ld which reads and writes dst. 308 break; 309 } 310 inst2_addr += BytesPerInstWord; 311 } 312 return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2); 313 } 314 ShouldNotReachHere(); 315 return 0; 316 } 317 318 // Get the constant from a `load_const' sequence. 319 long MacroAssembler::get_const(address a) { 320 assert(is_load_const_at(a), "not a load of a constant"); 321 const int *p = (const int*) a; 322 unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48); 323 if (is_ori(*(p+1))) { 324 x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32); 325 x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16); 326 x |= (((unsigned long) (get_imm(a,4) & 0xffff))); 327 } else if (is_lis(*(p+1))) { 328 x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32); 329 x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16); 330 x |= (((unsigned long) (get_imm(a,3) & 0xffff))); 331 } else { 332 ShouldNotReachHere(); 333 return (long) 0; 334 } 335 return (long) x; 336 } 337 338 // Patch the 64 bit constant of a `load_const' sequence. This is a low 339 // level procedure. It neither flushes the instruction cache nor is it 340 // mt safe. 341 void MacroAssembler::patch_const(address a, long x) { 342 assert(is_load_const_at(a), "not a load of a constant"); 343 int *p = (int*) a; 344 if (is_ori(*(p+1))) { 345 set_imm(0 + p, (x >> 48) & 0xffff); 346 set_imm(1 + p, (x >> 32) & 0xffff); 347 set_imm(3 + p, (x >> 16) & 0xffff); 348 set_imm(4 + p, x & 0xffff); 349 } else if (is_lis(*(p+1))) { 350 set_imm(0 + p, (x >> 48) & 0xffff); 351 set_imm(2 + p, (x >> 32) & 0xffff); 352 set_imm(1 + p, (x >> 16) & 0xffff); 353 set_imm(3 + p, x & 0xffff); 354 } else { 355 ShouldNotReachHere(); 356 } 357 } 358 359 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) { 360 assert(oop_recorder() != NULL, "this assembler needs a Recorder"); 361 int index = oop_recorder()->allocate_metadata_index(obj); 362 RelocationHolder rspec = metadata_Relocation::spec(index); 363 return AddressLiteral((address)obj, rspec); 364 } 365 366 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) { 367 assert(oop_recorder() != NULL, "this assembler needs a Recorder"); 368 int index = oop_recorder()->find_index(obj); 369 RelocationHolder rspec = metadata_Relocation::spec(index); 370 return AddressLiteral((address)obj, rspec); 371 } 372 373 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) { 374 assert(oop_recorder() != NULL, "this assembler needs an OopRecorder"); 375 int oop_index = oop_recorder()->allocate_oop_index(obj); 376 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index)); 377 } 378 379 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) { 380 assert(oop_recorder() != NULL, "this assembler needs an OopRecorder"); 381 int oop_index = oop_recorder()->find_index(obj); 382 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index)); 383 } 384 385 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr, 386 Register tmp, int offset) { 387 intptr_t value = *delayed_value_addr; 388 if (value != 0) { 389 return RegisterOrConstant(value + offset); 390 } 391 392 // Load indirectly to solve generation ordering problem. 393 // static address, no relocation 394 int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true); 395 ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0) 396 397 if (offset != 0) { 398 addi(tmp, tmp, offset); 399 } 400 401 return RegisterOrConstant(tmp); 402 } 403 404 #ifndef PRODUCT 405 void MacroAssembler::pd_print_patched_instruction(address branch) { 406 Unimplemented(); // TODO: PPC port 407 } 408 #endif // ndef PRODUCT 409 410 // Conditional far branch for destinations encodable in 24+2 bits. 411 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) { 412 413 // If requested by flag optimize, relocate the bc_far as a 414 // runtime_call and prepare for optimizing it when the code gets 415 // relocated. 416 if (optimize == bc_far_optimize_on_relocate) { 417 relocate(relocInfo::runtime_call_type); 418 } 419 420 // variant 2: 421 // 422 // b!cxx SKIP 423 // bxx DEST 424 // SKIP: 425 // 426 427 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)), 428 opposite_bcond(inv_boint_bcond(boint))); 429 430 // We emit two branches. 431 // First, a conditional branch which jumps around the far branch. 432 const address not_taken_pc = pc() + 2 * BytesPerInstWord; 433 const address bc_pc = pc(); 434 bc(opposite_boint, biint, not_taken_pc); 435 436 const int bc_instr = *(int*)bc_pc; 437 assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition"); 438 assert(opposite_boint == inv_bo_field(bc_instr), "postcondition"); 439 assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))), 440 opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))), 441 "postcondition"); 442 assert(biint == inv_bi_field(bc_instr), "postcondition"); 443 444 // Second, an unconditional far branch which jumps to dest. 445 // Note: target(dest) remembers the current pc (see CodeSection::target) 446 // and returns the current pc if the label is not bound yet; when 447 // the label gets bound, the unconditional far branch will be patched. 448 const address target_pc = target(dest); 449 const address b_pc = pc(); 450 b(target_pc); 451 452 assert(not_taken_pc == pc(), "postcondition"); 453 assert(dest.is_bound() || target_pc == b_pc, "postcondition"); 454 } 455 456 // 1 or 2 instructions 457 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) { 458 if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) { 459 bc(boint, biint, dest); 460 } else { 461 bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate); 462 } 463 } 464 465 bool MacroAssembler::is_bc_far_at(address instruction_addr) { 466 return is_bc_far_variant1_at(instruction_addr) || 467 is_bc_far_variant2_at(instruction_addr) || 468 is_bc_far_variant3_at(instruction_addr); 469 } 470 471 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) { 472 if (is_bc_far_variant1_at(instruction_addr)) { 473 const address instruction_1_addr = instruction_addr; 474 const int instruction_1 = *(int*)instruction_1_addr; 475 return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr); 476 } else if (is_bc_far_variant2_at(instruction_addr)) { 477 const address instruction_2_addr = instruction_addr + 4; 478 return bxx_destination(instruction_2_addr); 479 } else if (is_bc_far_variant3_at(instruction_addr)) { 480 return instruction_addr + 8; 481 } 482 // variant 4 ??? 483 ShouldNotReachHere(); 484 return NULL; 485 } 486 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) { 487 488 if (is_bc_far_variant3_at(instruction_addr)) { 489 // variant 3, far cond branch to the next instruction, already patched to nops: 490 // 491 // nop 492 // endgroup 493 // SKIP/DEST: 494 // 495 return; 496 } 497 498 // first, extract boint and biint from the current branch 499 int boint = 0; 500 int biint = 0; 501 502 ResourceMark rm; 503 const int code_size = 2 * BytesPerInstWord; 504 CodeBuffer buf(instruction_addr, code_size); 505 MacroAssembler masm(&buf); 506 if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) { 507 // Far branch to next instruction: Optimize it by patching nops (produce variant 3). 508 masm.nop(); 509 masm.endgroup(); 510 } else { 511 if (is_bc_far_variant1_at(instruction_addr)) { 512 // variant 1, the 1st instruction contains the destination address: 513 // 514 // bcxx DEST 515 // nop 516 // 517 const int instruction_1 = *(int*)(instruction_addr); 518 boint = inv_bo_field(instruction_1); 519 biint = inv_bi_field(instruction_1); 520 } else if (is_bc_far_variant2_at(instruction_addr)) { 521 // variant 2, the 2nd instruction contains the destination address: 522 // 523 // b!cxx SKIP 524 // bxx DEST 525 // SKIP: 526 // 527 const int instruction_1 = *(int*)(instruction_addr); 528 boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))), 529 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1)))); 530 biint = inv_bi_field(instruction_1); 531 } else { 532 // variant 4??? 533 ShouldNotReachHere(); 534 } 535 536 // second, set the new branch destination and optimize the code 537 if (dest != instruction_addr + 4 && // the bc_far is still unbound! 538 masm.is_within_range_of_bcxx(dest, instruction_addr)) { 539 // variant 1: 540 // 541 // bcxx DEST 542 // nop 543 // 544 masm.bc(boint, biint, dest); 545 masm.nop(); 546 } else { 547 // variant 2: 548 // 549 // b!cxx SKIP 550 // bxx DEST 551 // SKIP: 552 // 553 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)), 554 opposite_bcond(inv_boint_bcond(boint))); 555 const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord; 556 masm.bc(opposite_boint, biint, not_taken_pc); 557 masm.b(dest); 558 } 559 } 560 ICache::ppc64_flush_icache_bytes(instruction_addr, code_size); 561 } 562 563 // Emit a NOT mt-safe patchable 64 bit absolute call/jump. 564 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) { 565 // get current pc 566 uint64_t start_pc = (uint64_t) pc(); 567 568 const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last 569 const address pc_of_b = (address) (start_pc + (0*BytesPerInstWord)); // b is first 570 571 // relocate here 572 if (rt != relocInfo::none) { 573 relocate(rt); 574 } 575 576 if ( ReoptimizeCallSequences && 577 (( link && is_within_range_of_b(dest, pc_of_bl)) || 578 (!link && is_within_range_of_b(dest, pc_of_b)))) { 579 // variant 2: 580 // Emit an optimized, pc-relative call/jump. 581 582 if (link) { 583 // some padding 584 nop(); 585 nop(); 586 nop(); 587 nop(); 588 nop(); 589 nop(); 590 591 // do the call 592 assert(pc() == pc_of_bl, "just checking"); 593 bl(dest, relocInfo::none); 594 } else { 595 // do the jump 596 assert(pc() == pc_of_b, "just checking"); 597 b(dest, relocInfo::none); 598 599 // some padding 600 nop(); 601 nop(); 602 nop(); 603 nop(); 604 nop(); 605 nop(); 606 } 607 608 // Assert that we can identify the emitted call/jump. 609 assert(is_bxx64_patchable_variant2_at((address)start_pc, link), 610 "can't identify emitted call"); 611 } else { 612 // variant 1: 613 mr(R0, R11); // spill R11 -> R0. 614 615 // Load the destination address into CTR, 616 // calculate destination relative to global toc. 617 calculate_address_from_global_toc(R11, dest, true, true, false); 618 619 mtctr(R11); 620 mr(R11, R0); // spill R11 <- R0. 621 nop(); 622 623 // do the call/jump 624 if (link) { 625 bctrl(); 626 } else{ 627 bctr(); 628 } 629 // Assert that we can identify the emitted call/jump. 630 assert(is_bxx64_patchable_variant1b_at((address)start_pc, link), 631 "can't identify emitted call"); 632 } 633 634 // Assert that we can identify the emitted call/jump. 635 assert(is_bxx64_patchable_at((address)start_pc, link), 636 "can't identify emitted call"); 637 assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest, 638 "wrong encoding of dest address"); 639 } 640 641 // Identify a bxx64_patchable instruction. 642 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) { 643 return is_bxx64_patchable_variant1b_at(instruction_addr, link) 644 //|| is_bxx64_patchable_variant1_at(instruction_addr, link) 645 || is_bxx64_patchable_variant2_at(instruction_addr, link); 646 } 647 648 // Does the call64_patchable instruction use a pc-relative encoding of 649 // the call destination? 650 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) { 651 // variant 2 is pc-relative 652 return is_bxx64_patchable_variant2_at(instruction_addr, link); 653 } 654 655 // Identify variant 1. 656 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) { 657 unsigned int* instr = (unsigned int*) instruction_addr; 658 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l] 659 && is_mtctr(instr[5]) // mtctr 660 && is_load_const_at(instruction_addr); 661 } 662 663 // Identify variant 1b: load destination relative to global toc. 664 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) { 665 unsigned int* instr = (unsigned int*) instruction_addr; 666 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l] 667 && is_mtctr(instr[3]) // mtctr 668 && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr); 669 } 670 671 // Identify variant 2. 672 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) { 673 unsigned int* instr = (unsigned int*) instruction_addr; 674 if (link) { 675 return is_bl (instr[6]) // bl dest is last 676 && is_nop(instr[0]) // nop 677 && is_nop(instr[1]) // nop 678 && is_nop(instr[2]) // nop 679 && is_nop(instr[3]) // nop 680 && is_nop(instr[4]) // nop 681 && is_nop(instr[5]); // nop 682 } else { 683 return is_b (instr[0]) // b dest is first 684 && is_nop(instr[1]) // nop 685 && is_nop(instr[2]) // nop 686 && is_nop(instr[3]) // nop 687 && is_nop(instr[4]) // nop 688 && is_nop(instr[5]) // nop 689 && is_nop(instr[6]); // nop 690 } 691 } 692 693 // Set dest address of a bxx64_patchable instruction. 694 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) { 695 ResourceMark rm; 696 int code_size = MacroAssembler::bxx64_patchable_size; 697 CodeBuffer buf(instruction_addr, code_size); 698 MacroAssembler masm(&buf); 699 masm.bxx64_patchable(dest, relocInfo::none, link); 700 ICache::ppc64_flush_icache_bytes(instruction_addr, code_size); 701 } 702 703 // Get dest address of a bxx64_patchable instruction. 704 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) { 705 if (is_bxx64_patchable_variant1_at(instruction_addr, link)) { 706 return (address) (unsigned long) get_const(instruction_addr); 707 } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) { 708 unsigned int* instr = (unsigned int*) instruction_addr; 709 if (link) { 710 const int instr_idx = 6; // bl is last 711 int branchoffset = branch_destination(instr[instr_idx], 0); 712 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord; 713 } else { 714 const int instr_idx = 0; // b is first 715 int branchoffset = branch_destination(instr[instr_idx], 0); 716 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord; 717 } 718 // Load dest relative to global toc. 719 } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) { 720 return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, 721 instruction_addr); 722 } else { 723 ShouldNotReachHere(); 724 return NULL; 725 } 726 } 727 728 // Uses ordering which corresponds to ABI: 729 // _savegpr0_14: std r14,-144(r1) 730 // _savegpr0_15: std r15,-136(r1) 731 // _savegpr0_16: std r16,-128(r1) 732 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) { 733 std(R14, offset, dst); offset += 8; 734 std(R15, offset, dst); offset += 8; 735 std(R16, offset, dst); offset += 8; 736 std(R17, offset, dst); offset += 8; 737 std(R18, offset, dst); offset += 8; 738 std(R19, offset, dst); offset += 8; 739 std(R20, offset, dst); offset += 8; 740 std(R21, offset, dst); offset += 8; 741 std(R22, offset, dst); offset += 8; 742 std(R23, offset, dst); offset += 8; 743 std(R24, offset, dst); offset += 8; 744 std(R25, offset, dst); offset += 8; 745 std(R26, offset, dst); offset += 8; 746 std(R27, offset, dst); offset += 8; 747 std(R28, offset, dst); offset += 8; 748 std(R29, offset, dst); offset += 8; 749 std(R30, offset, dst); offset += 8; 750 std(R31, offset, dst); offset += 8; 751 752 stfd(F14, offset, dst); offset += 8; 753 stfd(F15, offset, dst); offset += 8; 754 stfd(F16, offset, dst); offset += 8; 755 stfd(F17, offset, dst); offset += 8; 756 stfd(F18, offset, dst); offset += 8; 757 stfd(F19, offset, dst); offset += 8; 758 stfd(F20, offset, dst); offset += 8; 759 stfd(F21, offset, dst); offset += 8; 760 stfd(F22, offset, dst); offset += 8; 761 stfd(F23, offset, dst); offset += 8; 762 stfd(F24, offset, dst); offset += 8; 763 stfd(F25, offset, dst); offset += 8; 764 stfd(F26, offset, dst); offset += 8; 765 stfd(F27, offset, dst); offset += 8; 766 stfd(F28, offset, dst); offset += 8; 767 stfd(F29, offset, dst); offset += 8; 768 stfd(F30, offset, dst); offset += 8; 769 stfd(F31, offset, dst); 770 } 771 772 // Uses ordering which corresponds to ABI: 773 // _restgpr0_14: ld r14,-144(r1) 774 // _restgpr0_15: ld r15,-136(r1) 775 // _restgpr0_16: ld r16,-128(r1) 776 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) { 777 ld(R14, offset, src); offset += 8; 778 ld(R15, offset, src); offset += 8; 779 ld(R16, offset, src); offset += 8; 780 ld(R17, offset, src); offset += 8; 781 ld(R18, offset, src); offset += 8; 782 ld(R19, offset, src); offset += 8; 783 ld(R20, offset, src); offset += 8; 784 ld(R21, offset, src); offset += 8; 785 ld(R22, offset, src); offset += 8; 786 ld(R23, offset, src); offset += 8; 787 ld(R24, offset, src); offset += 8; 788 ld(R25, offset, src); offset += 8; 789 ld(R26, offset, src); offset += 8; 790 ld(R27, offset, src); offset += 8; 791 ld(R28, offset, src); offset += 8; 792 ld(R29, offset, src); offset += 8; 793 ld(R30, offset, src); offset += 8; 794 ld(R31, offset, src); offset += 8; 795 796 // FP registers 797 lfd(F14, offset, src); offset += 8; 798 lfd(F15, offset, src); offset += 8; 799 lfd(F16, offset, src); offset += 8; 800 lfd(F17, offset, src); offset += 8; 801 lfd(F18, offset, src); offset += 8; 802 lfd(F19, offset, src); offset += 8; 803 lfd(F20, offset, src); offset += 8; 804 lfd(F21, offset, src); offset += 8; 805 lfd(F22, offset, src); offset += 8; 806 lfd(F23, offset, src); offset += 8; 807 lfd(F24, offset, src); offset += 8; 808 lfd(F25, offset, src); offset += 8; 809 lfd(F26, offset, src); offset += 8; 810 lfd(F27, offset, src); offset += 8; 811 lfd(F28, offset, src); offset += 8; 812 lfd(F29, offset, src); offset += 8; 813 lfd(F30, offset, src); offset += 8; 814 lfd(F31, offset, src); 815 } 816 817 // For verify_oops. 818 void MacroAssembler::save_volatile_gprs(Register dst, int offset) { 819 std(R2, offset, dst); offset += 8; 820 std(R3, offset, dst); offset += 8; 821 std(R4, offset, dst); offset += 8; 822 std(R5, offset, dst); offset += 8; 823 std(R6, offset, dst); offset += 8; 824 std(R7, offset, dst); offset += 8; 825 std(R8, offset, dst); offset += 8; 826 std(R9, offset, dst); offset += 8; 827 std(R10, offset, dst); offset += 8; 828 std(R11, offset, dst); offset += 8; 829 std(R12, offset, dst); offset += 8; 830 831 stfd(F0, offset, dst); offset += 8; 832 stfd(F1, offset, dst); offset += 8; 833 stfd(F2, offset, dst); offset += 8; 834 stfd(F3, offset, dst); offset += 8; 835 stfd(F4, offset, dst); offset += 8; 836 stfd(F5, offset, dst); offset += 8; 837 stfd(F6, offset, dst); offset += 8; 838 stfd(F7, offset, dst); offset += 8; 839 stfd(F8, offset, dst); offset += 8; 840 stfd(F9, offset, dst); offset += 8; 841 stfd(F10, offset, dst); offset += 8; 842 stfd(F11, offset, dst); offset += 8; 843 stfd(F12, offset, dst); offset += 8; 844 stfd(F13, offset, dst); 845 } 846 847 // For verify_oops. 848 void MacroAssembler::restore_volatile_gprs(Register src, int offset) { 849 ld(R2, offset, src); offset += 8; 850 ld(R3, offset, src); offset += 8; 851 ld(R4, offset, src); offset += 8; 852 ld(R5, offset, src); offset += 8; 853 ld(R6, offset, src); offset += 8; 854 ld(R7, offset, src); offset += 8; 855 ld(R8, offset, src); offset += 8; 856 ld(R9, offset, src); offset += 8; 857 ld(R10, offset, src); offset += 8; 858 ld(R11, offset, src); offset += 8; 859 ld(R12, offset, src); offset += 8; 860 861 lfd(F0, offset, src); offset += 8; 862 lfd(F1, offset, src); offset += 8; 863 lfd(F2, offset, src); offset += 8; 864 lfd(F3, offset, src); offset += 8; 865 lfd(F4, offset, src); offset += 8; 866 lfd(F5, offset, src); offset += 8; 867 lfd(F6, offset, src); offset += 8; 868 lfd(F7, offset, src); offset += 8; 869 lfd(F8, offset, src); offset += 8; 870 lfd(F9, offset, src); offset += 8; 871 lfd(F10, offset, src); offset += 8; 872 lfd(F11, offset, src); offset += 8; 873 lfd(F12, offset, src); offset += 8; 874 lfd(F13, offset, src); 875 } 876 877 void MacroAssembler::save_LR_CR(Register tmp) { 878 mfcr(tmp); 879 std(tmp, _abi(cr), R1_SP); 880 mflr(tmp); 881 std(tmp, _abi(lr), R1_SP); 882 // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad) 883 } 884 885 void MacroAssembler::restore_LR_CR(Register tmp) { 886 assert(tmp != R1_SP, "must be distinct"); 887 ld(tmp, _abi(lr), R1_SP); 888 mtlr(tmp); 889 ld(tmp, _abi(cr), R1_SP); 890 mtcr(tmp); 891 } 892 893 address MacroAssembler::get_PC_trash_LR(Register result) { 894 Label L; 895 bl(L); 896 bind(L); 897 address lr_pc = pc(); 898 mflr(result); 899 return lr_pc; 900 } 901 902 void MacroAssembler::resize_frame(Register offset, Register tmp) { 903 #ifdef ASSERT 904 assert_different_registers(offset, tmp, R1_SP); 905 andi_(tmp, offset, frame::alignment_in_bytes-1); 906 asm_assert_eq("resize_frame: unaligned", 0x204); 907 #endif 908 909 // tmp <- *(SP) 910 ld(tmp, _abi(callers_sp), R1_SP); 911 // addr <- SP + offset; 912 // *(addr) <- tmp; 913 // SP <- addr 914 stdux(tmp, R1_SP, offset); 915 } 916 917 void MacroAssembler::resize_frame(int offset, Register tmp) { 918 assert(is_simm(offset, 16), "too big an offset"); 919 assert_different_registers(tmp, R1_SP); 920 assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned"); 921 // tmp <- *(SP) 922 ld(tmp, _abi(callers_sp), R1_SP); 923 // addr <- SP + offset; 924 // *(addr) <- tmp; 925 // SP <- addr 926 stdu(tmp, offset, R1_SP); 927 } 928 929 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) { 930 // (addr == tmp1) || (addr == tmp2) is allowed here! 931 assert(tmp1 != tmp2, "must be distinct"); 932 933 // compute offset w.r.t. current stack pointer 934 // tmp_1 <- addr - SP (!) 935 subf(tmp1, R1_SP, addr); 936 937 // atomically update SP keeping back link. 938 resize_frame(tmp1/* offset */, tmp2/* tmp */); 939 } 940 941 void MacroAssembler::push_frame(Register bytes, Register tmp) { 942 #ifdef ASSERT 943 assert(bytes != R0, "r0 not allowed here"); 944 andi_(R0, bytes, frame::alignment_in_bytes-1); 945 asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203); 946 #endif 947 neg(tmp, bytes); 948 stdux(R1_SP, R1_SP, tmp); 949 } 950 951 // Push a frame of size `bytes'. 952 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) { 953 long offset = align_addr(bytes, frame::alignment_in_bytes); 954 if (is_simm(-offset, 16)) { 955 stdu(R1_SP, -offset, R1_SP); 956 } else { 957 load_const_optimized(tmp, -offset); 958 stdux(R1_SP, R1_SP, tmp); 959 } 960 } 961 962 // Push a frame of size `bytes' plus abi_reg_args on top. 963 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) { 964 push_frame(bytes + frame::abi_reg_args_size, tmp); 965 } 966 967 // Setup up a new C frame with a spill area for non-volatile GPRs and 968 // additional space for local variables. 969 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes, 970 Register tmp) { 971 push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp); 972 } 973 974 // Pop current C frame. 975 void MacroAssembler::pop_frame() { 976 ld(R1_SP, _abi(callers_sp), R1_SP); 977 } 978 979 #if defined(ABI_ELFv2) 980 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) { 981 // TODO(asmundak): make sure the caller uses R12 as function descriptor 982 // most of the times. 983 if (R12 != r_function_entry) { 984 mr(R12, r_function_entry); 985 } 986 mtctr(R12); 987 // Do a call or a branch. 988 if (and_link) { 989 bctrl(); 990 } else { 991 bctr(); 992 } 993 _last_calls_return_pc = pc(); 994 995 return _last_calls_return_pc; 996 } 997 998 // Call a C function via a function descriptor and use full C 999 // calling conventions. Updates and returns _last_calls_return_pc. 1000 address MacroAssembler::call_c(Register r_function_entry) { 1001 return branch_to(r_function_entry, /*and_link=*/true); 1002 } 1003 1004 // For tail calls: only branch, don't link, so callee returns to caller of this function. 1005 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) { 1006 return branch_to(r_function_entry, /*and_link=*/false); 1007 } 1008 1009 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) { 1010 load_const(R12, function_entry, R0); 1011 return branch_to(R12, /*and_link=*/true); 1012 } 1013 1014 #else 1015 // Generic version of a call to C function via a function descriptor 1016 // with variable support for C calling conventions (TOC, ENV, etc.). 1017 // Updates and returns _last_calls_return_pc. 1018 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call, 1019 bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) { 1020 // we emit standard ptrgl glue code here 1021 assert((function_descriptor != R0), "function_descriptor cannot be R0"); 1022 1023 // retrieve necessary entries from the function descriptor 1024 ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor); 1025 mtctr(R0); 1026 1027 if (load_toc_of_callee) { 1028 ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor); 1029 } 1030 if (load_env_of_callee) { 1031 ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor); 1032 } else if (load_toc_of_callee) { 1033 li(R11, 0); 1034 } 1035 1036 // do a call or a branch 1037 if (and_link) { 1038 bctrl(); 1039 } else { 1040 bctr(); 1041 } 1042 _last_calls_return_pc = pc(); 1043 1044 return _last_calls_return_pc; 1045 } 1046 1047 // Call a C function via a function descriptor and use full C calling 1048 // conventions. 1049 // We don't use the TOC in generated code, so there is no need to save 1050 // and restore its value. 1051 address MacroAssembler::call_c(Register fd) { 1052 return branch_to(fd, /*and_link=*/true, 1053 /*save toc=*/false, 1054 /*restore toc=*/false, 1055 /*load toc=*/true, 1056 /*load env=*/true); 1057 } 1058 1059 address MacroAssembler::call_c_and_return_to_caller(Register fd) { 1060 return branch_to(fd, /*and_link=*/false, 1061 /*save toc=*/false, 1062 /*restore toc=*/false, 1063 /*load toc=*/true, 1064 /*load env=*/true); 1065 } 1066 1067 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) { 1068 if (rt != relocInfo::none) { 1069 // this call needs to be relocatable 1070 if (!ReoptimizeCallSequences 1071 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none) 1072 || fd == NULL // support code-size estimation 1073 || !fd->is_friend_function() 1074 || fd->entry() == NULL) { 1075 // it's not a friend function as defined by class FunctionDescriptor, 1076 // so do a full call-c here. 1077 load_const(R11, (address)fd, R0); 1078 1079 bool has_env = (fd != NULL && fd->env() != NULL); 1080 return branch_to(R11, /*and_link=*/true, 1081 /*save toc=*/false, 1082 /*restore toc=*/false, 1083 /*load toc=*/true, 1084 /*load env=*/has_env); 1085 } else { 1086 // It's a friend function. Load the entry point and don't care about 1087 // toc and env. Use an optimizable call instruction, but ensure the 1088 // same code-size as in the case of a non-friend function. 1089 nop(); 1090 nop(); 1091 nop(); 1092 bl64_patchable(fd->entry(), rt); 1093 _last_calls_return_pc = pc(); 1094 return _last_calls_return_pc; 1095 } 1096 } else { 1097 // This call does not need to be relocatable, do more aggressive 1098 // optimizations. 1099 if (!ReoptimizeCallSequences 1100 || !fd->is_friend_function()) { 1101 // It's not a friend function as defined by class FunctionDescriptor, 1102 // so do a full call-c here. 1103 load_const(R11, (address)fd, R0); 1104 return branch_to(R11, /*and_link=*/true, 1105 /*save toc=*/false, 1106 /*restore toc=*/false, 1107 /*load toc=*/true, 1108 /*load env=*/true); 1109 } else { 1110 // it's a friend function, load the entry point and don't care about 1111 // toc and env. 1112 address dest = fd->entry(); 1113 if (is_within_range_of_b(dest, pc())) { 1114 bl(dest); 1115 } else { 1116 bl64_patchable(dest, rt); 1117 } 1118 _last_calls_return_pc = pc(); 1119 return _last_calls_return_pc; 1120 } 1121 } 1122 } 1123 1124 // Call a C function. All constants needed reside in TOC. 1125 // 1126 // Read the address to call from the TOC. 1127 // Read env from TOC, if fd specifies an env. 1128 // Read new TOC from TOC. 1129 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd, 1130 relocInfo::relocType rt, Register toc) { 1131 if (!ReoptimizeCallSequences 1132 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none) 1133 || !fd->is_friend_function()) { 1134 // It's not a friend function as defined by class FunctionDescriptor, 1135 // so do a full call-c here. 1136 assert(fd->entry() != NULL, "function must be linked"); 1137 1138 AddressLiteral fd_entry(fd->entry()); 1139 bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true); 1140 mtctr(R11); 1141 if (fd->env() == NULL) { 1142 li(R11, 0); 1143 nop(); 1144 } else { 1145 AddressLiteral fd_env(fd->env()); 1146 success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true); 1147 } 1148 AddressLiteral fd_toc(fd->toc()); 1149 // Set R2_TOC (load from toc) 1150 success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true); 1151 bctrl(); 1152 _last_calls_return_pc = pc(); 1153 if (!success) { return NULL; } 1154 } else { 1155 // It's a friend function, load the entry point and don't care about 1156 // toc and env. Use an optimizable call instruction, but ensure the 1157 // same code-size as in the case of a non-friend function. 1158 nop(); 1159 bl64_patchable(fd->entry(), rt); 1160 _last_calls_return_pc = pc(); 1161 } 1162 return _last_calls_return_pc; 1163 } 1164 #endif // ABI_ELFv2 1165 1166 void MacroAssembler::call_VM_base(Register oop_result, 1167 Register last_java_sp, 1168 address entry_point, 1169 bool check_exceptions) { 1170 BLOCK_COMMENT("call_VM {"); 1171 // Determine last_java_sp register. 1172 if (!last_java_sp->is_valid()) { 1173 last_java_sp = R1_SP; 1174 } 1175 set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1); 1176 1177 // ARG1 must hold thread address. 1178 mr(R3_ARG1, R16_thread); 1179 #if defined(ABI_ELFv2) 1180 address return_pc = call_c(entry_point, relocInfo::none); 1181 #else 1182 address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none); 1183 #endif 1184 1185 reset_last_Java_frame(); 1186 1187 // Check for pending exceptions. 1188 if (check_exceptions) { 1189 // We don't check for exceptions here. 1190 ShouldNotReachHere(); 1191 } 1192 1193 // Get oop result if there is one and reset the value in the thread. 1194 if (oop_result->is_valid()) { 1195 get_vm_result(oop_result); 1196 } 1197 1198 _last_calls_return_pc = return_pc; 1199 BLOCK_COMMENT("} call_VM"); 1200 } 1201 1202 void MacroAssembler::call_VM_leaf_base(address entry_point) { 1203 BLOCK_COMMENT("call_VM_leaf {"); 1204 #if defined(ABI_ELFv2) 1205 call_c(entry_point, relocInfo::none); 1206 #else 1207 call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none); 1208 #endif 1209 BLOCK_COMMENT("} call_VM_leaf"); 1210 } 1211 1212 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) { 1213 call_VM_base(oop_result, noreg, entry_point, check_exceptions); 1214 } 1215 1216 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, 1217 bool check_exceptions) { 1218 // R3_ARG1 is reserved for the thread. 1219 mr_if_needed(R4_ARG2, arg_1); 1220 call_VM(oop_result, entry_point, check_exceptions); 1221 } 1222 1223 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, 1224 bool check_exceptions) { 1225 // R3_ARG1 is reserved for the thread 1226 mr_if_needed(R4_ARG2, arg_1); 1227 assert(arg_2 != R4_ARG2, "smashed argument"); 1228 mr_if_needed(R5_ARG3, arg_2); 1229 call_VM(oop_result, entry_point, check_exceptions); 1230 } 1231 1232 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3, 1233 bool check_exceptions) { 1234 // R3_ARG1 is reserved for the thread 1235 mr_if_needed(R4_ARG2, arg_1); 1236 assert(arg_2 != R4_ARG2, "smashed argument"); 1237 mr_if_needed(R5_ARG3, arg_2); 1238 mr_if_needed(R6_ARG4, arg_3); 1239 call_VM(oop_result, entry_point, check_exceptions); 1240 } 1241 1242 void MacroAssembler::call_VM_leaf(address entry_point) { 1243 call_VM_leaf_base(entry_point); 1244 } 1245 1246 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) { 1247 mr_if_needed(R3_ARG1, arg_1); 1248 call_VM_leaf(entry_point); 1249 } 1250 1251 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) { 1252 mr_if_needed(R3_ARG1, arg_1); 1253 assert(arg_2 != R3_ARG1, "smashed argument"); 1254 mr_if_needed(R4_ARG2, arg_2); 1255 call_VM_leaf(entry_point); 1256 } 1257 1258 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) { 1259 mr_if_needed(R3_ARG1, arg_1); 1260 assert(arg_2 != R3_ARG1, "smashed argument"); 1261 mr_if_needed(R4_ARG2, arg_2); 1262 assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument"); 1263 mr_if_needed(R5_ARG3, arg_3); 1264 call_VM_leaf(entry_point); 1265 } 1266 1267 // Check whether instruction is a read access to the polling page 1268 // which was emitted by load_from_polling_page(..). 1269 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext, 1270 address* polling_address_ptr) { 1271 if (!is_ld(instruction)) 1272 return false; // It's not a ld. Fail. 1273 1274 int rt = inv_rt_field(instruction); 1275 int ra = inv_ra_field(instruction); 1276 int ds = inv_ds_field(instruction); 1277 if (!(ds == 0 && ra != 0 && rt == 0)) { 1278 return false; // It's not a ld(r0, X, ra). Fail. 1279 } 1280 1281 if (!ucontext) { 1282 // Set polling address. 1283 if (polling_address_ptr != NULL) { 1284 *polling_address_ptr = NULL; 1285 } 1286 return true; // No ucontext given. Can't check value of ra. Assume true. 1287 } 1288 1289 #ifdef LINUX 1290 // Ucontext given. Check that register ra contains the address of 1291 // the safepoing polling page. 1292 ucontext_t* uc = (ucontext_t*) ucontext; 1293 // Set polling address. 1294 address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds; 1295 if (polling_address_ptr != NULL) { 1296 *polling_address_ptr = addr; 1297 } 1298 return os::is_poll_address(addr); 1299 #else 1300 // Not on Linux, ucontext must be NULL. 1301 ShouldNotReachHere(); 1302 return false; 1303 #endif 1304 } 1305 1306 void MacroAssembler::bang_stack_with_offset(int offset) { 1307 // When increasing the stack, the old stack pointer will be written 1308 // to the new top of stack according to the PPC64 abi. 1309 // Therefore, stack banging is not necessary when increasing 1310 // the stack by <= os::vm_page_size() bytes. 1311 // When increasing the stack by a larger amount, this method is 1312 // called repeatedly to bang the intermediate pages. 1313 1314 // Stack grows down, caller passes positive offset. 1315 assert(offset > 0, "must bang with positive offset"); 1316 1317 long stdoffset = -offset; 1318 1319 if (is_simm(stdoffset, 16)) { 1320 // Signed 16 bit offset, a simple std is ok. 1321 if (UseLoadInstructionsForStackBangingPPC64) { 1322 ld(R0, (int)(signed short)stdoffset, R1_SP); 1323 } else { 1324 std(R0,(int)(signed short)stdoffset, R1_SP); 1325 } 1326 } else if (is_simm(stdoffset, 31)) { 1327 const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset); 1328 const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset); 1329 1330 Register tmp = R11; 1331 addis(tmp, R1_SP, hi); 1332 if (UseLoadInstructionsForStackBangingPPC64) { 1333 ld(R0, lo, tmp); 1334 } else { 1335 std(R0, lo, tmp); 1336 } 1337 } else { 1338 ShouldNotReachHere(); 1339 } 1340 } 1341 1342 // If instruction is a stack bang of the form 1343 // std R0, x(Ry), (see bang_stack_with_offset()) 1344 // stdu R1_SP, x(R1_SP), (see push_frame(), resize_frame()) 1345 // or stdux R1_SP, Rx, R1_SP (see push_frame(), resize_frame()) 1346 // return the banged address. Otherwise, return 0. 1347 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) { 1348 #ifdef LINUX 1349 ucontext_t* uc = (ucontext_t*) ucontext; 1350 int rs = inv_rs_field(instruction); 1351 int ra = inv_ra_field(instruction); 1352 if ( (is_ld(instruction) && rs == 0 && UseLoadInstructionsForStackBangingPPC64) 1353 || (is_std(instruction) && rs == 0 && !UseLoadInstructionsForStackBangingPPC64) 1354 || (is_stdu(instruction) && rs == 1)) { 1355 int ds = inv_ds_field(instruction); 1356 // return banged address 1357 return ds+(address)uc->uc_mcontext.regs->gpr[ra]; 1358 } else if (is_stdux(instruction) && rs == 1) { 1359 int rb = inv_rb_field(instruction); 1360 address sp = (address)uc->uc_mcontext.regs->gpr[1]; 1361 long rb_val = (long)uc->uc_mcontext.regs->gpr[rb]; 1362 return ra != 1 || rb_val >= 0 ? NULL // not a stack bang 1363 : sp + rb_val; // banged address 1364 } 1365 return NULL; // not a stack bang 1366 #else 1367 // workaround not needed on !LINUX :-) 1368 ShouldNotCallThis(); 1369 return NULL; 1370 #endif 1371 } 1372 1373 void MacroAssembler::reserved_stack_check(Register return_pc) { 1374 // Test if reserved zone needs to be enabled. 1375 Label no_reserved_zone_enabling; 1376 1377 ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread); 1378 cmpld(CCR0, R1_SP, R0); 1379 blt_predict_taken(CCR0, no_reserved_zone_enabling); 1380 1381 // Enable reserved zone again, throw stack overflow exception. 1382 push_frame_reg_args(0, R0); 1383 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread); 1384 pop_frame(); 1385 mtlr(return_pc); 1386 load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry()); 1387 mtctr(R0); 1388 bctr(); 1389 1390 should_not_reach_here(); 1391 1392 bind(no_reserved_zone_enabling); 1393 } 1394 1395 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base, 1396 bool cmpxchgx_hint) { 1397 Label retry; 1398 bind(retry); 1399 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1400 stdcx_(exchange_value, addr_base); 1401 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1402 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1403 } else { 1404 bne( CCR0, retry); // StXcx_ sets CCR0. 1405 } 1406 } 1407 1408 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base, 1409 Register tmp, bool cmpxchgx_hint) { 1410 Label retry; 1411 bind(retry); 1412 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1413 add(tmp, dest_current_value, inc_value); 1414 stdcx_(tmp, addr_base); 1415 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1416 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1417 } else { 1418 bne( CCR0, retry); // StXcx_ sets CCR0. 1419 } 1420 } 1421 1422 // Word/sub-word atomic helper functions 1423 1424 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions. 1425 // Only signed types are supported with size < 4. 1426 // Atomic add always kills tmp1. 1427 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value, 1428 Register addr_base, Register tmp1, Register tmp2, Register tmp3, 1429 bool cmpxchgx_hint, bool is_add, int size) { 1430 // Sub-word instructions are available since Power 8. 1431 // For older processors, instruction_type != size holds, and we 1432 // emulate the sub-word instructions by constructing a 4-byte value 1433 // that leaves the other bytes unchanged. 1434 const int instruction_type = VM_Version::has_lqarx() ? size : 4; 1435 1436 Label retry; 1437 Register shift_amount = noreg, 1438 val32 = dest_current_value, 1439 modval = is_add ? tmp1 : exchange_value; 1440 1441 if (instruction_type != size) { 1442 assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base); 1443 modval = tmp1; 1444 shift_amount = tmp2; 1445 val32 = tmp3; 1446 // Need some preperation: Compute shift amount, align address. Note: shorts must be 2 byte aligned. 1447 #ifdef VM_LITTLE_ENDIAN 1448 rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8; 1449 clrrdi(addr_base, addr_base, 2); 1450 #else 1451 xori(shift_amount, addr_base, (size == 1) ? 3 : 2); 1452 clrrdi(addr_base, addr_base, 2); 1453 rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16; 1454 #endif 1455 } 1456 1457 // atomic emulation loop 1458 bind(retry); 1459 1460 switch (instruction_type) { 1461 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break; 1462 case 2: lharx(val32, addr_base, cmpxchgx_hint); break; 1463 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break; 1464 default: ShouldNotReachHere(); 1465 } 1466 1467 if (instruction_type != size) { 1468 srw(dest_current_value, val32, shift_amount); 1469 } 1470 1471 if (is_add) { add(modval, dest_current_value, exchange_value); } 1472 1473 if (instruction_type != size) { 1474 // Transform exchange value such that the replacement can be done by one xor instruction. 1475 xorr(modval, dest_current_value, is_add ? modval : exchange_value); 1476 clrldi(modval, modval, (size == 1) ? 56 : 48); 1477 slw(modval, modval, shift_amount); 1478 xorr(modval, val32, modval); 1479 } 1480 1481 switch (instruction_type) { 1482 case 4: stwcx_(modval, addr_base); break; 1483 case 2: sthcx_(modval, addr_base); break; 1484 case 1: stbcx_(modval, addr_base); break; 1485 default: ShouldNotReachHere(); 1486 } 1487 1488 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1489 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1490 } else { 1491 bne( CCR0, retry); // StXcx_ sets CCR0. 1492 } 1493 1494 // l?arx zero-extends, but Java wants byte/short values sign-extended. 1495 if (size == 1) { 1496 extsb(dest_current_value, dest_current_value); 1497 } else if (size == 2) { 1498 extsh(dest_current_value, dest_current_value); 1499 }; 1500 } 1501 1502 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions. 1503 // Only signed types are supported with size < 4. 1504 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value, 1505 Register compare_value, Register exchange_value, 1506 Register addr_base, Register tmp1, Register tmp2, 1507 Label &retry, Label &failed, bool cmpxchgx_hint, int size) { 1508 // Sub-word instructions are available since Power 8. 1509 // For older processors, instruction_type != size holds, and we 1510 // emulate the sub-word instructions by constructing a 4-byte value 1511 // that leaves the other bytes unchanged. 1512 const int instruction_type = VM_Version::has_lqarx() ? size : 4; 1513 1514 Register shift_amount = noreg, 1515 val32 = dest_current_value, 1516 modval = exchange_value; 1517 1518 if (instruction_type != size) { 1519 assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base); 1520 shift_amount = tmp1; 1521 val32 = tmp2; 1522 modval = tmp2; 1523 // Need some preperation: Compute shift amount, align address. Note: shorts must be 2 byte aligned. 1524 #ifdef VM_LITTLE_ENDIAN 1525 rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8; 1526 clrrdi(addr_base, addr_base, 2); 1527 #else 1528 xori(shift_amount, addr_base, (size == 1) ? 3 : 2); 1529 clrrdi(addr_base, addr_base, 2); 1530 rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16; 1531 #endif 1532 // Transform exchange value such that the replacement can be done by one xor instruction. 1533 xorr(exchange_value, compare_value, exchange_value); 1534 clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48); 1535 slw(exchange_value, exchange_value, shift_amount); 1536 } 1537 1538 // atomic emulation loop 1539 bind(retry); 1540 1541 switch (instruction_type) { 1542 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break; 1543 case 2: lharx(val32, addr_base, cmpxchgx_hint); break; 1544 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break; 1545 default: ShouldNotReachHere(); 1546 } 1547 1548 if (instruction_type != size) { 1549 srw(dest_current_value, val32, shift_amount); 1550 } 1551 if (size == 1) { 1552 extsb(dest_current_value, dest_current_value); 1553 } else if (size == 2) { 1554 extsh(dest_current_value, dest_current_value); 1555 }; 1556 1557 cmpw(flag, dest_current_value, compare_value); 1558 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1559 bne_predict_not_taken(flag, failed); 1560 } else { 1561 bne( flag, failed); 1562 } 1563 // branch to done => (flag == ne), (dest_current_value != compare_value) 1564 // fall through => (flag == eq), (dest_current_value == compare_value) 1565 1566 if (instruction_type != size) { 1567 xorr(modval, val32, exchange_value); 1568 } 1569 1570 switch (instruction_type) { 1571 case 4: stwcx_(modval, addr_base); break; 1572 case 2: sthcx_(modval, addr_base); break; 1573 case 1: stbcx_(modval, addr_base); break; 1574 default: ShouldNotReachHere(); 1575 } 1576 } 1577 1578 // CmpxchgX sets condition register to cmpX(current, compare). 1579 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value, 1580 Register compare_value, Register exchange_value, 1581 Register addr_base, Register tmp1, Register tmp2, 1582 int semantics, bool cmpxchgx_hint, 1583 Register int_flag_success, bool contention_hint, bool weak, int size) { 1584 Label retry; 1585 Label failed; 1586 Label done; 1587 1588 // Save one branch if result is returned via register and 1589 // result register is different from the other ones. 1590 bool use_result_reg = (int_flag_success != noreg); 1591 bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value && 1592 int_flag_success != exchange_value && int_flag_success != addr_base && 1593 int_flag_success != tmp1 && int_flag_success != tmp2); 1594 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1595 assert(size == 1 || size == 2 || size == 4, "unsupported"); 1596 1597 if (use_result_reg && preset_result_reg) { 1598 li(int_flag_success, 0); // preset (assume cas failed) 1599 } 1600 1601 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1602 if (contention_hint) { // Don't try to reserve if cmp fails. 1603 switch (size) { 1604 case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break; 1605 case 2: lha(dest_current_value, 0, addr_base); break; 1606 case 4: lwz(dest_current_value, 0, addr_base); break; 1607 default: ShouldNotReachHere(); 1608 } 1609 cmpw(flag, dest_current_value, compare_value); 1610 bne(flag, failed); 1611 } 1612 1613 // release/fence semantics 1614 if (semantics & MemBarRel) { 1615 release(); 1616 } 1617 1618 cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2, 1619 retry, failed, cmpxchgx_hint, size); 1620 if (!weak || use_result_reg) { 1621 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1622 bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 1623 } else { 1624 bne( CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 1625 } 1626 } 1627 // fall through => (flag == eq), (dest_current_value == compare_value), (swapped) 1628 1629 // Result in register (must do this at the end because int_flag_success can be the 1630 // same register as one above). 1631 if (use_result_reg) { 1632 li(int_flag_success, 1); 1633 } 1634 1635 if (semantics & MemBarFenceAfter) { 1636 fence(); 1637 } else if (semantics & MemBarAcq) { 1638 isync(); 1639 } 1640 1641 if (use_result_reg && !preset_result_reg) { 1642 b(done); 1643 } 1644 1645 bind(failed); 1646 if (use_result_reg && !preset_result_reg) { 1647 li(int_flag_success, 0); 1648 } 1649 1650 bind(done); 1651 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1652 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1653 } 1654 1655 // Preforms atomic compare exchange: 1656 // if (compare_value == *addr_base) 1657 // *addr_base = exchange_value 1658 // int_flag_success = 1; 1659 // else 1660 // int_flag_success = 0; 1661 // 1662 // ConditionRegister flag = cmp(compare_value, *addr_base) 1663 // Register dest_current_value = *addr_base 1664 // Register compare_value Used to compare with value in memory 1665 // Register exchange_value Written to memory if compare_value == *addr_base 1666 // Register addr_base The memory location to compareXChange 1667 // Register int_flag_success Set to 1 if exchange_value was written to *addr_base 1668 // 1669 // To avoid the costly compare exchange the value is tested beforehand. 1670 // Several special cases exist to avoid that unnecessary information is generated. 1671 // 1672 void MacroAssembler::cmpxchgd(ConditionRegister flag, 1673 Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value, 1674 Register addr_base, int semantics, bool cmpxchgx_hint, 1675 Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) { 1676 Label retry; 1677 Label failed_int; 1678 Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int; 1679 Label done; 1680 1681 // Save one branch if result is returned via register and result register is different from the other ones. 1682 bool use_result_reg = (int_flag_success!=noreg); 1683 bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() && 1684 int_flag_success!=exchange_value && int_flag_success!=addr_base); 1685 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1686 assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both"); 1687 1688 if (use_result_reg && preset_result_reg) { 1689 li(int_flag_success, 0); // preset (assume cas failed) 1690 } 1691 1692 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1693 if (contention_hint) { // Don't try to reserve if cmp fails. 1694 ld(dest_current_value, 0, addr_base); 1695 cmpd(flag, compare_value, dest_current_value); 1696 bne(flag, failed); 1697 } 1698 1699 // release/fence semantics 1700 if (semantics & MemBarRel) { 1701 release(); 1702 } 1703 1704 // atomic emulation loop 1705 bind(retry); 1706 1707 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1708 cmpd(flag, compare_value, dest_current_value); 1709 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1710 bne_predict_not_taken(flag, failed); 1711 } else { 1712 bne( flag, failed); 1713 } 1714 1715 stdcx_(exchange_value, addr_base); 1716 if (!weak || use_result_reg || failed_ext) { 1717 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1718 bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1719 } else { 1720 bne( CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1721 } 1722 } 1723 1724 // result in register (must do this at the end because int_flag_success can be the same register as one above) 1725 if (use_result_reg) { 1726 li(int_flag_success, 1); 1727 } 1728 1729 if (semantics & MemBarFenceAfter) { 1730 fence(); 1731 } else if (semantics & MemBarAcq) { 1732 isync(); 1733 } 1734 1735 if (use_result_reg && !preset_result_reg) { 1736 b(done); 1737 } 1738 1739 bind(failed_int); 1740 if (use_result_reg && !preset_result_reg) { 1741 li(int_flag_success, 0); 1742 } 1743 1744 bind(done); 1745 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1746 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1747 } 1748 1749 // Look up the method for a megamorphic invokeinterface call. 1750 // The target method is determined by <intf_klass, itable_index>. 1751 // The receiver klass is in recv_klass. 1752 // On success, the result will be in method_result, and execution falls through. 1753 // On failure, execution transfers to the given label. 1754 void MacroAssembler::lookup_interface_method(Register recv_klass, 1755 Register intf_klass, 1756 RegisterOrConstant itable_index, 1757 Register method_result, 1758 Register scan_temp, 1759 Register temp2, 1760 Label& L_no_such_interface, 1761 bool return_method) { 1762 assert_different_registers(recv_klass, intf_klass, method_result, scan_temp); 1763 1764 // Compute start of first itableOffsetEntry (which is at the end of the vtable). 1765 int vtable_base = in_bytes(Klass::vtable_start_offset()); 1766 int itentry_off = itableMethodEntry::method_offset_in_bytes(); 1767 int logMEsize = exact_log2(itableMethodEntry::size() * wordSize); 1768 int scan_step = itableOffsetEntry::size() * wordSize; 1769 int log_vte_size= exact_log2(vtableEntry::size_in_bytes()); 1770 1771 lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass); 1772 // %%% We should store the aligned, prescaled offset in the klassoop. 1773 // Then the next several instructions would fold away. 1774 1775 sldi(scan_temp, scan_temp, log_vte_size); 1776 addi(scan_temp, scan_temp, vtable_base); 1777 add(scan_temp, recv_klass, scan_temp); 1778 1779 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 1780 if (return_method) { 1781 if (itable_index.is_register()) { 1782 Register itable_offset = itable_index.as_register(); 1783 sldi(method_result, itable_offset, logMEsize); 1784 if (itentry_off) { addi(method_result, method_result, itentry_off); } 1785 add(method_result, method_result, recv_klass); 1786 } else { 1787 long itable_offset = (long)itable_index.as_constant(); 1788 // static address, no relocation 1789 add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2); 1790 } 1791 } 1792 1793 // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) { 1794 // if (scan->interface() == intf) { 1795 // result = (klass + scan->offset() + itable_index); 1796 // } 1797 // } 1798 Label search, found_method; 1799 1800 for (int peel = 1; peel >= 0; peel--) { 1801 // %%%% Could load both offset and interface in one ldx, if they were 1802 // in the opposite order. This would save a load. 1803 ld(temp2, itableOffsetEntry::interface_offset_in_bytes(), scan_temp); 1804 1805 // Check that this entry is non-null. A null entry means that 1806 // the receiver class doesn't implement the interface, and wasn't the 1807 // same as when the caller was compiled. 1808 cmpd(CCR0, temp2, intf_klass); 1809 1810 if (peel) { 1811 beq(CCR0, found_method); 1812 } else { 1813 bne(CCR0, search); 1814 // (invert the test to fall through to found_method...) 1815 } 1816 1817 if (!peel) break; 1818 1819 bind(search); 1820 1821 cmpdi(CCR0, temp2, 0); 1822 beq(CCR0, L_no_such_interface); 1823 addi(scan_temp, scan_temp, scan_step); 1824 } 1825 1826 bind(found_method); 1827 1828 // Got a hit. 1829 if (return_method) { 1830 int ito_offset = itableOffsetEntry::offset_offset_in_bytes(); 1831 lwz(scan_temp, ito_offset, scan_temp); 1832 ldx(method_result, scan_temp, method_result); 1833 } 1834 } 1835 1836 // virtual method calling 1837 void MacroAssembler::lookup_virtual_method(Register recv_klass, 1838 RegisterOrConstant vtable_index, 1839 Register method_result) { 1840 1841 assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg()); 1842 1843 const int base = in_bytes(Klass::vtable_start_offset()); 1844 assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 1845 1846 if (vtable_index.is_register()) { 1847 sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord); 1848 add(recv_klass, vtable_index.as_register(), recv_klass); 1849 } else { 1850 addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord); 1851 } 1852 ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass); 1853 } 1854 1855 /////////////////////////////////////////// subtype checking //////////////////////////////////////////// 1856 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 1857 Register super_klass, 1858 Register temp1_reg, 1859 Register temp2_reg, 1860 Label* L_success, 1861 Label* L_failure, 1862 Label* L_slow_path, 1863 RegisterOrConstant super_check_offset) { 1864 1865 const Register check_cache_offset = temp1_reg; 1866 const Register cached_super = temp2_reg; 1867 1868 assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super); 1869 1870 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1871 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1872 1873 bool must_load_sco = (super_check_offset.constant_or_zero() == -1); 1874 bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset); 1875 1876 Label L_fallthrough; 1877 int label_nulls = 0; 1878 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 1879 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 1880 if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; } 1881 assert(label_nulls <= 1 || 1882 (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path), 1883 "at most one NULL in the batch, usually"); 1884 1885 // If the pointers are equal, we are done (e.g., String[] elements). 1886 // This self-check enables sharing of secondary supertype arrays among 1887 // non-primary types such as array-of-interface. Otherwise, each such 1888 // type would need its own customized SSA. 1889 // We move this check to the front of the fast path because many 1890 // type checks are in fact trivially successful in this manner, 1891 // so we get a nicely predicted branch right at the start of the check. 1892 cmpd(CCR0, sub_klass, super_klass); 1893 beq(CCR0, *L_success); 1894 1895 // Check the supertype display: 1896 if (must_load_sco) { 1897 // The super check offset is always positive... 1898 lwz(check_cache_offset, sco_offset, super_klass); 1899 super_check_offset = RegisterOrConstant(check_cache_offset); 1900 // super_check_offset is register. 1901 assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register()); 1902 } 1903 // The loaded value is the offset from KlassOopDesc. 1904 1905 ld(cached_super, super_check_offset, sub_klass); 1906 cmpd(CCR0, cached_super, super_klass); 1907 1908 // This check has worked decisively for primary supers. 1909 // Secondary supers are sought in the super_cache ('super_cache_addr'). 1910 // (Secondary supers are interfaces and very deeply nested subtypes.) 1911 // This works in the same check above because of a tricky aliasing 1912 // between the super_cache and the primary super display elements. 1913 // (The 'super_check_addr' can address either, as the case requires.) 1914 // Note that the cache is updated below if it does not help us find 1915 // what we need immediately. 1916 // So if it was a primary super, we can just fail immediately. 1917 // Otherwise, it's the slow path for us (no success at this point). 1918 1919 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); } 1920 1921 if (super_check_offset.is_register()) { 1922 beq(CCR0, *L_success); 1923 cmpwi(CCR0, super_check_offset.as_register(), sc_offset); 1924 if (L_failure == &L_fallthrough) { 1925 beq(CCR0, *L_slow_path); 1926 } else { 1927 bne(CCR0, *L_failure); 1928 FINAL_JUMP(*L_slow_path); 1929 } 1930 } else { 1931 if (super_check_offset.as_constant() == sc_offset) { 1932 // Need a slow path; fast failure is impossible. 1933 if (L_slow_path == &L_fallthrough) { 1934 beq(CCR0, *L_success); 1935 } else { 1936 bne(CCR0, *L_slow_path); 1937 FINAL_JUMP(*L_success); 1938 } 1939 } else { 1940 // No slow path; it's a fast decision. 1941 if (L_failure == &L_fallthrough) { 1942 beq(CCR0, *L_success); 1943 } else { 1944 bne(CCR0, *L_failure); 1945 FINAL_JUMP(*L_success); 1946 } 1947 } 1948 } 1949 1950 bind(L_fallthrough); 1951 #undef FINAL_JUMP 1952 } 1953 1954 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 1955 Register super_klass, 1956 Register temp1_reg, 1957 Register temp2_reg, 1958 Label* L_success, 1959 Register result_reg) { 1960 const Register array_ptr = temp1_reg; // current value from cache array 1961 const Register temp = temp2_reg; 1962 1963 assert_different_registers(sub_klass, super_klass, array_ptr, temp); 1964 1965 int source_offset = in_bytes(Klass::secondary_supers_offset()); 1966 int target_offset = in_bytes(Klass::secondary_super_cache_offset()); 1967 1968 int length_offset = Array<Klass*>::length_offset_in_bytes(); 1969 int base_offset = Array<Klass*>::base_offset_in_bytes(); 1970 1971 Label hit, loop, failure, fallthru; 1972 1973 ld(array_ptr, source_offset, sub_klass); 1974 1975 // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated."); 1976 lwz(temp, length_offset, array_ptr); 1977 cmpwi(CCR0, temp, 0); 1978 beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0 1979 1980 mtctr(temp); // load ctr 1981 1982 bind(loop); 1983 // Oops in table are NO MORE compressed. 1984 ld(temp, base_offset, array_ptr); 1985 cmpd(CCR0, temp, super_klass); 1986 beq(CCR0, hit); 1987 addi(array_ptr, array_ptr, BytesPerWord); 1988 bdnz(loop); 1989 1990 bind(failure); 1991 if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss) 1992 b(fallthru); 1993 1994 bind(hit); 1995 std(super_klass, target_offset, sub_klass); // save result to cache 1996 if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit) 1997 if (L_success != NULL) { b(*L_success); } 1998 else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided 1999 2000 bind(fallthru); 2001 } 2002 2003 // Try fast path, then go to slow one if not successful 2004 void MacroAssembler::check_klass_subtype(Register sub_klass, 2005 Register super_klass, 2006 Register temp1_reg, 2007 Register temp2_reg, 2008 Label& L_success) { 2009 Label L_failure; 2010 check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure); 2011 check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success); 2012 bind(L_failure); // Fallthru if not successful. 2013 } 2014 2015 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) { 2016 assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required"); 2017 2018 Label L_fallthrough; 2019 if (L_fast_path == NULL) { 2020 L_fast_path = &L_fallthrough; 2021 } else if (L_slow_path == NULL) { 2022 L_slow_path = &L_fallthrough; 2023 } 2024 2025 // Fast path check: class is fully initialized 2026 lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass); 2027 cmpwi(CCR0, R0, InstanceKlass::fully_initialized); 2028 beq(CCR0, *L_fast_path); 2029 2030 // Fast path check: current thread is initializer thread 2031 ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass); 2032 cmpd(CCR0, thread, R0); 2033 if (L_slow_path == &L_fallthrough) { 2034 beq(CCR0, *L_fast_path); 2035 } else if (L_fast_path == &L_fallthrough) { 2036 bne(CCR0, *L_slow_path); 2037 } else { 2038 Unimplemented(); 2039 } 2040 2041 bind(L_fallthrough); 2042 } 2043 2044 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot, 2045 Register temp_reg, 2046 int extra_slot_offset) { 2047 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 2048 int stackElementSize = Interpreter::stackElementSize; 2049 int offset = extra_slot_offset * stackElementSize; 2050 if (arg_slot.is_constant()) { 2051 offset += arg_slot.as_constant() * stackElementSize; 2052 return offset; 2053 } else { 2054 assert(temp_reg != noreg, "must specify"); 2055 sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize)); 2056 if (offset != 0) 2057 addi(temp_reg, temp_reg, offset); 2058 return temp_reg; 2059 } 2060 } 2061 2062 // Supports temp2_reg = R0. 2063 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg, 2064 Register mark_reg, Register temp_reg, 2065 Register temp2_reg, Label& done, Label* slow_case) { 2066 assert(UseBiasedLocking, "why call this otherwise?"); 2067 2068 #ifdef ASSERT 2069 assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg); 2070 #endif 2071 2072 Label cas_label; 2073 2074 // Branch to done if fast path fails and no slow_case provided. 2075 Label *slow_case_int = (slow_case != NULL) ? slow_case : &done; 2076 2077 // Biased locking 2078 // See whether the lock is currently biased toward our thread and 2079 // whether the epoch is still valid 2080 // Note that the runtime guarantees sufficient alignment of JavaThread 2081 // pointers to allow age to be placed into low bits 2082 assert(markWord::age_shift == markWord::lock_bits + markWord::biased_lock_bits, 2083 "biased locking makes assumptions about bit layout"); 2084 2085 if (PrintBiasedLockingStatistics) { 2086 load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg); 2087 lwzx(temp_reg, temp2_reg); 2088 addi(temp_reg, temp_reg, 1); 2089 stwx(temp_reg, temp2_reg); 2090 } 2091 2092 andi(temp_reg, mark_reg, markWord::biased_lock_mask_in_place); 2093 cmpwi(cr_reg, temp_reg, markWord::biased_lock_pattern); 2094 bne(cr_reg, cas_label); 2095 2096 load_klass(temp_reg, obj_reg); 2097 2098 load_const_optimized(temp2_reg, ~((int) markWord::age_mask_in_place)); 2099 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 2100 orr(temp_reg, R16_thread, temp_reg); 2101 xorr(temp_reg, mark_reg, temp_reg); 2102 andr(temp_reg, temp_reg, temp2_reg); 2103 cmpdi(cr_reg, temp_reg, 0); 2104 if (PrintBiasedLockingStatistics) { 2105 Label l; 2106 bne(cr_reg, l); 2107 load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr()); 2108 lwzx(mark_reg, temp2_reg); 2109 addi(mark_reg, mark_reg, 1); 2110 stwx(mark_reg, temp2_reg); 2111 // restore mark_reg 2112 ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg); 2113 bind(l); 2114 } 2115 beq(cr_reg, done); 2116 2117 Label try_revoke_bias; 2118 Label try_rebias; 2119 2120 // At this point we know that the header has the bias pattern and 2121 // that we are not the bias owner in the current epoch. We need to 2122 // figure out more details about the state of the header in order to 2123 // know what operations can be legally performed on the object's 2124 // header. 2125 2126 // If the low three bits in the xor result aren't clear, that means 2127 // the prototype header is no longer biased and we have to revoke 2128 // the bias on this object. 2129 andi(temp2_reg, temp_reg, markWord::biased_lock_mask_in_place); 2130 cmpwi(cr_reg, temp2_reg, 0); 2131 bne(cr_reg, try_revoke_bias); 2132 2133 // Biasing is still enabled for this data type. See whether the 2134 // epoch of the current bias is still valid, meaning that the epoch 2135 // bits of the mark word are equal to the epoch bits of the 2136 // prototype header. (Note that the prototype header's epoch bits 2137 // only change at a safepoint.) If not, attempt to rebias the object 2138 // toward the current thread. Note that we must be absolutely sure 2139 // that the current epoch is invalid in order to do this because 2140 // otherwise the manipulations it performs on the mark word are 2141 // illegal. 2142 2143 int shift_amount = 64 - markWord::epoch_shift; 2144 // rotate epoch bits to right (little) end and set other bits to 0 2145 // [ big part | epoch | little part ] -> [ 0..0 | epoch ] 2146 rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markWord::epoch_bits); 2147 // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented 2148 bne(CCR0, try_rebias); 2149 2150 // The epoch of the current bias is still valid but we know nothing 2151 // about the owner; it might be set or it might be clear. Try to 2152 // acquire the bias of the object using an atomic operation. If this 2153 // fails we will go in to the runtime to revoke the object's bias. 2154 // Note that we first construct the presumed unbiased header so we 2155 // don't accidentally blow away another thread's valid bias. 2156 andi(mark_reg, mark_reg, (markWord::biased_lock_mask_in_place | 2157 markWord::age_mask_in_place | 2158 markWord::epoch_mask_in_place)); 2159 orr(temp_reg, R16_thread, mark_reg); 2160 2161 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2162 2163 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg). 2164 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 2165 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 2166 /*where=*/obj_reg, 2167 MacroAssembler::MemBarAcq, 2168 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2169 noreg, slow_case_int); // bail out if failed 2170 2171 // If the biasing toward our thread failed, this means that 2172 // another thread succeeded in biasing it toward itself and we 2173 // need to revoke that bias. The revocation will occur in the 2174 // interpreter runtime in the slow case. 2175 if (PrintBiasedLockingStatistics) { 2176 load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg); 2177 lwzx(temp_reg, temp2_reg); 2178 addi(temp_reg, temp_reg, 1); 2179 stwx(temp_reg, temp2_reg); 2180 } 2181 b(done); 2182 2183 bind(try_rebias); 2184 // At this point we know the epoch has expired, meaning that the 2185 // current "bias owner", if any, is actually invalid. Under these 2186 // circumstances _only_, we are allowed to use the current header's 2187 // value as the comparison value when doing the cas to acquire the 2188 // bias in the current epoch. In other words, we allow transfer of 2189 // the bias from one thread to another directly in this situation. 2190 load_klass(temp_reg, obj_reg); 2191 andi(temp2_reg, mark_reg, markWord::age_mask_in_place); 2192 orr(temp2_reg, R16_thread, temp2_reg); 2193 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 2194 orr(temp_reg, temp2_reg, temp_reg); 2195 2196 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2197 2198 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 2199 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 2200 /*where=*/obj_reg, 2201 MacroAssembler::MemBarAcq, 2202 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2203 noreg, slow_case_int); // bail out if failed 2204 2205 // If the biasing toward our thread failed, this means that 2206 // another thread succeeded in biasing it toward itself and we 2207 // need to revoke that bias. The revocation will occur in the 2208 // interpreter runtime in the slow case. 2209 if (PrintBiasedLockingStatistics) { 2210 load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg); 2211 lwzx(temp_reg, temp2_reg); 2212 addi(temp_reg, temp_reg, 1); 2213 stwx(temp_reg, temp2_reg); 2214 } 2215 b(done); 2216 2217 bind(try_revoke_bias); 2218 // The prototype mark in the klass doesn't have the bias bit set any 2219 // more, indicating that objects of this data type are not supposed 2220 // to be biased any more. We are going to try to reset the mark of 2221 // this object to the prototype value and fall through to the 2222 // CAS-based locking scheme. Note that if our CAS fails, it means 2223 // that another thread raced us for the privilege of revoking the 2224 // bias of this particular object, so it's okay to continue in the 2225 // normal locking code. 2226 load_klass(temp_reg, obj_reg); 2227 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 2228 andi(temp2_reg, mark_reg, markWord::age_mask_in_place); 2229 orr(temp_reg, temp_reg, temp2_reg); 2230 2231 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2232 2233 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg). 2234 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 2235 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 2236 /*where=*/obj_reg, 2237 MacroAssembler::MemBarAcq, 2238 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2239 2240 // reload markWord in mark_reg before continuing with lightweight locking 2241 ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg); 2242 2243 // Fall through to the normal CAS-based lock, because no matter what 2244 // the result of the above CAS, some thread must have succeeded in 2245 // removing the bias bit from the object's header. 2246 if (PrintBiasedLockingStatistics) { 2247 Label l; 2248 bne(cr_reg, l); 2249 load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg); 2250 lwzx(temp_reg, temp2_reg); 2251 addi(temp_reg, temp_reg, 1); 2252 stwx(temp_reg, temp2_reg); 2253 bind(l); 2254 } 2255 2256 bind(cas_label); 2257 } 2258 2259 void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) { 2260 // Check for biased locking unlock case, which is a no-op 2261 // Note: we do not have to check the thread ID for two reasons. 2262 // First, the interpreter checks for IllegalMonitorStateException at 2263 // a higher level. Second, if the bias was revoked while we held the 2264 // lock, the object could not be rebiased toward another thread, so 2265 // the bias bit would be clear. 2266 2267 ld(temp_reg, 0, mark_addr); 2268 andi(temp_reg, temp_reg, markWord::biased_lock_mask_in_place); 2269 2270 cmpwi(cr_reg, temp_reg, markWord::biased_lock_pattern); 2271 beq(cr_reg, done); 2272 } 2273 2274 // allocation (for C1) 2275 void MacroAssembler::eden_allocate( 2276 Register obj, // result: pointer to object after successful allocation 2277 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2278 int con_size_in_bytes, // object size in bytes if known at compile time 2279 Register t1, // temp register 2280 Register t2, // temp register 2281 Label& slow_case // continuation point if fast allocation fails 2282 ) { 2283 b(slow_case); 2284 } 2285 2286 void MacroAssembler::tlab_allocate( 2287 Register obj, // result: pointer to object after successful allocation 2288 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2289 int con_size_in_bytes, // object size in bytes if known at compile time 2290 Register t1, // temp register 2291 Label& slow_case // continuation point if fast allocation fails 2292 ) { 2293 // make sure arguments make sense 2294 assert_different_registers(obj, var_size_in_bytes, t1); 2295 assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size"); 2296 assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment"); 2297 2298 const Register new_top = t1; 2299 //verify_tlab(); not implemented 2300 2301 ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2302 ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread); 2303 if (var_size_in_bytes == noreg) { 2304 addi(new_top, obj, con_size_in_bytes); 2305 } else { 2306 add(new_top, obj, var_size_in_bytes); 2307 } 2308 cmpld(CCR0, new_top, R0); 2309 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case); 2310 2311 #ifdef ASSERT 2312 // make sure new free pointer is properly aligned 2313 { 2314 Label L; 2315 andi_(R0, new_top, MinObjAlignmentInBytesMask); 2316 beq(CCR0, L); 2317 stop("updated TLAB free is not properly aligned", 0x934); 2318 bind(L); 2319 } 2320 #endif // ASSERT 2321 2322 // update the tlab top pointer 2323 std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2324 //verify_tlab(); not implemented 2325 } 2326 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) { 2327 unimplemented("incr_allocated_bytes"); 2328 } 2329 2330 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset, 2331 int insts_call_instruction_offset, Register Rtoc) { 2332 // Start the stub. 2333 address stub = start_a_stub(64); 2334 if (stub == NULL) { return NULL; } // CodeCache full: bail out 2335 2336 // Create a trampoline stub relocation which relates this trampoline stub 2337 // with the call instruction at insts_call_instruction_offset in the 2338 // instructions code-section. 2339 relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset)); 2340 const int stub_start_offset = offset(); 2341 2342 // For java_to_interp stubs we use R11_scratch1 as scratch register 2343 // and in call trampoline stubs we use R12_scratch2. This way we 2344 // can distinguish them (see is_NativeCallTrampolineStub_at()). 2345 Register reg_scratch = R12_scratch2; 2346 2347 // Now, create the trampoline stub's code: 2348 // - load the TOC 2349 // - load the call target from the constant pool 2350 // - call 2351 if (Rtoc == noreg) { 2352 calculate_address_from_global_toc(reg_scratch, method_toc()); 2353 Rtoc = reg_scratch; 2354 } 2355 2356 ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false); 2357 mtctr(reg_scratch); 2358 bctr(); 2359 2360 const address stub_start_addr = addr_at(stub_start_offset); 2361 2362 // Assert that the encoded destination_toc_offset can be identified and that it is correct. 2363 assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(), 2364 "encoded offset into the constant pool must match"); 2365 // Trampoline_stub_size should be good. 2366 assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size"); 2367 assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline"); 2368 2369 // End the stub. 2370 end_a_stub(); 2371 return stub; 2372 } 2373 2374 // TM on PPC64. 2375 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) { 2376 Label retry; 2377 bind(retry); 2378 ldarx(result, addr, /*hint*/ false); 2379 addi(result, result, simm16); 2380 stdcx_(result, addr); 2381 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2382 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2383 } else { 2384 bne( CCR0, retry); // stXcx_ sets CCR0 2385 } 2386 } 2387 2388 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) { 2389 Label retry; 2390 bind(retry); 2391 lwarx(result, addr, /*hint*/ false); 2392 ori(result, result, uimm16); 2393 stwcx_(result, addr); 2394 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2395 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2396 } else { 2397 bne( CCR0, retry); // stXcx_ sets CCR0 2398 } 2399 } 2400 2401 #if INCLUDE_RTM_OPT 2402 2403 // Update rtm_counters based on abort status 2404 // input: abort_status 2405 // rtm_counters_Reg (RTMLockingCounters*) 2406 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) { 2407 // Mapping to keep PreciseRTMLockingStatistics similar to x86. 2408 // x86 ppc (! means inverted, ? means not the same) 2409 // 0 31 Set if abort caused by XABORT instruction. 2410 // 1 ! 7 If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set. 2411 // 2 13 Set if another logical processor conflicted with a memory address that was part of the transaction that aborted. 2412 // 3 10 Set if an internal buffer overflowed. 2413 // 4 ?12 Set if a debug breakpoint was hit. 2414 // 5 ?32 Set if an abort occurred during execution of a nested transaction. 2415 const int failure_bit[] = {tm_tabort, // Signal handler will set this too. 2416 tm_failure_persistent, 2417 tm_non_trans_cf, 2418 tm_trans_cf, 2419 tm_footprint_of, 2420 tm_failure_code, 2421 tm_transaction_level}; 2422 2423 const int num_failure_bits = sizeof(failure_bit) / sizeof(int); 2424 const int num_counters = RTMLockingCounters::ABORT_STATUS_LIMIT; 2425 2426 const int bit2counter_map[][num_counters] = 2427 // 0 = no map; 1 = mapped, no inverted logic; -1 = mapped, inverted logic 2428 // Inverted logic means that if a bit is set don't count it, or vice-versa. 2429 // Care must be taken when mapping bits to counters as bits for a given 2430 // counter must be mutually exclusive. Otherwise, the counter will be 2431 // incremented more than once. 2432 // counters: 2433 // 0 1 2 3 4 5 2434 // abort , persist, conflict, overflow, debug , nested bits: 2435 {{ 1 , 0 , 0 , 0 , 0 , 0 }, // abort 2436 { 0 , -1 , 0 , 0 , 0 , 0 }, // failure_persistent 2437 { 0 , 0 , 1 , 0 , 0 , 0 }, // non_trans_cf 2438 { 0 , 0 , 1 , 0 , 0 , 0 }, // trans_cf 2439 { 0 , 0 , 0 , 1 , 0 , 0 }, // footprint_of 2440 { 0 , 0 , 0 , 0 , -1 , 0 }, // failure_code = 0xD4 2441 { 0 , 0 , 0 , 0 , 0 , 1 }}; // transaction_level > 1 2442 // ... 2443 2444 // Move abort_status value to R0 and use abort_status register as a 2445 // temporary register because R0 as third operand in ld/std is treated 2446 // as base address zero (value). Likewise, R0 as second operand in addi 2447 // is problematic because it amounts to li. 2448 const Register temp_Reg = abort_status; 2449 const Register abort_status_R0 = R0; 2450 mr(abort_status_R0, abort_status); 2451 2452 // Increment total abort counter. 2453 int counters_offs = RTMLockingCounters::abort_count_offset(); 2454 ld(temp_Reg, counters_offs, rtm_counters_Reg); 2455 addi(temp_Reg, temp_Reg, 1); 2456 std(temp_Reg, counters_offs, rtm_counters_Reg); 2457 2458 // Increment specific abort counters. 2459 if (PrintPreciseRTMLockingStatistics) { 2460 2461 // #0 counter offset. 2462 int abortX_offs = RTMLockingCounters::abortX_count_offset(); 2463 2464 for (int nbit = 0; nbit < num_failure_bits; nbit++) { 2465 for (int ncounter = 0; ncounter < num_counters; ncounter++) { 2466 if (bit2counter_map[nbit][ncounter] != 0) { 2467 Label check_abort; 2468 int abort_counter_offs = abortX_offs + (ncounter << 3); 2469 2470 if (failure_bit[nbit] == tm_transaction_level) { 2471 // Don't check outer transaction, TL = 1 (bit 63). Hence only 2472 // 11 bits in the TL field are checked to find out if failure 2473 // occured in a nested transaction. This check also matches 2474 // the case when nesting_of = 1 (nesting overflow). 2475 rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 10); 2476 } else if (failure_bit[nbit] == tm_failure_code) { 2477 // Check failure code for trap or illegal caught in TM. 2478 // Bits 0:7 are tested as bit 7 (persistent) is copied from 2479 // tabort or treclaim source operand. 2480 // On Linux: trap or illegal is TM_CAUSE_SIGNAL (0xD4). 2481 rldicl(temp_Reg, abort_status_R0, 8, 56); 2482 cmpdi(CCR0, temp_Reg, 0xD4); 2483 } else { 2484 rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 0); 2485 } 2486 2487 if (bit2counter_map[nbit][ncounter] == 1) { 2488 beq(CCR0, check_abort); 2489 } else { 2490 bne(CCR0, check_abort); 2491 } 2492 2493 // We don't increment atomically. 2494 ld(temp_Reg, abort_counter_offs, rtm_counters_Reg); 2495 addi(temp_Reg, temp_Reg, 1); 2496 std(temp_Reg, abort_counter_offs, rtm_counters_Reg); 2497 2498 bind(check_abort); 2499 } 2500 } 2501 } 2502 } 2503 // Restore abort_status. 2504 mr(abort_status, abort_status_R0); 2505 } 2506 2507 // Branch if (random & (count-1) != 0), count is 2^n 2508 // tmp and CR0 are killed 2509 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) { 2510 mftb(tmp); 2511 andi_(tmp, tmp, count-1); 2512 bne(CCR0, brLabel); 2513 } 2514 2515 // Perform abort ratio calculation, set no_rtm bit if high ratio. 2516 // input: rtm_counters_Reg (RTMLockingCounters* address) - KILLED 2517 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg, 2518 RTMLockingCounters* rtm_counters, 2519 Metadata* method_data) { 2520 Label L_done, L_check_always_rtm1, L_check_always_rtm2; 2521 2522 if (RTMLockingCalculationDelay > 0) { 2523 // Delay calculation. 2524 ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr()); 2525 cmpdi(CCR0, rtm_counters_Reg, 0); 2526 beq(CCR0, L_done); 2527 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2528 } 2529 // Abort ratio calculation only if abort_count > RTMAbortThreshold. 2530 // Aborted transactions = abort_count * 100 2531 // All transactions = total_count * RTMTotalCountIncrRate 2532 // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio) 2533 ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg); 2534 if (is_simm(RTMAbortThreshold, 16)) { // cmpdi can handle 16bit immediate only. 2535 cmpdi(CCR0, R0, RTMAbortThreshold); 2536 blt(CCR0, L_check_always_rtm2); // reload of rtm_counters_Reg not necessary 2537 } else { 2538 load_const_optimized(rtm_counters_Reg, RTMAbortThreshold); 2539 cmpd(CCR0, R0, rtm_counters_Reg); 2540 blt(CCR0, L_check_always_rtm1); // reload of rtm_counters_Reg required 2541 } 2542 mulli(R0, R0, 100); 2543 2544 const Register tmpReg = rtm_counters_Reg; 2545 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2546 mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16 2547 mulli(tmpReg, tmpReg, RTMAbortRatio); // allowable range: int16 2548 cmpd(CCR0, R0, tmpReg); 2549 blt(CCR0, L_check_always_rtm1); // jump to reload 2550 if (method_data != NULL) { 2551 // Set rtm_state to "no rtm" in MDO. 2552 // Not using a metadata relocation. Method and Class Loader are kept alive anyway. 2553 // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.) 2554 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2555 atomic_ori_int(R0, tmpReg, NoRTM); 2556 } 2557 b(L_done); 2558 2559 bind(L_check_always_rtm1); 2560 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2561 bind(L_check_always_rtm2); 2562 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2563 int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate; 2564 if (is_simm(thresholdValue, 16)) { // cmpdi can handle 16bit immediate only. 2565 cmpdi(CCR0, tmpReg, thresholdValue); 2566 } else { 2567 load_const_optimized(R0, thresholdValue); 2568 cmpd(CCR0, tmpReg, R0); 2569 } 2570 blt(CCR0, L_done); 2571 if (method_data != NULL) { 2572 // Set rtm_state to "always rtm" in MDO. 2573 // Not using a metadata relocation. See above. 2574 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2575 atomic_ori_int(R0, tmpReg, UseRTM); 2576 } 2577 bind(L_done); 2578 } 2579 2580 // Update counters and perform abort ratio calculation. 2581 // input: abort_status_Reg 2582 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg, 2583 RTMLockingCounters* rtm_counters, 2584 Metadata* method_data, 2585 bool profile_rtm) { 2586 2587 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2588 // Update rtm counters based on state at abort. 2589 // Reads abort_status_Reg, updates flags. 2590 assert_different_registers(abort_status_Reg, temp_Reg); 2591 load_const_optimized(temp_Reg, (address)rtm_counters, R0); 2592 rtm_counters_update(abort_status_Reg, temp_Reg); 2593 if (profile_rtm) { 2594 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2595 rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data); 2596 } 2597 } 2598 2599 // Retry on abort if abort's status indicates non-persistent failure. 2600 // inputs: retry_count_Reg 2601 // : abort_status_Reg 2602 // output: retry_count_Reg decremented by 1 2603 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, 2604 Label& retryLabel, Label* checkRetry) { 2605 Label doneRetry; 2606 2607 // Don't retry if failure is persistent. 2608 // The persistent bit is set when a (A) Disallowed operation is performed in 2609 // transactional state, like for instance trying to write the TFHAR after a 2610 // transaction is started; or when there is (B) a Nesting Overflow (too many 2611 // nested transactions); or when (C) the Footprint overflows (too many 2612 // addressess touched in TM state so there is no more space in the footprint 2613 // area to track them); or in case of (D) a Self-Induced Conflict, i.e. a 2614 // store is performed to a given address in TM state, then once in suspended 2615 // state the same address is accessed. Failure (A) is very unlikely to occur 2616 // in the JVM. Failure (D) will never occur because Suspended state is never 2617 // used in the JVM. Thus mostly (B) a Nesting Overflow or (C) a Footprint 2618 // Overflow will set the persistent bit. 2619 rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0); 2620 bne(CCR0, doneRetry); 2621 2622 // Don't retry if transaction was deliberately aborted, i.e. caused by a 2623 // tabort instruction. 2624 rldicr_(R0, abort_status_Reg, tm_tabort, 0); 2625 bne(CCR0, doneRetry); 2626 2627 // Retry if transaction aborted due to a conflict with another thread. 2628 if (checkRetry) { bind(*checkRetry); } 2629 addic_(retry_count_Reg, retry_count_Reg, -1); 2630 blt(CCR0, doneRetry); 2631 b(retryLabel); 2632 bind(doneRetry); 2633 } 2634 2635 // Spin and retry if lock is busy. 2636 // inputs: owner_addr_Reg (monitor address) 2637 // : retry_count_Reg 2638 // output: retry_count_Reg decremented by 1 2639 // CTR is killed 2640 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) { 2641 Label SpinLoop, doneRetry, doRetry; 2642 addic_(retry_count_Reg, retry_count_Reg, -1); 2643 blt(CCR0, doneRetry); 2644 2645 if (RTMSpinLoopCount > 1) { 2646 li(R0, RTMSpinLoopCount); 2647 mtctr(R0); 2648 } 2649 2650 // low thread priority 2651 smt_prio_low(); 2652 bind(SpinLoop); 2653 2654 if (RTMSpinLoopCount > 1) { 2655 bdz(doRetry); 2656 ld(R0, 0, owner_addr_Reg); 2657 cmpdi(CCR0, R0, 0); 2658 bne(CCR0, SpinLoop); 2659 } 2660 2661 bind(doRetry); 2662 2663 // restore thread priority to default in userspace 2664 #ifdef LINUX 2665 smt_prio_medium_low(); 2666 #else 2667 smt_prio_medium(); 2668 #endif 2669 2670 b(retryLabel); 2671 2672 bind(doneRetry); 2673 } 2674 2675 // Use RTM for normal stack locks. 2676 // Input: objReg (object to lock) 2677 void MacroAssembler::rtm_stack_locking(ConditionRegister flag, 2678 Register obj, Register mark_word, Register tmp, 2679 Register retry_on_abort_count_Reg, 2680 RTMLockingCounters* stack_rtm_counters, 2681 Metadata* method_data, bool profile_rtm, 2682 Label& DONE_LABEL, Label& IsInflated) { 2683 assert(UseRTMForStackLocks, "why call this otherwise?"); 2684 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); 2685 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2686 2687 if (RTMRetryCount > 0) { 2688 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 2689 bind(L_rtm_retry); 2690 } 2691 andi_(R0, mark_word, markWord::monitor_value); // inflated vs stack-locked|neutral|biased 2692 bne(CCR0, IsInflated); 2693 2694 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2695 Label L_noincrement; 2696 if (RTMTotalCountIncrRate > 1) { 2697 branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement); 2698 } 2699 assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM"); 2700 load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0); 2701 //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically 2702 ldx(mark_word, tmp); 2703 addi(mark_word, mark_word, 1); 2704 stdx(mark_word, tmp); 2705 bind(L_noincrement); 2706 } 2707 tbegin_(); 2708 beq(CCR0, L_on_abort); 2709 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // Reload in transaction, conflicts need to be tracked. 2710 andi(R0, mark_word, markWord::biased_lock_mask_in_place); // look at 3 lock bits 2711 cmpwi(flag, R0, markWord::unlocked_value); // bits = 001 unlocked 2712 beq(flag, DONE_LABEL); // all done if unlocked 2713 2714 if (UseRTMXendForLockBusy) { 2715 tend_(); 2716 b(L_decrement_retry); 2717 } else { 2718 tabort_(); 2719 } 2720 bind(L_on_abort); 2721 const Register abort_status_Reg = tmp; 2722 mftexasr(abort_status_Reg); 2723 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2724 rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm); 2725 } 2726 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload 2727 if (RTMRetryCount > 0) { 2728 // Retry on lock abort if abort status is not permanent. 2729 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry); 2730 } else { 2731 bind(L_decrement_retry); 2732 } 2733 } 2734 2735 // Use RTM for inflating locks 2736 // inputs: obj (object to lock) 2737 // mark_word (current header - KILLED) 2738 // boxReg (on-stack box address (displaced header location) - KILLED) 2739 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag, 2740 Register obj, Register mark_word, Register boxReg, 2741 Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg, 2742 RTMLockingCounters* rtm_counters, 2743 Metadata* method_data, bool profile_rtm, 2744 Label& DONE_LABEL) { 2745 assert(UseRTMLocking, "why call this otherwise?"); 2746 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2747 // Clean monitor_value bit to get valid pointer. 2748 int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markWord::monitor_value; 2749 2750 // Store non-null, using boxReg instead of (intptr_t)markWord::unused_mark(). 2751 std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg); 2752 const Register tmpReg = boxReg; 2753 const Register owner_addr_Reg = mark_word; 2754 addi(owner_addr_Reg, mark_word, owner_offset); 2755 2756 if (RTMRetryCount > 0) { 2757 load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy. 2758 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort. 2759 bind(L_rtm_retry); 2760 } 2761 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2762 Label L_noincrement; 2763 if (RTMTotalCountIncrRate > 1) { 2764 branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement); 2765 } 2766 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2767 load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg); 2768 //atomic_inc_ptr(R0, tmpReg); We don't increment atomically 2769 ldx(tmpReg, R0); 2770 addi(tmpReg, tmpReg, 1); 2771 stdx(tmpReg, R0); 2772 bind(L_noincrement); 2773 } 2774 tbegin_(); 2775 beq(CCR0, L_on_abort); 2776 // We don't reload mark word. Will only be reset at safepoint. 2777 ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked. 2778 cmpdi(flag, R0, 0); 2779 beq(flag, DONE_LABEL); 2780 2781 if (UseRTMXendForLockBusy) { 2782 tend_(); 2783 b(L_decrement_retry); 2784 } else { 2785 tabort_(); 2786 } 2787 bind(L_on_abort); 2788 const Register abort_status_Reg = tmpReg; 2789 mftexasr(abort_status_Reg); 2790 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2791 rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm); 2792 // Restore owner_addr_Reg 2793 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); 2794 #ifdef ASSERT 2795 andi_(R0, mark_word, markWord::monitor_value); 2796 asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint. 2797 #endif 2798 addi(owner_addr_Reg, mark_word, owner_offset); 2799 } 2800 if (RTMRetryCount > 0) { 2801 // Retry on lock abort if abort status is not permanent. 2802 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); 2803 } 2804 2805 // Appears unlocked - try to swing _owner from null to non-null. 2806 cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg, 2807 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2808 MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true); 2809 2810 if (RTMRetryCount > 0) { 2811 // success done else retry 2812 b(DONE_LABEL); 2813 bind(L_decrement_retry); 2814 // Spin and retry if lock is busy. 2815 rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry); 2816 } else { 2817 bind(L_decrement_retry); 2818 } 2819 } 2820 2821 #endif // INCLUDE_RTM_OPT 2822 2823 // "The box" is the space on the stack where we copy the object mark. 2824 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box, 2825 Register temp, Register displaced_header, Register current_header, 2826 bool try_bias, 2827 RTMLockingCounters* rtm_counters, 2828 RTMLockingCounters* stack_rtm_counters, 2829 Metadata* method_data, 2830 bool use_rtm, bool profile_rtm) { 2831 assert_different_registers(oop, box, temp, displaced_header, current_header); 2832 assert(flag != CCR0, "bad condition register"); 2833 Label cont; 2834 Label object_has_monitor; 2835 Label cas_failed; 2836 2837 // Load markWord from object into displaced_header. 2838 ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop); 2839 2840 2841 if (try_bias) { 2842 biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont); 2843 } 2844 2845 #if INCLUDE_RTM_OPT 2846 if (UseRTMForStackLocks && use_rtm) { 2847 rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header, 2848 stack_rtm_counters, method_data, profile_rtm, 2849 cont, object_has_monitor); 2850 } 2851 #endif // INCLUDE_RTM_OPT 2852 2853 // Handle existing monitor. 2854 // The object has an existing monitor iff (mark & monitor_value) != 0. 2855 andi_(temp, displaced_header, markWord::monitor_value); 2856 bne(CCR0, object_has_monitor); 2857 2858 // Set displaced_header to be (markWord of object | UNLOCK_VALUE). 2859 ori(displaced_header, displaced_header, markWord::unlocked_value); 2860 2861 // Load Compare Value application register. 2862 2863 // Initialize the box. (Must happen before we update the object mark!) 2864 std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2865 2866 // Must fence, otherwise, preceding store(s) may float below cmpxchg. 2867 // Compare object markWord with mark and if equal exchange scratch1 with object markWord. 2868 cmpxchgd(/*flag=*/flag, 2869 /*current_value=*/current_header, 2870 /*compare_value=*/displaced_header, 2871 /*exchange_value=*/box, 2872 /*where=*/oop, 2873 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2874 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2875 noreg, 2876 &cas_failed, 2877 /*check without membar and ldarx first*/true); 2878 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2879 2880 // If the compare-and-exchange succeeded, then we found an unlocked 2881 // object and we have now locked it. 2882 b(cont); 2883 2884 bind(cas_failed); 2885 // We did not see an unlocked object so try the fast recursive case. 2886 2887 // Check if the owner is self by comparing the value in the markWord of object 2888 // (current_header) with the stack pointer. 2889 sub(current_header, current_header, R1_SP); 2890 load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place); 2891 2892 and_(R0/*==0?*/, current_header, temp); 2893 // If condition is true we are cont and hence we can store 0 as the 2894 // displaced header in the box, which indicates that it is a recursive lock. 2895 mcrf(flag,CCR0); 2896 std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box); 2897 2898 // Handle existing monitor. 2899 b(cont); 2900 2901 bind(object_has_monitor); 2902 // The object's monitor m is unlocked iff m->owner == NULL, 2903 // otherwise m->owner may contain a thread or a stack address. 2904 2905 #if INCLUDE_RTM_OPT 2906 // Use the same RTM locking code in 32- and 64-bit VM. 2907 if (use_rtm) { 2908 rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header, 2909 rtm_counters, method_data, profile_rtm, cont); 2910 } else { 2911 #endif // INCLUDE_RTM_OPT 2912 2913 // Try to CAS m->owner from NULL to current thread. 2914 addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markWord::monitor_value); 2915 cmpxchgd(/*flag=*/flag, 2916 /*current_value=*/current_header, 2917 /*compare_value=*/(intptr_t)0, 2918 /*exchange_value=*/R16_thread, 2919 /*where=*/temp, 2920 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2921 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2922 2923 // Store a non-null value into the box. 2924 std(box, BasicLock::displaced_header_offset_in_bytes(), box); 2925 2926 # ifdef ASSERT 2927 bne(flag, cont); 2928 // We have acquired the monitor, check some invariants. 2929 addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes()); 2930 // Invariant 1: _recursions should be 0. 2931 //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size"); 2932 asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp, 2933 "monitor->_recursions should be 0", -1); 2934 # endif 2935 2936 #if INCLUDE_RTM_OPT 2937 } // use_rtm() 2938 #endif 2939 2940 bind(cont); 2941 // flag == EQ indicates success 2942 // flag == NE indicates failure 2943 } 2944 2945 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box, 2946 Register temp, Register displaced_header, Register current_header, 2947 bool try_bias, bool use_rtm) { 2948 assert_different_registers(oop, box, temp, displaced_header, current_header); 2949 assert(flag != CCR0, "bad condition register"); 2950 Label cont; 2951 Label object_has_monitor; 2952 2953 if (try_bias) { 2954 biased_locking_exit(flag, oop, current_header, cont); 2955 } 2956 2957 #if INCLUDE_RTM_OPT 2958 if (UseRTMForStackLocks && use_rtm) { 2959 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); 2960 Label L_regular_unlock; 2961 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); // fetch markword 2962 andi(R0, current_header, markWord::biased_lock_mask_in_place); // look at 3 lock bits 2963 cmpwi(flag, R0, markWord::unlocked_value); // bits = 001 unlocked 2964 bne(flag, L_regular_unlock); // else RegularLock 2965 tend_(); // otherwise end... 2966 b(cont); // ... and we're done 2967 bind(L_regular_unlock); 2968 } 2969 #endif 2970 2971 // Find the lock address and load the displaced header from the stack. 2972 ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2973 2974 // If the displaced header is 0, we have a recursive unlock. 2975 cmpdi(flag, displaced_header, 0); 2976 beq(flag, cont); 2977 2978 // Handle existing monitor. 2979 // The object has an existing monitor iff (mark & monitor_value) != 0. 2980 RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done 2981 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); 2982 andi_(R0, current_header, markWord::monitor_value); 2983 bne(CCR0, object_has_monitor); 2984 2985 // Check if it is still a light weight lock, this is is true if we see 2986 // the stack address of the basicLock in the markWord of the object. 2987 // Cmpxchg sets flag to cmpd(current_header, box). 2988 cmpxchgd(/*flag=*/flag, 2989 /*current_value=*/current_header, 2990 /*compare_value=*/box, 2991 /*exchange_value=*/displaced_header, 2992 /*where=*/oop, 2993 MacroAssembler::MemBarRel, 2994 MacroAssembler::cmpxchgx_hint_release_lock(), 2995 noreg, 2996 &cont); 2997 2998 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2999 3000 // Handle existing monitor. 3001 b(cont); 3002 3003 bind(object_has_monitor); 3004 STATIC_ASSERT(markWord::monitor_value <= INT_MAX); 3005 addi(current_header, current_header, -(int)markWord::monitor_value); // monitor 3006 ld(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); 3007 3008 // It's inflated. 3009 #if INCLUDE_RTM_OPT 3010 if (use_rtm) { 3011 Label L_regular_inflated_unlock; 3012 // Clean monitor_value bit to get valid pointer 3013 cmpdi(flag, temp, 0); 3014 bne(flag, L_regular_inflated_unlock); 3015 tend_(); 3016 b(cont); 3017 bind(L_regular_inflated_unlock); 3018 } 3019 #endif 3020 3021 ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header); 3022 xorr(temp, R16_thread, temp); // Will be 0 if we are the owner. 3023 orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions. 3024 cmpdi(flag, temp, 0); 3025 bne(flag, cont); 3026 3027 ld(temp, ObjectMonitor::EntryList_offset_in_bytes(), current_header); 3028 ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header); 3029 orr(temp, temp, displaced_header); // Will be 0 if both are 0. 3030 cmpdi(flag, temp, 0); 3031 bne(flag, cont); 3032 release(); 3033 std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); 3034 3035 bind(cont); 3036 // flag == EQ indicates success 3037 // flag == NE indicates failure 3038 } 3039 3040 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp_reg) { 3041 if (SafepointMechanism::uses_thread_local_poll()) { 3042 ld(temp_reg, in_bytes(Thread::polling_page_offset()), R16_thread); 3043 // Armed page has poll_bit set. 3044 andi_(temp_reg, temp_reg, SafepointMechanism::poll_bit()); 3045 } else { 3046 lwz(temp_reg, (RegisterOrConstant)(intptr_t)SafepointSynchronize::address_of_state()); 3047 cmpwi(CCR0, temp_reg, SafepointSynchronize::_not_synchronized); 3048 } 3049 bne(CCR0, slow_path); 3050 } 3051 3052 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, bool needs_frame) { 3053 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3054 bs->resolve_jobject(this, value, tmp1, tmp2, needs_frame); 3055 } 3056 3057 // Values for last_Java_pc, and last_Java_sp must comply to the rules 3058 // in frame_ppc.hpp. 3059 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) { 3060 // Always set last_Java_pc and flags first because once last_Java_sp 3061 // is visible has_last_Java_frame is true and users will look at the 3062 // rest of the fields. (Note: flags should always be zero before we 3063 // get here so doesn't need to be set.) 3064 3065 // Verify that last_Java_pc was zeroed on return to Java 3066 asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread, 3067 "last_Java_pc not zeroed before leaving Java", 0x200); 3068 3069 // When returning from calling out from Java mode the frame anchor's 3070 // last_Java_pc will always be set to NULL. It is set here so that 3071 // if we are doing a call to native (not VM) that we capture the 3072 // known pc and don't have to rely on the native call having a 3073 // standard frame linkage where we can find the pc. 3074 if (last_Java_pc != noreg) 3075 std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 3076 3077 // Set last_Java_sp last. 3078 std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 3079 } 3080 3081 void MacroAssembler::reset_last_Java_frame(void) { 3082 asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()), 3083 R16_thread, "SP was not set, still zero", 0x202); 3084 3085 BLOCK_COMMENT("reset_last_Java_frame {"); 3086 li(R0, 0); 3087 3088 // _last_Java_sp = 0 3089 std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 3090 3091 // _last_Java_pc = 0 3092 std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 3093 BLOCK_COMMENT("} reset_last_Java_frame"); 3094 } 3095 3096 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) { 3097 assert_different_registers(sp, tmp1); 3098 3099 // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via 3100 // TOP_IJAVA_FRAME_ABI. 3101 // FIXME: assert that we really have a TOP_IJAVA_FRAME here! 3102 address entry = pc(); 3103 load_const_optimized(tmp1, entry); 3104 3105 set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1); 3106 } 3107 3108 void MacroAssembler::get_vm_result(Register oop_result) { 3109 // Read: 3110 // R16_thread 3111 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3112 // 3113 // Updated: 3114 // oop_result 3115 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3116 3117 verify_thread(); 3118 3119 ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3120 li(R0, 0); 3121 std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3122 3123 verify_oop(oop_result); 3124 } 3125 3126 void MacroAssembler::get_vm_result_2(Register metadata_result) { 3127 // Read: 3128 // R16_thread 3129 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3130 // 3131 // Updated: 3132 // metadata_result 3133 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3134 3135 ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3136 li(R0, 0); 3137 std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3138 } 3139 3140 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) { 3141 Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided. 3142 if (CompressedKlassPointers::base() != 0) { 3143 // Use dst as temp if it is free. 3144 sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0); 3145 current = dst; 3146 } 3147 if (CompressedKlassPointers::shift() != 0) { 3148 srdi(dst, current, CompressedKlassPointers::shift()); 3149 current = dst; 3150 } 3151 return current; 3152 } 3153 3154 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) { 3155 if (UseCompressedClassPointers) { 3156 Register compressedKlass = encode_klass_not_null(ck, klass); 3157 stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop); 3158 } else { 3159 std(klass, oopDesc::klass_offset_in_bytes(), dst_oop); 3160 } 3161 } 3162 3163 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) { 3164 if (UseCompressedClassPointers) { 3165 if (val == noreg) { 3166 val = R0; 3167 li(val, 0); 3168 } 3169 stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed 3170 } 3171 } 3172 3173 int MacroAssembler::instr_size_for_decode_klass_not_null() { 3174 if (!UseCompressedClassPointers) return 0; 3175 int num_instrs = 1; // shift or move 3176 if (CompressedKlassPointers::base() != 0) num_instrs = 7; // shift + load const + add 3177 return num_instrs * BytesPerInstWord; 3178 } 3179 3180 void MacroAssembler::decode_klass_not_null(Register dst, Register src) { 3181 assert(dst != R0, "Dst reg may not be R0, as R0 is used here."); 3182 if (src == noreg) src = dst; 3183 Register shifted_src = src; 3184 if (CompressedKlassPointers::shift() != 0 || 3185 CompressedKlassPointers::base() == 0 && src != dst) { // Move required. 3186 shifted_src = dst; 3187 sldi(shifted_src, src, CompressedKlassPointers::shift()); 3188 } 3189 if (CompressedKlassPointers::base() != 0) { 3190 add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0); 3191 } 3192 } 3193 3194 void MacroAssembler::load_klass(Register dst, Register src) { 3195 if (UseCompressedClassPointers) { 3196 lwz(dst, oopDesc::klass_offset_in_bytes(), src); 3197 // Attention: no null check here! 3198 decode_klass_not_null(dst, dst); 3199 } else { 3200 ld(dst, oopDesc::klass_offset_in_bytes(), src); 3201 } 3202 } 3203 3204 // ((OopHandle)result).resolve(); 3205 void MacroAssembler::resolve_oop_handle(Register result) { 3206 // OopHandle::resolve is an indirection. 3207 ld(result, 0, result); 3208 } 3209 3210 void MacroAssembler::load_mirror_from_const_method(Register mirror, Register const_method) { 3211 ld(mirror, in_bytes(ConstMethod::constants_offset()), const_method); 3212 ld(mirror, ConstantPool::pool_holder_offset_in_bytes(), mirror); 3213 ld(mirror, in_bytes(Klass::java_mirror_offset()), mirror); 3214 resolve_oop_handle(mirror); 3215 } 3216 3217 void MacroAssembler::load_method_holder(Register holder, Register method) { 3218 ld(holder, in_bytes(Method::const_offset()), method); 3219 ld(holder, in_bytes(ConstMethod::constants_offset()), holder); 3220 ld(holder, ConstantPool::pool_holder_offset_in_bytes(), holder); 3221 } 3222 3223 // Clear Array 3224 // For very short arrays. tmp == R0 is allowed. 3225 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) { 3226 if (cnt_dwords > 0) { li(tmp, 0); } 3227 for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); } 3228 } 3229 3230 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed. 3231 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) { 3232 if (cnt_dwords < 8) { 3233 clear_memory_unrolled(base_ptr, cnt_dwords, tmp); 3234 return; 3235 } 3236 3237 Label loop; 3238 const long loopcnt = cnt_dwords >> 1, 3239 remainder = cnt_dwords & 1; 3240 3241 li(tmp, loopcnt); 3242 mtctr(tmp); 3243 li(tmp, 0); 3244 bind(loop); 3245 std(tmp, 0, base_ptr); 3246 std(tmp, 8, base_ptr); 3247 addi(base_ptr, base_ptr, 16); 3248 bdnz(loop); 3249 if (remainder) { std(tmp, 0, base_ptr); } 3250 } 3251 3252 // Kills both input registers. tmp == R0 is allowed. 3253 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) { 3254 // Procedure for large arrays (uses data cache block zero instruction). 3255 Label startloop, fast, fastloop, small_rest, restloop, done; 3256 const int cl_size = VM_Version::L1_data_cache_line_size(), 3257 cl_dwords = cl_size >> 3, 3258 cl_dw_addr_bits = exact_log2(cl_dwords), 3259 dcbz_min = 1, // Min count of dcbz executions, needs to be >0. 3260 min_cnt = ((dcbz_min + 1) << cl_dw_addr_bits) - 1; 3261 3262 if (const_cnt >= 0) { 3263 // Constant case. 3264 if (const_cnt < min_cnt) { 3265 clear_memory_constlen(base_ptr, const_cnt, tmp); 3266 return; 3267 } 3268 load_const_optimized(cnt_dwords, const_cnt, tmp); 3269 } else { 3270 // cnt_dwords already loaded in register. Need to check size. 3271 cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included). 3272 blt(CCR1, small_rest); 3273 } 3274 rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line. 3275 beq(CCR0, fast); // Already 128byte aligned. 3276 3277 subfic(tmp, tmp, cl_dwords); 3278 mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords). 3279 subf(cnt_dwords, tmp, cnt_dwords); // rest. 3280 li(tmp, 0); 3281 3282 bind(startloop); // Clear at the beginning to reach 128byte boundary. 3283 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3284 addi(base_ptr, base_ptr, 8); 3285 bdnz(startloop); 3286 3287 bind(fast); // Clear 128byte blocks. 3288 srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0). 3289 andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords. 3290 mtctr(tmp); // Load counter. 3291 3292 bind(fastloop); 3293 dcbz(base_ptr); // Clear 128byte aligned block. 3294 addi(base_ptr, base_ptr, cl_size); 3295 bdnz(fastloop); 3296 3297 bind(small_rest); 3298 cmpdi(CCR0, cnt_dwords, 0); // size 0? 3299 beq(CCR0, done); // rest == 0 3300 li(tmp, 0); 3301 mtctr(cnt_dwords); // Load counter. 3302 3303 bind(restloop); // Clear rest. 3304 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3305 addi(base_ptr, base_ptr, 8); 3306 bdnz(restloop); 3307 3308 bind(done); 3309 } 3310 3311 /////////////////////////////////////////// String intrinsics //////////////////////////////////////////// 3312 3313 #ifdef COMPILER2 3314 // Intrinsics for CompactStrings 3315 3316 // Compress char[] to byte[] by compressing 16 bytes at once. 3317 void MacroAssembler::string_compress_16(Register src, Register dst, Register cnt, 3318 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, 3319 Label& Lfailure) { 3320 3321 const Register tmp0 = R0; 3322 assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5); 3323 Label Lloop, Lslow; 3324 3325 // Check if cnt >= 8 (= 16 bytes) 3326 lis(tmp1, 0xFF); // tmp1 = 0x00FF00FF00FF00FF 3327 srwi_(tmp2, cnt, 3); 3328 beq(CCR0, Lslow); 3329 ori(tmp1, tmp1, 0xFF); 3330 rldimi(tmp1, tmp1, 32, 0); 3331 mtctr(tmp2); 3332 3333 // 2x unrolled loop 3334 bind(Lloop); 3335 ld(tmp2, 0, src); // _0_1_2_3 (Big Endian) 3336 ld(tmp4, 8, src); // _4_5_6_7 3337 3338 orr(tmp0, tmp2, tmp4); 3339 rldicl(tmp3, tmp2, 6*8, 64-24); // _____1_2 3340 rldimi(tmp2, tmp2, 2*8, 2*8); // _0_2_3_3 3341 rldicl(tmp5, tmp4, 6*8, 64-24); // _____5_6 3342 rldimi(tmp4, tmp4, 2*8, 2*8); // _4_6_7_7 3343 3344 andc_(tmp0, tmp0, tmp1); 3345 bne(CCR0, Lfailure); // Not latin1. 3346 addi(src, src, 16); 3347 3348 rlwimi(tmp3, tmp2, 0*8, 24, 31);// _____1_3 3349 srdi(tmp2, tmp2, 3*8); // ____0_2_ 3350 rlwimi(tmp5, tmp4, 0*8, 24, 31);// _____5_7 3351 srdi(tmp4, tmp4, 3*8); // ____4_6_ 3352 3353 orr(tmp2, tmp2, tmp3); // ____0123 3354 orr(tmp4, tmp4, tmp5); // ____4567 3355 3356 stw(tmp2, 0, dst); 3357 stw(tmp4, 4, dst); 3358 addi(dst, dst, 8); 3359 bdnz(Lloop); 3360 3361 bind(Lslow); // Fallback to slow version 3362 } 3363 3364 // Compress char[] to byte[]. cnt must be positive int. 3365 void MacroAssembler::string_compress(Register src, Register dst, Register cnt, Register tmp, Label& Lfailure) { 3366 Label Lloop; 3367 mtctr(cnt); 3368 3369 bind(Lloop); 3370 lhz(tmp, 0, src); 3371 cmplwi(CCR0, tmp, 0xff); 3372 bgt(CCR0, Lfailure); // Not latin1. 3373 addi(src, src, 2); 3374 stb(tmp, 0, dst); 3375 addi(dst, dst, 1); 3376 bdnz(Lloop); 3377 } 3378 3379 // Inflate byte[] to char[] by inflating 16 bytes at once. 3380 void MacroAssembler::string_inflate_16(Register src, Register dst, Register cnt, 3381 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) { 3382 const Register tmp0 = R0; 3383 assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5); 3384 Label Lloop, Lslow; 3385 3386 // Check if cnt >= 8 3387 srwi_(tmp2, cnt, 3); 3388 beq(CCR0, Lslow); 3389 lis(tmp1, 0xFF); // tmp1 = 0x00FF00FF 3390 ori(tmp1, tmp1, 0xFF); 3391 mtctr(tmp2); 3392 3393 // 2x unrolled loop 3394 bind(Lloop); 3395 lwz(tmp2, 0, src); // ____0123 (Big Endian) 3396 lwz(tmp4, 4, src); // ____4567 3397 addi(src, src, 8); 3398 3399 rldicl(tmp3, tmp2, 7*8, 64-8); // _______2 3400 rlwimi(tmp2, tmp2, 3*8, 16, 23);// ____0113 3401 rldicl(tmp5, tmp4, 7*8, 64-8); // _______6 3402 rlwimi(tmp4, tmp4, 3*8, 16, 23);// ____4557 3403 3404 andc(tmp0, tmp2, tmp1); // ____0_1_ 3405 rlwimi(tmp2, tmp3, 2*8, 0, 23); // _____2_3 3406 andc(tmp3, tmp4, tmp1); // ____4_5_ 3407 rlwimi(tmp4, tmp5, 2*8, 0, 23); // _____6_7 3408 3409 rldimi(tmp2, tmp0, 3*8, 0*8); // _0_1_2_3 3410 rldimi(tmp4, tmp3, 3*8, 0*8); // _4_5_6_7 3411 3412 std(tmp2, 0, dst); 3413 std(tmp4, 8, dst); 3414 addi(dst, dst, 16); 3415 bdnz(Lloop); 3416 3417 bind(Lslow); // Fallback to slow version 3418 } 3419 3420 // Inflate byte[] to char[]. cnt must be positive int. 3421 void MacroAssembler::string_inflate(Register src, Register dst, Register cnt, Register tmp) { 3422 Label Lloop; 3423 mtctr(cnt); 3424 3425 bind(Lloop); 3426 lbz(tmp, 0, src); 3427 addi(src, src, 1); 3428 sth(tmp, 0, dst); 3429 addi(dst, dst, 2); 3430 bdnz(Lloop); 3431 } 3432 3433 void MacroAssembler::string_compare(Register str1, Register str2, 3434 Register cnt1, Register cnt2, 3435 Register tmp1, Register result, int ae) { 3436 const Register tmp0 = R0, 3437 diff = tmp1; 3438 3439 assert_different_registers(str1, str2, cnt1, cnt2, tmp0, tmp1, result); 3440 Label Ldone, Lslow, Lloop, Lreturn_diff; 3441 3442 // Note: Making use of the fact that compareTo(a, b) == -compareTo(b, a) 3443 // we interchange str1 and str2 in the UL case and negate the result. 3444 // Like this, str1 is always latin1 encoded, except for the UU case. 3445 // In addition, we need 0 (or sign which is 0) extend. 3446 3447 if (ae == StrIntrinsicNode::UU) { 3448 srwi(cnt1, cnt1, 1); 3449 } else { 3450 clrldi(cnt1, cnt1, 32); 3451 } 3452 3453 if (ae != StrIntrinsicNode::LL) { 3454 srwi(cnt2, cnt2, 1); 3455 } else { 3456 clrldi(cnt2, cnt2, 32); 3457 } 3458 3459 // See if the lengths are different, and calculate min in cnt1. 3460 // Save diff in case we need it for a tie-breaker. 3461 subf_(diff, cnt2, cnt1); // diff = cnt1 - cnt2 3462 // if (diff > 0) { cnt1 = cnt2; } 3463 if (VM_Version::has_isel()) { 3464 isel(cnt1, CCR0, Assembler::greater, /*invert*/ false, cnt2); 3465 } else { 3466 Label Lskip; 3467 blt(CCR0, Lskip); 3468 mr(cnt1, cnt2); 3469 bind(Lskip); 3470 } 3471 3472 // Rename registers 3473 Register chr1 = result; 3474 Register chr2 = tmp0; 3475 3476 // Compare multiple characters in fast loop (only implemented for same encoding). 3477 int stride1 = 8, stride2 = 8; 3478 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3479 int log2_chars_per_iter = (ae == StrIntrinsicNode::LL) ? 3 : 2; 3480 Label Lfastloop, Lskipfast; 3481 3482 srwi_(tmp0, cnt1, log2_chars_per_iter); 3483 beq(CCR0, Lskipfast); 3484 rldicl(cnt2, cnt1, 0, 64 - log2_chars_per_iter); // Remaining characters. 3485 li(cnt1, 1 << log2_chars_per_iter); // Initialize for failure case: Rescan characters from current iteration. 3486 mtctr(tmp0); 3487 3488 bind(Lfastloop); 3489 ld(chr1, 0, str1); 3490 ld(chr2, 0, str2); 3491 cmpd(CCR0, chr1, chr2); 3492 bne(CCR0, Lslow); 3493 addi(str1, str1, stride1); 3494 addi(str2, str2, stride2); 3495 bdnz(Lfastloop); 3496 mr(cnt1, cnt2); // Remaining characters. 3497 bind(Lskipfast); 3498 } 3499 3500 // Loop which searches the first difference character by character. 3501 cmpwi(CCR0, cnt1, 0); 3502 beq(CCR0, Lreturn_diff); 3503 bind(Lslow); 3504 mtctr(cnt1); 3505 3506 switch (ae) { 3507 case StrIntrinsicNode::LL: stride1 = 1; stride2 = 1; break; 3508 case StrIntrinsicNode::UL: // fallthru (see comment above) 3509 case StrIntrinsicNode::LU: stride1 = 1; stride2 = 2; break; 3510 case StrIntrinsicNode::UU: stride1 = 2; stride2 = 2; break; 3511 default: ShouldNotReachHere(); break; 3512 } 3513 3514 bind(Lloop); 3515 if (stride1 == 1) { lbz(chr1, 0, str1); } else { lhz(chr1, 0, str1); } 3516 if (stride2 == 1) { lbz(chr2, 0, str2); } else { lhz(chr2, 0, str2); } 3517 subf_(result, chr2, chr1); // result = chr1 - chr2 3518 bne(CCR0, Ldone); 3519 addi(str1, str1, stride1); 3520 addi(str2, str2, stride2); 3521 bdnz(Lloop); 3522 3523 // If strings are equal up to min length, return the length difference. 3524 bind(Lreturn_diff); 3525 mr(result, diff); 3526 3527 // Otherwise, return the difference between the first mismatched chars. 3528 bind(Ldone); 3529 if (ae == StrIntrinsicNode::UL) { 3530 neg(result, result); // Negate result (see note above). 3531 } 3532 } 3533 3534 void MacroAssembler::array_equals(bool is_array_equ, Register ary1, Register ary2, 3535 Register limit, Register tmp1, Register result, bool is_byte) { 3536 const Register tmp0 = R0; 3537 assert_different_registers(ary1, ary2, limit, tmp0, tmp1, result); 3538 Label Ldone, Lskiploop, Lloop, Lfastloop, Lskipfast; 3539 bool limit_needs_shift = false; 3540 3541 if (is_array_equ) { 3542 const int length_offset = arrayOopDesc::length_offset_in_bytes(); 3543 const int base_offset = arrayOopDesc::base_offset_in_bytes(is_byte ? T_BYTE : T_CHAR); 3544 3545 // Return true if the same array. 3546 cmpd(CCR0, ary1, ary2); 3547 beq(CCR0, Lskiploop); 3548 3549 // Return false if one of them is NULL. 3550 cmpdi(CCR0, ary1, 0); 3551 cmpdi(CCR1, ary2, 0); 3552 li(result, 0); 3553 cror(CCR0, Assembler::equal, CCR1, Assembler::equal); 3554 beq(CCR0, Ldone); 3555 3556 // Load the lengths of arrays. 3557 lwz(limit, length_offset, ary1); 3558 lwz(tmp0, length_offset, ary2); 3559 3560 // Return false if the two arrays are not equal length. 3561 cmpw(CCR0, limit, tmp0); 3562 bne(CCR0, Ldone); 3563 3564 // Load array addresses. 3565 addi(ary1, ary1, base_offset); 3566 addi(ary2, ary2, base_offset); 3567 } else { 3568 limit_needs_shift = !is_byte; 3569 li(result, 0); // Assume not equal. 3570 } 3571 3572 // Rename registers 3573 Register chr1 = tmp0; 3574 Register chr2 = tmp1; 3575 3576 // Compare 8 bytes per iteration in fast loop. 3577 const int log2_chars_per_iter = is_byte ? 3 : 2; 3578 3579 srwi_(tmp0, limit, log2_chars_per_iter + (limit_needs_shift ? 1 : 0)); 3580 beq(CCR0, Lskipfast); 3581 mtctr(tmp0); 3582 3583 bind(Lfastloop); 3584 ld(chr1, 0, ary1); 3585 ld(chr2, 0, ary2); 3586 addi(ary1, ary1, 8); 3587 addi(ary2, ary2, 8); 3588 cmpd(CCR0, chr1, chr2); 3589 bne(CCR0, Ldone); 3590 bdnz(Lfastloop); 3591 3592 bind(Lskipfast); 3593 rldicl_(limit, limit, limit_needs_shift ? 64 - 1 : 0, 64 - log2_chars_per_iter); // Remaining characters. 3594 beq(CCR0, Lskiploop); 3595 mtctr(limit); 3596 3597 // Character by character. 3598 bind(Lloop); 3599 if (is_byte) { 3600 lbz(chr1, 0, ary1); 3601 lbz(chr2, 0, ary2); 3602 addi(ary1, ary1, 1); 3603 addi(ary2, ary2, 1); 3604 } else { 3605 lhz(chr1, 0, ary1); 3606 lhz(chr2, 0, ary2); 3607 addi(ary1, ary1, 2); 3608 addi(ary2, ary2, 2); 3609 } 3610 cmpw(CCR0, chr1, chr2); 3611 bne(CCR0, Ldone); 3612 bdnz(Lloop); 3613 3614 bind(Lskiploop); 3615 li(result, 1); // All characters are equal. 3616 bind(Ldone); 3617 } 3618 3619 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt, 3620 Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval, 3621 Register tmp1, Register tmp2, Register tmp3, Register tmp4, int ae) { 3622 3623 // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite! 3624 Label L_TooShort, L_Found, L_NotFound, L_End; 3625 Register last_addr = haycnt, // Kill haycnt at the beginning. 3626 addr = tmp1, 3627 n_start = tmp2, 3628 ch1 = tmp3, 3629 ch2 = R0; 3630 3631 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 3632 const int h_csize = (ae == StrIntrinsicNode::LL) ? 1 : 2; 3633 const int n_csize = (ae == StrIntrinsicNode::UU) ? 2 : 1; 3634 3635 // ************************************************************************************************** 3636 // Prepare for main loop: optimized for needle count >=2, bail out otherwise. 3637 // ************************************************************************************************** 3638 3639 // Compute last haystack addr to use if no match gets found. 3640 clrldi(haycnt, haycnt, 32); // Ensure positive int is valid as 64 bit value. 3641 addi(addr, haystack, -h_csize); // Accesses use pre-increment. 3642 if (needlecntval == 0) { // variable needlecnt 3643 cmpwi(CCR6, needlecnt, 2); 3644 clrldi(needlecnt, needlecnt, 32); // Ensure positive int is valid as 64 bit value. 3645 blt(CCR6, L_TooShort); // Variable needlecnt: handle short needle separately. 3646 } 3647 3648 if (n_csize == 2) { lwz(n_start, 0, needle); } else { lhz(n_start, 0, needle); } // Load first 2 characters of needle. 3649 3650 if (needlecntval == 0) { // variable needlecnt 3651 subf(ch1, needlecnt, haycnt); // Last character index to compare is haycnt-needlecnt. 3652 addi(needlecnt, needlecnt, -2); // Rest of needle. 3653 } else { // constant needlecnt 3654 guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately"); 3655 assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate"); 3656 addi(ch1, haycnt, -needlecntval); // Last character index to compare is haycnt-needlecnt. 3657 if (needlecntval > 3) { li(needlecnt, needlecntval - 2); } // Rest of needle. 3658 } 3659 3660 if (h_csize == 2) { slwi(ch1, ch1, 1); } // Scale to number of bytes. 3661 3662 if (ae ==StrIntrinsicNode::UL) { 3663 srwi(tmp4, n_start, 1*8); // ___0 3664 rlwimi(n_start, tmp4, 2*8, 0, 23); // _0_1 3665 } 3666 3667 add(last_addr, haystack, ch1); // Point to last address to compare (haystack+2*(haycnt-needlecnt)). 3668 3669 // Main Loop (now we have at least 2 characters). 3670 Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2; 3671 bind(L_OuterLoop); // Search for 1st 2 characters. 3672 Register addr_diff = tmp4; 3673 subf(addr_diff, addr, last_addr); // Difference between already checked address and last address to check. 3674 addi(addr, addr, h_csize); // This is the new address we want to use for comparing. 3675 srdi_(ch2, addr_diff, h_csize); 3676 beq(CCR0, L_FinalCheck); // 2 characters left? 3677 mtctr(ch2); // num of characters / 2 3678 bind(L_InnerLoop); // Main work horse (2x unrolled search loop) 3679 if (h_csize == 2) { // Load 2 characters of haystack (ignore alignment). 3680 lwz(ch1, 0, addr); 3681 lwz(ch2, 2, addr); 3682 } else { 3683 lhz(ch1, 0, addr); 3684 lhz(ch2, 1, addr); 3685 } 3686 cmpw(CCR0, ch1, n_start); // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop). 3687 cmpw(CCR1, ch2, n_start); 3688 beq(CCR0, L_Comp1); // Did we find the needle start? 3689 beq(CCR1, L_Comp2); 3690 addi(addr, addr, 2 * h_csize); 3691 bdnz(L_InnerLoop); 3692 bind(L_FinalCheck); 3693 andi_(addr_diff, addr_diff, h_csize); // Remaining characters not covered by InnerLoop: (num of characters) & 1. 3694 beq(CCR0, L_NotFound); 3695 if (h_csize == 2) { lwz(ch1, 0, addr); } else { lhz(ch1, 0, addr); } // One position left at which we have to compare. 3696 cmpw(CCR1, ch1, n_start); 3697 beq(CCR1, L_Comp1); 3698 bind(L_NotFound); 3699 li(result, -1); // not found 3700 b(L_End); 3701 3702 // ************************************************************************************************** 3703 // Special Case: unfortunately, the variable needle case can be called with needlecnt<2 3704 // ************************************************************************************************** 3705 if (needlecntval == 0) { // We have to handle these cases separately. 3706 Label L_OneCharLoop; 3707 bind(L_TooShort); 3708 mtctr(haycnt); 3709 if (n_csize == 2) { lhz(n_start, 0, needle); } else { lbz(n_start, 0, needle); } // First character of needle 3710 bind(L_OneCharLoop); 3711 if (h_csize == 2) { lhzu(ch1, 2, addr); } else { lbzu(ch1, 1, addr); } 3712 cmpw(CCR1, ch1, n_start); 3713 beq(CCR1, L_Found); // Did we find the one character needle? 3714 bdnz(L_OneCharLoop); 3715 li(result, -1); // Not found. 3716 b(L_End); 3717 } 3718 3719 // ************************************************************************************************** 3720 // Regular Case Part II: compare rest of needle (first 2 characters have been compared already) 3721 // ************************************************************************************************** 3722 3723 // Compare the rest 3724 bind(L_Comp2); 3725 addi(addr, addr, h_csize); // First comparison has failed, 2nd one hit. 3726 bind(L_Comp1); // Addr points to possible needle start. 3727 if (needlecntval != 2) { // Const needlecnt==2? 3728 if (needlecntval != 3) { 3729 if (needlecntval == 0) { beq(CCR6, L_Found); } // Variable needlecnt==2? 3730 Register n_ind = tmp4, 3731 h_ind = n_ind; 3732 li(n_ind, 2 * n_csize); // First 2 characters are already compared, use index 2. 3733 mtctr(needlecnt); // Decremented by 2, still > 0. 3734 Label L_CompLoop; 3735 bind(L_CompLoop); 3736 if (ae ==StrIntrinsicNode::UL) { 3737 h_ind = ch1; 3738 sldi(h_ind, n_ind, 1); 3739 } 3740 if (n_csize == 2) { lhzx(ch2, needle, n_ind); } else { lbzx(ch2, needle, n_ind); } 3741 if (h_csize == 2) { lhzx(ch1, addr, h_ind); } else { lbzx(ch1, addr, h_ind); } 3742 cmpw(CCR1, ch1, ch2); 3743 bne(CCR1, L_OuterLoop); 3744 addi(n_ind, n_ind, n_csize); 3745 bdnz(L_CompLoop); 3746 } else { // No loop required if there's only one needle character left. 3747 if (n_csize == 2) { lhz(ch2, 2 * 2, needle); } else { lbz(ch2, 2 * 1, needle); } 3748 if (h_csize == 2) { lhz(ch1, 2 * 2, addr); } else { lbz(ch1, 2 * 1, addr); } 3749 cmpw(CCR1, ch1, ch2); 3750 bne(CCR1, L_OuterLoop); 3751 } 3752 } 3753 // Return index ... 3754 bind(L_Found); 3755 subf(result, haystack, addr); // relative to haystack, ... 3756 if (h_csize == 2) { srdi(result, result, 1); } // in characters. 3757 bind(L_End); 3758 } // string_indexof 3759 3760 void MacroAssembler::string_indexof_char(Register result, Register haystack, Register haycnt, 3761 Register needle, jchar needleChar, Register tmp1, Register tmp2, bool is_byte) { 3762 assert_different_registers(haystack, haycnt, needle, tmp1, tmp2); 3763 3764 Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_NotFound, L_End; 3765 Register addr = tmp1, 3766 ch1 = tmp2, 3767 ch2 = R0; 3768 3769 const int h_csize = is_byte ? 1 : 2; 3770 3771 //4: 3772 srwi_(tmp2, haycnt, 1); // Shift right by exact_log2(UNROLL_FACTOR). 3773 mr(addr, haystack); 3774 beq(CCR0, L_FinalCheck); 3775 mtctr(tmp2); // Move to count register. 3776 //8: 3777 bind(L_InnerLoop); // Main work horse (2x unrolled search loop). 3778 if (!is_byte) { 3779 lhz(ch1, 0, addr); 3780 lhz(ch2, 2, addr); 3781 } else { 3782 lbz(ch1, 0, addr); 3783 lbz(ch2, 1, addr); 3784 } 3785 (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, (unsigned int)needleChar); 3786 (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, (unsigned int)needleChar); 3787 beq(CCR0, L_Found1); // Did we find the needle? 3788 beq(CCR1, L_Found2); 3789 addi(addr, addr, 2 * h_csize); 3790 bdnz(L_InnerLoop); 3791 //16: 3792 bind(L_FinalCheck); 3793 andi_(R0, haycnt, 1); 3794 beq(CCR0, L_NotFound); 3795 if (!is_byte) { lhz(ch1, 0, addr); } else { lbz(ch1, 0, addr); } // One position left at which we have to compare. 3796 (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, (unsigned int)needleChar); 3797 beq(CCR1, L_Found1); 3798 //21: 3799 bind(L_NotFound); 3800 li(result, -1); // Not found. 3801 b(L_End); 3802 3803 bind(L_Found2); 3804 addi(addr, addr, h_csize); 3805 //24: 3806 bind(L_Found1); // Return index ... 3807 subf(result, haystack, addr); // relative to haystack, ... 3808 if (!is_byte) { srdi(result, result, 1); } // in characters. 3809 bind(L_End); 3810 } // string_indexof_char 3811 3812 3813 void MacroAssembler::has_negatives(Register src, Register cnt, Register result, 3814 Register tmp1, Register tmp2) { 3815 const Register tmp0 = R0; 3816 assert_different_registers(src, result, cnt, tmp0, tmp1, tmp2); 3817 Label Lfastloop, Lslow, Lloop, Lnoneg, Ldone; 3818 3819 // Check if cnt >= 8 (= 16 bytes) 3820 lis(tmp1, (int)(short)0x8080); // tmp1 = 0x8080808080808080 3821 srwi_(tmp2, cnt, 4); 3822 li(result, 1); // Assume there's a negative byte. 3823 beq(CCR0, Lslow); 3824 ori(tmp1, tmp1, 0x8080); 3825 rldimi(tmp1, tmp1, 32, 0); 3826 mtctr(tmp2); 3827 3828 // 2x unrolled loop 3829 bind(Lfastloop); 3830 ld(tmp2, 0, src); 3831 ld(tmp0, 8, src); 3832 3833 orr(tmp0, tmp2, tmp0); 3834 3835 and_(tmp0, tmp0, tmp1); 3836 bne(CCR0, Ldone); // Found negative byte. 3837 addi(src, src, 16); 3838 3839 bdnz(Lfastloop); 3840 3841 bind(Lslow); // Fallback to slow version 3842 rldicl_(tmp0, cnt, 0, 64-4); 3843 beq(CCR0, Lnoneg); 3844 mtctr(tmp0); 3845 bind(Lloop); 3846 lbz(tmp0, 0, src); 3847 addi(src, src, 1); 3848 andi_(tmp0, tmp0, 0x80); 3849 bne(CCR0, Ldone); // Found negative byte. 3850 bdnz(Lloop); 3851 bind(Lnoneg); 3852 li(result, 0); 3853 3854 bind(Ldone); 3855 } 3856 3857 #endif // Compiler2 3858 3859 // Helpers for Intrinsic Emitters 3860 // 3861 // Revert the byte order of a 32bit value in a register 3862 // src: 0x44556677 3863 // dst: 0x77665544 3864 // Three steps to obtain the result: 3865 // 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word 3866 // into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared. 3867 // This value initializes dst. 3868 // 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost 3869 // byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go. 3870 // This value is mask inserted into dst with a [0..23] mask of 1s. 3871 // 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position. 3872 // This value is mask inserted into dst with a [8..15] mask of 1s. 3873 void MacroAssembler::load_reverse_32(Register dst, Register src) { 3874 assert_different_registers(dst, src); 3875 3876 rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left. 3877 rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone. 3878 rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone. 3879 } 3880 3881 // Calculate the column addresses of the crc32 lookup table into distinct registers. 3882 // This loop-invariant calculation is moved out of the loop body, reducing the loop 3883 // body size from 20 to 16 instructions. 3884 // Returns the offset that was used to calculate the address of column tc3. 3885 // Due to register shortage, setting tc3 may overwrite table. With the return offset 3886 // at hand, the original table address can be easily reconstructed. 3887 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) { 3888 assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!"); 3889 3890 // Point to 4 byte folding tables (byte-reversed version for Big Endian) 3891 // Layout: See StubRoutines::generate_crc_constants. 3892 #ifdef VM_LITTLE_ENDIAN 3893 const int ix0 = 3 * CRC32_TABLE_SIZE; 3894 const int ix1 = 2 * CRC32_TABLE_SIZE; 3895 const int ix2 = 1 * CRC32_TABLE_SIZE; 3896 const int ix3 = 0 * CRC32_TABLE_SIZE; 3897 #else 3898 const int ix0 = 1 * CRC32_TABLE_SIZE; 3899 const int ix1 = 2 * CRC32_TABLE_SIZE; 3900 const int ix2 = 3 * CRC32_TABLE_SIZE; 3901 const int ix3 = 4 * CRC32_TABLE_SIZE; 3902 #endif 3903 assert_different_registers(table, tc0, tc1, tc2); 3904 assert(table == tc3, "must be!"); 3905 3906 addi(tc0, table, ix0); 3907 addi(tc1, table, ix1); 3908 addi(tc2, table, ix2); 3909 if (ix3 != 0) addi(tc3, table, ix3); 3910 3911 return ix3; 3912 } 3913 3914 /** 3915 * uint32_t crc; 3916 * table[crc & 0xFF] ^ (crc >> 8); 3917 */ 3918 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) { 3919 assert_different_registers(crc, table, tmp); 3920 assert_different_registers(val, table); 3921 3922 if (crc == val) { // Must rotate first to use the unmodified value. 3923 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 3924 // As we use a word (4-byte) instruction, we have to adapt the mask bit positions. 3925 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 3926 } else { 3927 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 3928 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 3929 } 3930 lwzx(tmp, table, tmp); 3931 xorr(crc, crc, tmp); 3932 } 3933 3934 /** 3935 * Emits code to update CRC-32 with a byte value according to constants in table. 3936 * 3937 * @param [in,out]crc Register containing the crc. 3938 * @param [in]val Register containing the byte to fold into the CRC. 3939 * @param [in]table Register containing the table of crc constants. 3940 * 3941 * uint32_t crc; 3942 * val = crc_table[(val ^ crc) & 0xFF]; 3943 * crc = val ^ (crc >> 8); 3944 */ 3945 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 3946 BLOCK_COMMENT("update_byte_crc32:"); 3947 xorr(val, val, crc); 3948 fold_byte_crc32(crc, val, table, val); 3949 } 3950 3951 /** 3952 * @param crc register containing existing CRC (32-bit) 3953 * @param buf register pointing to input byte buffer (byte*) 3954 * @param len register containing number of bytes 3955 * @param table register pointing to CRC table 3956 */ 3957 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table, 3958 Register data, bool loopAlignment) { 3959 assert_different_registers(crc, buf, len, table, data); 3960 3961 Label L_mainLoop, L_done; 3962 const int mainLoop_stepping = 1; 3963 const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4; 3964 3965 // Process all bytes in a single-byte loop. 3966 clrldi_(len, len, 32); // Enforce 32 bit. Anything to do? 3967 beq(CCR0, L_done); 3968 3969 mtctr(len); 3970 align(mainLoop_alignment); 3971 BIND(L_mainLoop); 3972 lbz(data, 0, buf); // Byte from buffer, zero-extended. 3973 addi(buf, buf, mainLoop_stepping); // Advance buffer position. 3974 update_byte_crc32(crc, data, table); 3975 bdnz(L_mainLoop); // Iterate. 3976 3977 bind(L_done); 3978 } 3979 3980 /** 3981 * Emits code to update CRC-32 with a 4-byte value according to constants in table 3982 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c 3983 */ 3984 // A note on the lookup table address(es): 3985 // The implementation uses 4 table columns (byte-reversed versions for Big Endian). 3986 // To save the effort of adding the column offset to the table address each time 3987 // a table element is looked up, it is possible to pass the pre-calculated 3988 // column addresses. 3989 // Uses R9..R12 as work register. Must be saved/restored by caller, if necessary. 3990 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc, 3991 Register t0, Register t1, Register t2, Register t3, 3992 Register tc0, Register tc1, Register tc2, Register tc3) { 3993 assert_different_registers(crc, t3); 3994 3995 // XOR crc with next four bytes of buffer. 3996 lwz(t3, bufDisp, buf); 3997 if (bufInc != 0) { 3998 addi(buf, buf, bufInc); 3999 } 4000 xorr(t3, t3, crc); 4001 4002 // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices. 4003 rlwinm(t0, t3, 2, 24-2, 31-2); // ((t1 >> 0) & 0xff) << 2 4004 rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t1 >> 8) & 0xff) << 2 4005 rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t1 >> 16) & 0xff) << 2 4006 rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t1 >> 24) & 0xff) << 2 4007 4008 // Use the pre-calculated column addresses. 4009 // Load pre-calculated table values. 4010 lwzx(t0, tc0, t0); 4011 lwzx(t1, tc1, t1); 4012 lwzx(t2, tc2, t2); 4013 lwzx(t3, tc3, t3); 4014 4015 // Calculate new crc from table values. 4016 xorr(t0, t0, t1); 4017 xorr(t2, t2, t3); 4018 xorr(crc, t0, t2); // Now crc contains the final checksum value. 4019 } 4020 4021 /** 4022 * @param crc register containing existing CRC (32-bit) 4023 * @param buf register pointing to input byte buffer (byte*) 4024 * @param len register containing number of bytes 4025 * @param table register pointing to CRC table 4026 * 4027 * uses R9..R12 as work register. Must be saved/restored by caller! 4028 */ 4029 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table, 4030 Register t0, Register t1, Register t2, Register t3, 4031 Register tc0, Register tc1, Register tc2, Register tc3, 4032 bool invertCRC) { 4033 assert_different_registers(crc, buf, len, table); 4034 4035 Label L_mainLoop, L_tail; 4036 Register tmp = t0; 4037 Register data = t0; 4038 Register tmp2 = t1; 4039 const int mainLoop_stepping = 4; 4040 const int tailLoop_stepping = 1; 4041 const int log_stepping = exact_log2(mainLoop_stepping); 4042 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32; 4043 const int complexThreshold = 2*mainLoop_stepping; 4044 4045 // Don't test for len <= 0 here. This pathological case should not occur anyway. 4046 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles 4047 // for all well-behaved cases. The situation itself is detected and handled correctly 4048 // within update_byteLoop_crc32. 4049 assert(tailLoop_stepping == 1, "check tailLoop_stepping!"); 4050 4051 BLOCK_COMMENT("kernel_crc32_1word {"); 4052 4053 if (invertCRC) { 4054 nand(crc, crc, crc); // 1s complement of crc 4055 } 4056 4057 // Check for short (<mainLoop_stepping) buffer. 4058 cmpdi(CCR0, len, complexThreshold); 4059 blt(CCR0, L_tail); 4060 4061 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance. 4062 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions. 4063 { 4064 // Align buf addr to mainLoop_stepping boundary. 4065 neg(tmp2, buf); // Calculate # preLoop iterations for alignment. 4066 rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63. 4067 4068 if (complexThreshold > mainLoop_stepping) { 4069 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 4070 } else { 4071 sub(tmp, len, tmp2); // Remaining bytes for main loop. 4072 cmpdi(CCR0, tmp, mainLoop_stepping); 4073 blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing 4074 mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 4075 } 4076 update_byteLoop_crc32(crc, buf, tmp2, table, data, false); 4077 } 4078 4079 srdi(tmp2, len, log_stepping); // #iterations for mainLoop 4080 andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop 4081 mtctr(tmp2); 4082 4083 #ifdef VM_LITTLE_ENDIAN 4084 Register crc_rv = crc; 4085 #else 4086 Register crc_rv = tmp; // Load_reverse needs separate registers to work on. 4087 // Occupies tmp, but frees up crc. 4088 load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data. 4089 tmp = crc; 4090 #endif 4091 4092 int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3); 4093 4094 align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement. 4095 BIND(L_mainLoop); 4096 update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3); 4097 bdnz(L_mainLoop); 4098 4099 #ifndef VM_LITTLE_ENDIAN 4100 load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data. 4101 tmp = crc_rv; // Tmp uses it's original register again. 4102 #endif 4103 4104 // Restore original table address for tailLoop. 4105 if (reconstructTableOffset != 0) { 4106 addi(table, table, -reconstructTableOffset); 4107 } 4108 4109 // Process last few (<complexThreshold) bytes of buffer. 4110 BIND(L_tail); 4111 update_byteLoop_crc32(crc, buf, len, table, data, false); 4112 4113 if (invertCRC) { 4114 nand(crc, crc, crc); // 1s complement of crc 4115 } 4116 BLOCK_COMMENT("} kernel_crc32_1word"); 4117 } 4118 4119 /** 4120 * @param crc register containing existing CRC (32-bit) 4121 * @param buf register pointing to input byte buffer (byte*) 4122 * @param len register containing number of bytes 4123 * @param constants register pointing to precomputed constants 4124 * @param t0-t6 temp registers 4125 */ 4126 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants, 4127 Register t0, Register t1, Register t2, Register t3, 4128 Register t4, Register t5, Register t6, bool invertCRC) { 4129 assert_different_registers(crc, buf, len, constants); 4130 4131 Label L_tail; 4132 4133 BLOCK_COMMENT("kernel_crc32_vpmsum {"); 4134 4135 if (invertCRC) { 4136 nand(crc, crc, crc); // 1s complement of crc 4137 } 4138 4139 // Enforce 32 bit. 4140 clrldi(len, len, 32); 4141 4142 // Align if we have enough bytes for the fast version. 4143 const int alignment = 16, 4144 threshold = 32; 4145 Register prealign = t0; 4146 4147 neg(prealign, buf); 4148 addi(t1, len, -threshold); 4149 andi(prealign, prealign, alignment - 1); 4150 cmpw(CCR0, t1, prealign); 4151 blt(CCR0, L_tail); // len - prealign < threshold? 4152 4153 subf(len, prealign, len); 4154 update_byteLoop_crc32(crc, buf, prealign, constants, t2, false); 4155 4156 // Calculate from first aligned address as far as possible. 4157 addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants. 4158 kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6); 4159 addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again. 4160 4161 // Remaining bytes. 4162 BIND(L_tail); 4163 update_byteLoop_crc32(crc, buf, len, constants, t2, false); 4164 4165 if (invertCRC) { 4166 nand(crc, crc, crc); // 1s complement of crc 4167 } 4168 4169 BLOCK_COMMENT("} kernel_crc32_vpmsum"); 4170 } 4171 4172 /** 4173 * @param crc register containing existing CRC (32-bit) 4174 * @param buf register pointing to input byte buffer (byte*) 4175 * @param len register containing number of bytes (will get updated to remaining bytes) 4176 * @param constants register pointing to CRC table for 128-bit aligned memory 4177 * @param t0-t6 temp registers 4178 */ 4179 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants, 4180 Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) { 4181 4182 // Save non-volatile vector registers (frameless). 4183 Register offset = t1; 4184 int offsetInt = 0; 4185 offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP); 4186 offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP); 4187 offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP); 4188 offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP); 4189 offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP); 4190 offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP); 4191 #ifndef VM_LITTLE_ENDIAN 4192 offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP); 4193 #endif 4194 offsetInt -= 8; std(R14, offsetInt, R1_SP); 4195 offsetInt -= 8; std(R15, offsetInt, R1_SP); 4196 4197 // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor 4198 // bytes per iteration. The basic scheme is: 4199 // lvx: load vector (Big Endian needs reversal) 4200 // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift 4201 // vxor: xor partial results together to get unroll_factor2 vectors 4202 4203 // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors. 4204 4205 // Using 16 * unroll_factor / unroll_factor_2 bytes for constants. 4206 const int unroll_factor = CRC32_UNROLL_FACTOR, 4207 unroll_factor2 = CRC32_UNROLL_FACTOR2; 4208 4209 const int outer_consts_size = (unroll_factor2 - 1) * 16, 4210 inner_consts_size = (unroll_factor / unroll_factor2) * 16; 4211 4212 // Support registers. 4213 Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 }; 4214 Register num_bytes = R14, 4215 loop_count = R15, 4216 cur_const = crc; // will live in VCRC 4217 // Constant array for outer loop: unroll_factor2 - 1 registers, 4218 // Constant array for inner loop: unroll_factor / unroll_factor2 registers. 4219 VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 }, 4220 consts1[] = { VR23, VR24 }; 4221 // Data register arrays: 2 arrays with unroll_factor2 registers. 4222 VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 }, 4223 data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 }; 4224 4225 VectorRegister VCRC = data0[0]; 4226 VectorRegister Vc = VR25; 4227 VectorRegister swap_bytes = VR26; // Only for Big Endian. 4228 4229 // We have at least 1 iteration (ensured by caller). 4230 Label L_outer_loop, L_inner_loop, L_last; 4231 4232 // If supported set DSCR pre-fetch to deepest. 4233 if (VM_Version::has_mfdscr()) { 4234 load_const_optimized(t0, VM_Version::_dscr_val | 7); 4235 mtdscr(t0); 4236 } 4237 4238 mtvrwz(VCRC, crc); // crc lives in VCRC, now 4239 4240 for (int i = 1; i < unroll_factor2; ++i) { 4241 li(offs[i], 16 * i); 4242 } 4243 4244 // Load consts for outer loop 4245 lvx(consts0[0], constants); 4246 for (int i = 1; i < unroll_factor2 - 1; ++i) { 4247 lvx(consts0[i], offs[i], constants); 4248 } 4249 4250 load_const_optimized(num_bytes, 16 * unroll_factor); 4251 4252 // Reuse data registers outside of the loop. 4253 VectorRegister Vtmp = data1[0]; 4254 VectorRegister Vtmp2 = data1[1]; 4255 VectorRegister zeroes = data1[2]; 4256 4257 vspltisb(Vtmp, 0); 4258 vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC. 4259 4260 // Load vector for vpermxor (to xor both 64 bit parts together) 4261 lvsl(Vtmp, buf); // 000102030405060708090a0b0c0d0e0f 4262 vspltisb(Vc, 4); 4263 vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0 4264 xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0); 4265 vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f 4266 4267 #ifdef VM_LITTLE_ENDIAN 4268 #define BE_swap_bytes(x) 4269 #else 4270 vspltisb(Vtmp2, 0xf); 4271 vxor(swap_bytes, Vtmp, Vtmp2); 4272 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes) 4273 #endif 4274 4275 cmpd(CCR0, len, num_bytes); 4276 blt(CCR0, L_last); 4277 4278 addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop 4279 load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off. 4280 4281 // ********** Main loop start ********** 4282 align(32); 4283 bind(L_outer_loop); 4284 4285 // Begin of unrolled first iteration (no xor). 4286 lvx(data1[0], buf); 4287 for (int i = 1; i < unroll_factor2 / 2; ++i) { 4288 lvx(data1[i], offs[i], buf); 4289 } 4290 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 4291 lvx(consts1[0], cur_const); 4292 mtctr(loop_count); 4293 for (int i = 0; i < unroll_factor2 / 2; ++i) { 4294 BE_swap_bytes(data1[i]); 4295 if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC. 4296 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 4297 vpmsumw(data0[i], data1[i], consts1[0]); 4298 } 4299 addi(buf, buf, 16 * unroll_factor2); 4300 subf(len, num_bytes, len); 4301 lvx(consts1[1], offs[1], cur_const); 4302 addi(cur_const, cur_const, 32); 4303 // Begin of unrolled second iteration (head). 4304 for (int i = 0; i < unroll_factor2 / 2; ++i) { 4305 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 4306 if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); } 4307 vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]); 4308 } 4309 for (int i = 0; i < unroll_factor2 / 2; ++i) { 4310 BE_swap_bytes(data1[i]); 4311 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 4312 vpmsumw(data1[i], data1[i], consts1[1]); 4313 } 4314 addi(buf, buf, 16 * unroll_factor2); 4315 4316 // Generate most performance relevant code. Loads + half of the vpmsumw have been generated. 4317 // Double-iteration allows using the 2 constant registers alternatingly. 4318 align(32); 4319 bind(L_inner_loop); 4320 for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling. 4321 if (j & 1) { 4322 lvx(consts1[0], cur_const); 4323 } else { 4324 lvx(consts1[1], offs[1], cur_const); 4325 addi(cur_const, cur_const, 32); 4326 } 4327 for (int i = 0; i < unroll_factor2; ++i) { 4328 int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input. 4329 if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; } 4330 BE_swap_bytes(data1[idx]); 4331 vxor(data0[i], data0[i], data1[i]); 4332 if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf); 4333 vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]); 4334 } 4335 addi(buf, buf, 16 * unroll_factor2); 4336 } 4337 bdnz(L_inner_loop); 4338 4339 addi(cur_const, constants, outer_consts_size); // Reset 4340 4341 // Tail of last iteration (no loads). 4342 for (int i = 0; i < unroll_factor2 / 2; ++i) { 4343 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 4344 vxor(data0[i], data0[i], data1[i]); 4345 vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]); 4346 } 4347 for (int i = 0; i < unroll_factor2 / 2; ++i) { 4348 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts. 4349 vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]); 4350 } 4351 4352 // Last data register is ok, other ones need fixup shift. 4353 for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) { 4354 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); 4355 } 4356 4357 // Combine to 128 bit result vector VCRC = data0[0]. 4358 for (int i = 1; i < unroll_factor2; i<<=1) { 4359 for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) { 4360 vxor(data0[j], data0[j], data0[j+i]); 4361 } 4362 } 4363 cmpd(CCR0, len, num_bytes); 4364 bge(CCR0, L_outer_loop); 4365 4366 // Last chance with lower num_bytes. 4367 bind(L_last); 4368 srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations. 4369 // Point behind last const for inner loop. 4370 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size); 4371 sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used. 4372 clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2)); 4373 subf(cur_const, R0, cur_const); // Point to constant to be used first. 4374 4375 addic_(loop_count, loop_count, -1); // One double-iteration peeled off. 4376 bgt(CCR0, L_outer_loop); 4377 // ********** Main loop end ********** 4378 4379 // Restore DSCR pre-fetch value. 4380 if (VM_Version::has_mfdscr()) { 4381 load_const_optimized(t0, VM_Version::_dscr_val); 4382 mtdscr(t0); 4383 } 4384 4385 // ********** Simple loop for remaining 16 byte blocks ********** 4386 { 4387 Label L_loop, L_done; 4388 4389 srdi_(t0, len, 4); // 16 bytes per iteration 4390 clrldi(len, len, 64-4); 4391 beq(CCR0, L_done); 4392 4393 // Point to const (same as last const for inner loop). 4394 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16); 4395 mtctr(t0); 4396 lvx(Vtmp2, cur_const); 4397 4398 align(32); 4399 bind(L_loop); 4400 4401 lvx(Vtmp, buf); 4402 addi(buf, buf, 16); 4403 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 4404 BE_swap_bytes(Vtmp); 4405 vxor(VCRC, VCRC, Vtmp); 4406 vpmsumw(VCRC, VCRC, Vtmp2); 4407 bdnz(L_loop); 4408 4409 bind(L_done); 4410 } 4411 // ********** Simple loop end ********** 4412 #undef BE_swap_bytes 4413 4414 // Point to Barrett constants 4415 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size); 4416 4417 vspltisb(zeroes, 0); 4418 4419 // Combine to 64 bit result. 4420 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 4421 4422 // Reduce to 32 bit CRC: Remainder by multiply-high. 4423 lvx(Vtmp, cur_const); 4424 vsldoi(Vtmp2, zeroes, VCRC, 12); // Extract high 32 bit. 4425 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply by inverse long poly. 4426 vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit. 4427 vsldoi(Vtmp, zeroes, Vtmp, 8); 4428 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply quotient by long poly. 4429 vxor(VCRC, VCRC, Vtmp2); // Remainder fits into 32 bit. 4430 4431 // Move result. len is already updated. 4432 vsldoi(VCRC, VCRC, zeroes, 8); 4433 mfvrd(crc, VCRC); 4434 4435 // Restore non-volatile Vector registers (frameless). 4436 offsetInt = 0; 4437 offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP); 4438 offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP); 4439 offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP); 4440 offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP); 4441 offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP); 4442 offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP); 4443 #ifndef VM_LITTLE_ENDIAN 4444 offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP); 4445 #endif 4446 offsetInt -= 8; ld(R14, offsetInt, R1_SP); 4447 offsetInt -= 8; ld(R15, offsetInt, R1_SP); 4448 } 4449 4450 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2, 4451 Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) { 4452 load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr() 4453 : StubRoutines::crc_table_addr() , R0); 4454 4455 if (VM_Version::has_vpmsumb()) { 4456 kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c); 4457 } else { 4458 kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c); 4459 } 4460 } 4461 4462 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) { 4463 assert_different_registers(crc, val, table); 4464 4465 BLOCK_COMMENT("kernel_crc32_singleByteReg:"); 4466 if (invertCRC) { 4467 nand(crc, crc, crc); // 1s complement of crc 4468 } 4469 4470 update_byte_crc32(crc, val, table); 4471 4472 if (invertCRC) { 4473 nand(crc, crc, crc); // 1s complement of crc 4474 } 4475 } 4476 4477 // dest_lo += src1 + src2 4478 // dest_hi += carry1 + carry2 4479 void MacroAssembler::add2_with_carry(Register dest_hi, 4480 Register dest_lo, 4481 Register src1, Register src2) { 4482 li(R0, 0); 4483 addc(dest_lo, dest_lo, src1); 4484 adde(dest_hi, dest_hi, R0); 4485 addc(dest_lo, dest_lo, src2); 4486 adde(dest_hi, dest_hi, R0); 4487 } 4488 4489 // Multiply 64 bit by 64 bit first loop. 4490 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, 4491 Register x_xstart, 4492 Register y, Register y_idx, 4493 Register z, 4494 Register carry, 4495 Register product_high, Register product, 4496 Register idx, Register kdx, 4497 Register tmp) { 4498 // jlong carry, x[], y[], z[]; 4499 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) { 4500 // huge_128 product = y[idx] * x[xstart] + carry; 4501 // z[kdx] = (jlong)product; 4502 // carry = (jlong)(product >>> 64); 4503 // } 4504 // z[xstart] = carry; 4505 4506 Label L_first_loop, L_first_loop_exit; 4507 Label L_one_x, L_one_y, L_multiply; 4508 4509 addic_(xstart, xstart, -1); 4510 blt(CCR0, L_one_x); // Special case: length of x is 1. 4511 4512 // Load next two integers of x. 4513 sldi(tmp, xstart, LogBytesPerInt); 4514 ldx(x_xstart, x, tmp); 4515 #ifdef VM_LITTLE_ENDIAN 4516 rldicl(x_xstart, x_xstart, 32, 0); 4517 #endif 4518 4519 align(32, 16); 4520 bind(L_first_loop); 4521 4522 cmpdi(CCR0, idx, 1); 4523 blt(CCR0, L_first_loop_exit); 4524 addi(idx, idx, -2); 4525 beq(CCR0, L_one_y); 4526 4527 // Load next two integers of y. 4528 sldi(tmp, idx, LogBytesPerInt); 4529 ldx(y_idx, y, tmp); 4530 #ifdef VM_LITTLE_ENDIAN 4531 rldicl(y_idx, y_idx, 32, 0); 4532 #endif 4533 4534 4535 bind(L_multiply); 4536 multiply64(product_high, product, x_xstart, y_idx); 4537 4538 li(tmp, 0); 4539 addc(product, product, carry); // Add carry to result. 4540 adde(product_high, product_high, tmp); // Add carry of the last addition. 4541 addi(kdx, kdx, -2); 4542 4543 // Store result. 4544 #ifdef VM_LITTLE_ENDIAN 4545 rldicl(product, product, 32, 0); 4546 #endif 4547 sldi(tmp, kdx, LogBytesPerInt); 4548 stdx(product, z, tmp); 4549 mr_if_needed(carry, product_high); 4550 b(L_first_loop); 4551 4552 4553 bind(L_one_y); // Load one 32 bit portion of y as (0,value). 4554 4555 lwz(y_idx, 0, y); 4556 b(L_multiply); 4557 4558 4559 bind(L_one_x); // Load one 32 bit portion of x as (0,value). 4560 4561 lwz(x_xstart, 0, x); 4562 b(L_first_loop); 4563 4564 bind(L_first_loop_exit); 4565 } 4566 4567 // Multiply 64 bit by 64 bit and add 128 bit. 4568 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, 4569 Register z, Register yz_idx, 4570 Register idx, Register carry, 4571 Register product_high, Register product, 4572 Register tmp, int offset) { 4573 4574 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry; 4575 // z[kdx] = (jlong)product; 4576 4577 sldi(tmp, idx, LogBytesPerInt); 4578 if (offset) { 4579 addi(tmp, tmp, offset); 4580 } 4581 ldx(yz_idx, y, tmp); 4582 #ifdef VM_LITTLE_ENDIAN 4583 rldicl(yz_idx, yz_idx, 32, 0); 4584 #endif 4585 4586 multiply64(product_high, product, x_xstart, yz_idx); 4587 ldx(yz_idx, z, tmp); 4588 #ifdef VM_LITTLE_ENDIAN 4589 rldicl(yz_idx, yz_idx, 32, 0); 4590 #endif 4591 4592 add2_with_carry(product_high, product, carry, yz_idx); 4593 4594 sldi(tmp, idx, LogBytesPerInt); 4595 if (offset) { 4596 addi(tmp, tmp, offset); 4597 } 4598 #ifdef VM_LITTLE_ENDIAN 4599 rldicl(product, product, 32, 0); 4600 #endif 4601 stdx(product, z, tmp); 4602 } 4603 4604 // Multiply 128 bit by 128 bit. Unrolled inner loop. 4605 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, 4606 Register y, Register z, 4607 Register yz_idx, Register idx, Register carry, 4608 Register product_high, Register product, 4609 Register carry2, Register tmp) { 4610 4611 // jlong carry, x[], y[], z[]; 4612 // int kdx = ystart+1; 4613 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 4614 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry; 4615 // z[kdx+idx+1] = (jlong)product; 4616 // jlong carry2 = (jlong)(product >>> 64); 4617 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2; 4618 // z[kdx+idx] = (jlong)product; 4619 // carry = (jlong)(product >>> 64); 4620 // } 4621 // idx += 2; 4622 // if (idx > 0) { 4623 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry; 4624 // z[kdx+idx] = (jlong)product; 4625 // carry = (jlong)(product >>> 64); 4626 // } 4627 4628 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 4629 const Register jdx = R0; 4630 4631 // Scale the index. 4632 srdi_(jdx, idx, 2); 4633 beq(CCR0, L_third_loop_exit); 4634 mtctr(jdx); 4635 4636 align(32, 16); 4637 bind(L_third_loop); 4638 4639 addi(idx, idx, -4); 4640 4641 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8); 4642 mr_if_needed(carry2, product_high); 4643 4644 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0); 4645 mr_if_needed(carry, product_high); 4646 bdnz(L_third_loop); 4647 4648 bind(L_third_loop_exit); // Handle any left-over operand parts. 4649 4650 andi_(idx, idx, 0x3); 4651 beq(CCR0, L_post_third_loop_done); 4652 4653 Label L_check_1; 4654 4655 addic_(idx, idx, -2); 4656 blt(CCR0, L_check_1); 4657 4658 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0); 4659 mr_if_needed(carry, product_high); 4660 4661 bind(L_check_1); 4662 4663 addi(idx, idx, 0x2); 4664 andi_(idx, idx, 0x1); 4665 addic_(idx, idx, -1); 4666 blt(CCR0, L_post_third_loop_done); 4667 4668 sldi(tmp, idx, LogBytesPerInt); 4669 lwzx(yz_idx, y, tmp); 4670 multiply64(product_high, product, x_xstart, yz_idx); 4671 lwzx(yz_idx, z, tmp); 4672 4673 add2_with_carry(product_high, product, yz_idx, carry); 4674 4675 sldi(tmp, idx, LogBytesPerInt); 4676 stwx(product, z, tmp); 4677 srdi(product, product, 32); 4678 4679 sldi(product_high, product_high, 32); 4680 orr(product, product, product_high); 4681 mr_if_needed(carry, product); 4682 4683 bind(L_post_third_loop_done); 4684 } // multiply_128_x_128_loop 4685 4686 void MacroAssembler::muladd(Register out, Register in, 4687 Register offset, Register len, Register k, 4688 Register tmp1, Register tmp2, Register carry) { 4689 4690 // Labels 4691 Label LOOP, SKIP; 4692 4693 // Make sure length is positive. 4694 cmpdi (CCR0, len, 0); 4695 4696 // Prepare variables 4697 subi (offset, offset, 4); 4698 li (carry, 0); 4699 ble (CCR0, SKIP); 4700 4701 mtctr (len); 4702 subi (len, len, 1 ); 4703 sldi (len, len, 2 ); 4704 4705 // Main loop 4706 bind(LOOP); 4707 lwzx (tmp1, len, in ); 4708 lwzx (tmp2, offset, out ); 4709 mulld (tmp1, tmp1, k ); 4710 add (tmp2, carry, tmp2 ); 4711 add (tmp2, tmp1, tmp2 ); 4712 stwx (tmp2, offset, out ); 4713 srdi (carry, tmp2, 32 ); 4714 subi (offset, offset, 4 ); 4715 subi (len, len, 4 ); 4716 bdnz (LOOP); 4717 bind(SKIP); 4718 } 4719 4720 void MacroAssembler::multiply_to_len(Register x, Register xlen, 4721 Register y, Register ylen, 4722 Register z, Register zlen, 4723 Register tmp1, Register tmp2, 4724 Register tmp3, Register tmp4, 4725 Register tmp5, Register tmp6, 4726 Register tmp7, Register tmp8, 4727 Register tmp9, Register tmp10, 4728 Register tmp11, Register tmp12, 4729 Register tmp13) { 4730 4731 ShortBranchVerifier sbv(this); 4732 4733 assert_different_registers(x, xlen, y, ylen, z, zlen, 4734 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6); 4735 assert_different_registers(x, xlen, y, ylen, z, zlen, 4736 tmp1, tmp2, tmp3, tmp4, tmp5, tmp7); 4737 assert_different_registers(x, xlen, y, ylen, z, zlen, 4738 tmp1, tmp2, tmp3, tmp4, tmp5, tmp8); 4739 4740 const Register idx = tmp1; 4741 const Register kdx = tmp2; 4742 const Register xstart = tmp3; 4743 4744 const Register y_idx = tmp4; 4745 const Register carry = tmp5; 4746 const Register product = tmp6; 4747 const Register product_high = tmp7; 4748 const Register x_xstart = tmp8; 4749 const Register tmp = tmp9; 4750 4751 // First Loop. 4752 // 4753 // final static long LONG_MASK = 0xffffffffL; 4754 // int xstart = xlen - 1; 4755 // int ystart = ylen - 1; 4756 // long carry = 0; 4757 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) { 4758 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry; 4759 // z[kdx] = (int)product; 4760 // carry = product >>> 32; 4761 // } 4762 // z[xstart] = (int)carry; 4763 4764 mr_if_needed(idx, ylen); // idx = ylen 4765 mr_if_needed(kdx, zlen); // kdx = xlen + ylen 4766 li(carry, 0); // carry = 0 4767 4768 Label L_done; 4769 4770 addic_(xstart, xlen, -1); 4771 blt(CCR0, L_done); 4772 4773 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, 4774 carry, product_high, product, idx, kdx, tmp); 4775 4776 Label L_second_loop; 4777 4778 cmpdi(CCR0, kdx, 0); 4779 beq(CCR0, L_second_loop); 4780 4781 Label L_carry; 4782 4783 addic_(kdx, kdx, -1); 4784 beq(CCR0, L_carry); 4785 4786 // Store lower 32 bits of carry. 4787 sldi(tmp, kdx, LogBytesPerInt); 4788 stwx(carry, z, tmp); 4789 srdi(carry, carry, 32); 4790 addi(kdx, kdx, -1); 4791 4792 4793 bind(L_carry); 4794 4795 // Store upper 32 bits of carry. 4796 sldi(tmp, kdx, LogBytesPerInt); 4797 stwx(carry, z, tmp); 4798 4799 // Second and third (nested) loops. 4800 // 4801 // for (int i = xstart-1; i >= 0; i--) { // Second loop 4802 // carry = 0; 4803 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop 4804 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) + 4805 // (z[k] & LONG_MASK) + carry; 4806 // z[k] = (int)product; 4807 // carry = product >>> 32; 4808 // } 4809 // z[i] = (int)carry; 4810 // } 4811 // 4812 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx 4813 4814 bind(L_second_loop); 4815 4816 li(carry, 0); // carry = 0; 4817 4818 addic_(xstart, xstart, -1); // i = xstart-1; 4819 blt(CCR0, L_done); 4820 4821 Register zsave = tmp10; 4822 4823 mr(zsave, z); 4824 4825 4826 Label L_last_x; 4827 4828 sldi(tmp, xstart, LogBytesPerInt); 4829 add(z, z, tmp); // z = z + k - j 4830 addi(z, z, 4); 4831 addic_(xstart, xstart, -1); // i = xstart-1; 4832 blt(CCR0, L_last_x); 4833 4834 sldi(tmp, xstart, LogBytesPerInt); 4835 ldx(x_xstart, x, tmp); 4836 #ifdef VM_LITTLE_ENDIAN 4837 rldicl(x_xstart, x_xstart, 32, 0); 4838 #endif 4839 4840 4841 Label L_third_loop_prologue; 4842 4843 bind(L_third_loop_prologue); 4844 4845 Register xsave = tmp11; 4846 Register xlensave = tmp12; 4847 Register ylensave = tmp13; 4848 4849 mr(xsave, x); 4850 mr(xlensave, xstart); 4851 mr(ylensave, ylen); 4852 4853 4854 multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen, 4855 carry, product_high, product, x, tmp); 4856 4857 mr(z, zsave); 4858 mr(x, xsave); 4859 mr(xlen, xlensave); // This is the decrement of the loop counter! 4860 mr(ylen, ylensave); 4861 4862 addi(tmp3, xlen, 1); 4863 sldi(tmp, tmp3, LogBytesPerInt); 4864 stwx(carry, z, tmp); 4865 addic_(tmp3, tmp3, -1); 4866 blt(CCR0, L_done); 4867 4868 srdi(carry, carry, 32); 4869 sldi(tmp, tmp3, LogBytesPerInt); 4870 stwx(carry, z, tmp); 4871 b(L_second_loop); 4872 4873 // Next infrequent code is moved outside loops. 4874 bind(L_last_x); 4875 4876 lwz(x_xstart, 0, x); 4877 b(L_third_loop_prologue); 4878 4879 bind(L_done); 4880 } // multiply_to_len 4881 4882 void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) { 4883 #ifdef ASSERT 4884 Label ok; 4885 if (check_equal) { 4886 beq(CCR0, ok); 4887 } else { 4888 bne(CCR0, ok); 4889 } 4890 stop(msg, id); 4891 bind(ok); 4892 #endif 4893 } 4894 4895 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset, 4896 Register mem_base, const char* msg, int id) { 4897 #ifdef ASSERT 4898 switch (size) { 4899 case 4: 4900 lwz(R0, mem_offset, mem_base); 4901 cmpwi(CCR0, R0, 0); 4902 break; 4903 case 8: 4904 ld(R0, mem_offset, mem_base); 4905 cmpdi(CCR0, R0, 0); 4906 break; 4907 default: 4908 ShouldNotReachHere(); 4909 } 4910 asm_assert(check_equal, msg, id); 4911 #endif // ASSERT 4912 } 4913 4914 void MacroAssembler::verify_thread() { 4915 if (VerifyThread) { 4916 unimplemented("'VerifyThread' currently not implemented on PPC"); 4917 } 4918 } 4919 4920 // READ: oop. KILL: R0. Volatile floats perhaps. 4921 void MacroAssembler::verify_oop(Register oop, const char* msg) { 4922 if (!VerifyOops) { 4923 return; 4924 } 4925 4926 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); 4927 const Register tmp = R11; // Will be preserved. 4928 const int nbytes_save = MacroAssembler::num_volatile_regs * 8; 4929 save_volatile_gprs(R1_SP, -nbytes_save); // except R0 4930 4931 mr_if_needed(R4_ARG2, oop); 4932 save_LR_CR(tmp); // save in old frame 4933 push_frame_reg_args(nbytes_save, tmp); 4934 // load FunctionDescriptor** / entry_address * 4935 load_const_optimized(tmp, fd, R0); 4936 // load FunctionDescriptor* / entry_address 4937 ld(tmp, 0, tmp); 4938 load_const_optimized(R3_ARG1, (address)msg, R0); 4939 // Call destination for its side effect. 4940 call_c(tmp); 4941 4942 pop_frame(); 4943 restore_LR_CR(tmp); 4944 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 4945 } 4946 4947 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) { 4948 if (!VerifyOops) { 4949 return; 4950 } 4951 4952 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); 4953 const Register tmp = R11; // Will be preserved. 4954 const int nbytes_save = MacroAssembler::num_volatile_regs * 8; 4955 save_volatile_gprs(R1_SP, -nbytes_save); // except R0 4956 4957 ld(R4_ARG2, offs, base); 4958 save_LR_CR(tmp); // save in old frame 4959 push_frame_reg_args(nbytes_save, tmp); 4960 // load FunctionDescriptor** / entry_address * 4961 load_const_optimized(tmp, fd, R0); 4962 // load FunctionDescriptor* / entry_address 4963 ld(tmp, 0, tmp); 4964 load_const_optimized(R3_ARG1, (address)msg, R0); 4965 // Call destination for its side effect. 4966 call_c(tmp); 4967 4968 pop_frame(); 4969 restore_LR_CR(tmp); 4970 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 4971 } 4972 4973 const char* stop_types[] = { 4974 "stop", 4975 "untested", 4976 "unimplemented", 4977 "shouldnotreachhere" 4978 }; 4979 4980 static void stop_on_request(int tp, const char* msg) { 4981 tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg); 4982 guarantee(false, "PPC assembly code requires stop: %s", msg); 4983 } 4984 4985 // Call a C-function that prints output. 4986 void MacroAssembler::stop(int type, const char* msg, int id) { 4987 #ifndef PRODUCT 4988 block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg)); 4989 #else 4990 block_comment("stop {"); 4991 #endif 4992 4993 // setup arguments 4994 load_const_optimized(R3_ARG1, type); 4995 load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0); 4996 call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2); 4997 illtrap(); 4998 emit_int32(id); 4999 block_comment("} stop;"); 5000 } 5001 5002 #ifndef PRODUCT 5003 // Write pattern 0x0101010101010101 in memory region [low-before, high+after]. 5004 // Val, addr are temp registers. 5005 // If low == addr, addr is killed. 5006 // High is preserved. 5007 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) { 5008 if (!ZapMemory) return; 5009 5010 assert_different_registers(low, val); 5011 5012 BLOCK_COMMENT("zap memory region {"); 5013 load_const_optimized(val, 0x0101010101010101); 5014 int size = before + after; 5015 if (low == high && size < 5 && size > 0) { 5016 int offset = -before*BytesPerWord; 5017 for (int i = 0; i < size; ++i) { 5018 std(val, offset, low); 5019 offset += (1*BytesPerWord); 5020 } 5021 } else { 5022 addi(addr, low, -before*BytesPerWord); 5023 assert_different_registers(high, val); 5024 if (after) addi(high, high, after * BytesPerWord); 5025 Label loop; 5026 bind(loop); 5027 std(val, 0, addr); 5028 addi(addr, addr, 8); 5029 cmpd(CCR6, addr, high); 5030 ble(CCR6, loop); 5031 if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value. 5032 } 5033 BLOCK_COMMENT("} zap memory region"); 5034 } 5035 5036 #endif // !PRODUCT 5037 5038 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp, 5039 const bool* flag_addr, Label& label) { 5040 int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true); 5041 assert(sizeof(bool) == 1, "PowerPC ABI"); 5042 masm->lbz(temp, simm16_offset, temp); 5043 masm->cmpwi(CCR0, temp, 0); 5044 masm->beq(CCR0, label); 5045 } 5046 5047 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() { 5048 skip_to_label_if_equal_zero(masm, temp, flag_addr, _label); 5049 } 5050 5051 SkipIfEqualZero::~SkipIfEqualZero() { 5052 _masm->bind(_label); 5053 }