1 /* 2 * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved. 3 * Copyright 2012, 2015 SAP AG. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 
23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.inline.hpp" 28 #include "compiler/disassembler.hpp" 29 #include "gc/shared/cardTableModRefBS.hpp" 30 #include "gc/shared/collectedHeap.inline.hpp" 31 #include "interpreter/interpreter.hpp" 32 #include "memory/resourceArea.hpp" 33 #include "prims/methodHandles.hpp" 34 #include "runtime/biasedLocking.hpp" 35 #include "runtime/icache.hpp" 36 #include "runtime/interfaceSupport.hpp" 37 #include "runtime/objectMonitor.hpp" 38 #include "runtime/os.hpp" 39 #include "runtime/sharedRuntime.hpp" 40 #include "runtime/stubRoutines.hpp" 41 #include "utilities/macros.hpp" 42 #if INCLUDE_ALL_GCS 43 #include "gc/g1/g1CollectedHeap.inline.hpp" 44 #include "gc/g1/g1SATBCardTableModRefBS.hpp" 45 #include "gc/g1/heapRegion.hpp" 46 #endif // INCLUDE_ALL_GCS 47 48 #ifdef PRODUCT 49 #define BLOCK_COMMENT(str) // nothing 50 #else 51 #define BLOCK_COMMENT(str) block_comment(str) 52 #endif 53 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 54 55 #ifdef ASSERT 56 // On RISC, there's no benefit to verifying instruction boundaries. 
57 bool AbstractAssembler::pd_check_instruction_mark() { return false; } 58 #endif 59 60 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) { 61 assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range"); 62 if (Assembler::is_simm(si31, 16)) { 63 ld(d, si31, a); 64 if (emit_filler_nop) nop(); 65 } else { 66 const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31); 67 const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31); 68 addis(d, a, hi); 69 ld(d, lo, d); 70 } 71 } 72 73 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) { 74 assert_different_registers(d, a); 75 ld_largeoffset_unchecked(d, si31, a, emit_filler_nop); 76 } 77 78 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base, 79 size_t size_in_bytes, bool is_signed) { 80 switch (size_in_bytes) { 81 case 8: ld(dst, offs, base); break; 82 case 4: is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break; 83 case 2: is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break; 84 case 1: lbz(dst, offs, base); if (is_signed) extsb(dst, dst); break; // lba doesn't exist :( 85 default: ShouldNotReachHere(); 86 } 87 } 88 89 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base, 90 size_t size_in_bytes) { 91 switch (size_in_bytes) { 92 case 8: std(dst, offs, base); break; 93 case 4: stw(dst, offs, base); break; 94 case 2: sth(dst, offs, base); break; 95 case 1: stb(dst, offs, base); break; 96 default: ShouldNotReachHere(); 97 } 98 } 99 100 void MacroAssembler::align(int modulus, int max, int rem) { 101 int padding = (rem + modulus - (offset() % modulus)) % modulus; 102 if (padding > max) return; 103 for (int c = (padding >> 2); c > 0; --c) { nop(); } 104 } 105 106 // Issue instructions that calculate given TOC from global TOC. 
107 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16, 108 bool add_relocation, bool emit_dummy_addr) { 109 int offset = -1; 110 if (emit_dummy_addr) { 111 offset = -128; // dummy address 112 } else if (addr != (address)(intptr_t)-1) { 113 offset = MacroAssembler::offset_to_global_toc(addr); 114 } 115 116 if (hi16) { 117 addis(dst, R29, MacroAssembler::largeoffset_si16_si16_hi(offset)); 118 } 119 if (lo16) { 120 if (add_relocation) { 121 // Relocate at the addi to avoid confusion with a load from the method's TOC. 122 relocate(internal_word_Relocation::spec(addr)); 123 } 124 addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset)); 125 } 126 } 127 128 int MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) { 129 const int offset = MacroAssembler::offset_to_global_toc(addr); 130 131 const address inst2_addr = a; 132 const int inst2 = *(int *)inst2_addr; 133 134 // The relocation points to the second instruction, the addi, 135 // and the addi reads and writes the same register dst. 136 const int dst = inv_rt_field(inst2); 137 assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst"); 138 139 // Now, find the preceding addis which writes to dst. 140 int inst1 = 0; 141 address inst1_addr = inst2_addr - BytesPerInstWord; 142 while (inst1_addr >= bound) { 143 inst1 = *(int *) inst1_addr; 144 if (is_addis(inst1) && inv_rt_field(inst1) == dst) { 145 // Stop, found the addis which writes dst. 
146 break; 147 } 148 inst1_addr -= BytesPerInstWord; 149 } 150 151 assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC"); 152 set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset)); 153 set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset)); 154 return (int)((intptr_t)addr - (intptr_t)inst1_addr); 155 } 156 157 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) { 158 const address inst2_addr = a; 159 const int inst2 = *(int *)inst2_addr; 160 161 // The relocation points to the second instruction, the addi, 162 // and the addi reads and writes the same register dst. 163 const int dst = inv_rt_field(inst2); 164 assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst"); 165 166 // Now, find the preceding addis which writes to dst. 167 int inst1 = 0; 168 address inst1_addr = inst2_addr - BytesPerInstWord; 169 while (inst1_addr >= bound) { 170 inst1 = *(int *) inst1_addr; 171 if (is_addis(inst1) && inv_rt_field(inst1) == dst) { 172 // stop, found the addis which writes dst 173 break; 174 } 175 inst1_addr -= BytesPerInstWord; 176 } 177 178 assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC"); 179 180 int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0); 181 // -1 is a special case 182 if (offset == -1) { 183 return (address)(intptr_t)-1; 184 } else { 185 return global_toc() + offset; 186 } 187 } 188 189 #ifdef _LP64 190 // Patch compressed oops or klass constants. 191 // Assembler sequence is 192 // 1) compressed oops: 193 // lis rx = const.hi 194 // ori rx = rx | const.lo 195 // 2) compressed klass: 196 // lis rx = const.hi 197 // clrldi rx = rx & 0xFFFFffff // clearMS32b, optional 198 // ori rx = rx | const.lo 199 // Clrldi will be passed by. 
200 int MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) { 201 assert(UseCompressedOops, "Should only patch compressed oops"); 202 203 const address inst2_addr = a; 204 const int inst2 = *(int *)inst2_addr; 205 206 // The relocation points to the second instruction, the ori, 207 // and the ori reads and writes the same register dst. 208 const int dst = inv_rta_field(inst2); 209 assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst"); 210 // Now, find the preceding addis which writes to dst. 211 int inst1 = 0; 212 address inst1_addr = inst2_addr - BytesPerInstWord; 213 bool inst1_found = false; 214 while (inst1_addr >= bound) { 215 inst1 = *(int *)inst1_addr; 216 if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; } 217 inst1_addr -= BytesPerInstWord; 218 } 219 assert(inst1_found, "inst is not lis"); 220 221 int xc = (data >> 16) & 0xffff; 222 int xd = (data >> 0) & 0xffff; 223 224 set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo 225 set_imm((int *)inst2_addr, (xd)); // unsigned int 226 return (int)((intptr_t)inst2_addr - (intptr_t)inst1_addr); 227 } 228 229 // Get compressed oop or klass constant. 230 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) { 231 assert(UseCompressedOops, "Should only patch compressed oops"); 232 233 const address inst2_addr = a; 234 const int inst2 = *(int *)inst2_addr; 235 236 // The relocation points to the second instruction, the ori, 237 // and the ori reads and writes the same register dst. 238 const int dst = inv_rta_field(inst2); 239 assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst"); 240 // Now, find the preceding lis which writes to dst. 
241 int inst1 = 0; 242 address inst1_addr = inst2_addr - BytesPerInstWord; 243 bool inst1_found = false; 244 245 while (inst1_addr >= bound) { 246 inst1 = *(int *) inst1_addr; 247 if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;} 248 inst1_addr -= BytesPerInstWord; 249 } 250 assert(inst1_found, "inst is not lis"); 251 252 uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff)); 253 uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16); 254 255 return (int) (xl | xh); 256 } 257 #endif // _LP64 258 259 void MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a, Register toc) { 260 int toc_offset = 0; 261 // Use RelocationHolder::none for the constant pool entry, otherwise 262 // we will end up with a failing NativeCall::verify(x) where x is 263 // the address of the constant pool entry. 264 // FIXME: We should insert relocation information for oops at the constant 265 // pool entries instead of inserting it at the loads; patching of a constant 266 // pool entry should be less expensive. 267 address oop_address = address_constant((address)a.value(), RelocationHolder::none); 268 // Relocate at the pc of the load. 269 relocate(a.rspec()); 270 toc_offset = (int)(oop_address - code()->consts()->start()); 271 ld_largeoffset_unchecked(dst, toc_offset, toc, true); 272 } 273 274 bool MacroAssembler::is_load_const_from_method_toc_at(address a) { 275 const address inst1_addr = a; 276 const int inst1 = *(int *)inst1_addr; 277 278 // The relocation points to the ld or the addis. 
279 return (is_ld(inst1)) || 280 (is_addis(inst1) && inv_ra_field(inst1) != 0); 281 } 282 283 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) { 284 assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc"); 285 286 const address inst1_addr = a; 287 const int inst1 = *(int *)inst1_addr; 288 289 if (is_ld(inst1)) { 290 return inv_d1_field(inst1); 291 } else if (is_addis(inst1)) { 292 const int dst = inv_rt_field(inst1); 293 294 // Now, find the succeeding ld which reads and writes to dst. 295 address inst2_addr = inst1_addr + BytesPerInstWord; 296 int inst2 = 0; 297 while (true) { 298 inst2 = *(int *) inst2_addr; 299 if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) { 300 // Stop, found the ld which reads and writes dst. 301 break; 302 } 303 inst2_addr += BytesPerInstWord; 304 } 305 return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2); 306 } 307 ShouldNotReachHere(); 308 return 0; 309 } 310 311 // Get the constant from a `load_const' sequence. 312 long MacroAssembler::get_const(address a) { 313 assert(is_load_const_at(a), "not a load of a constant"); 314 const int *p = (const int*) a; 315 unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48); 316 if (is_ori(*(p+1))) { 317 x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32); 318 x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16); 319 x |= (((unsigned long) (get_imm(a,4) & 0xffff))); 320 } else if (is_lis(*(p+1))) { 321 x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32); 322 x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16); 323 x |= (((unsigned long) (get_imm(a,3) & 0xffff))); 324 } else { 325 ShouldNotReachHere(); 326 return (long) 0; 327 } 328 return (long) x; 329 } 330 331 // Patch the 64 bit constant of a `load_const' sequence. This is a low 332 // level procedure. It neither flushes the instruction cache nor is it 333 // mt safe. 
334 void MacroAssembler::patch_const(address a, long x) { 335 assert(is_load_const_at(a), "not a load of a constant"); 336 int *p = (int*) a; 337 if (is_ori(*(p+1))) { 338 set_imm(0 + p, (x >> 48) & 0xffff); 339 set_imm(1 + p, (x >> 32) & 0xffff); 340 set_imm(3 + p, (x >> 16) & 0xffff); 341 set_imm(4 + p, x & 0xffff); 342 } else if (is_lis(*(p+1))) { 343 set_imm(0 + p, (x >> 48) & 0xffff); 344 set_imm(2 + p, (x >> 32) & 0xffff); 345 set_imm(1 + p, (x >> 16) & 0xffff); 346 set_imm(3 + p, x & 0xffff); 347 } else { 348 ShouldNotReachHere(); 349 } 350 } 351 352 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) { 353 assert(oop_recorder() != NULL, "this assembler needs a Recorder"); 354 int index = oop_recorder()->allocate_metadata_index(obj); 355 RelocationHolder rspec = metadata_Relocation::spec(index); 356 return AddressLiteral((address)obj, rspec); 357 } 358 359 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) { 360 assert(oop_recorder() != NULL, "this assembler needs a Recorder"); 361 int index = oop_recorder()->find_index(obj); 362 RelocationHolder rspec = metadata_Relocation::spec(index); 363 return AddressLiteral((address)obj, rspec); 364 } 365 366 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) { 367 assert(oop_recorder() != NULL, "this assembler needs an OopRecorder"); 368 int oop_index = oop_recorder()->allocate_oop_index(obj); 369 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index)); 370 } 371 372 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) { 373 assert(oop_recorder() != NULL, "this assembler needs an OopRecorder"); 374 int oop_index = oop_recorder()->find_index(obj); 375 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index)); 376 } 377 378 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr, 379 Register tmp, int offset) { 380 intptr_t value = *delayed_value_addr; 381 if (value != 0) { 382 return 
RegisterOrConstant(value + offset); 383 } 384 385 // Load indirectly to solve generation ordering problem. 386 // static address, no relocation 387 int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true); 388 ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0) 389 390 if (offset != 0) { 391 addi(tmp, tmp, offset); 392 } 393 394 return RegisterOrConstant(tmp); 395 } 396 397 #ifndef PRODUCT 398 void MacroAssembler::pd_print_patched_instruction(address branch) { 399 Unimplemented(); // TODO: PPC port 400 } 401 #endif // ndef PRODUCT 402 403 // Conditional far branch for destinations encodable in 24+2 bits. 404 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) { 405 406 // If requested by flag optimize, relocate the bc_far as a 407 // runtime_call and prepare for optimizing it when the code gets 408 // relocated. 409 if (optimize == bc_far_optimize_on_relocate) { 410 relocate(relocInfo::runtime_call_type); 411 } 412 413 // variant 2: 414 // 415 // b!cxx SKIP 416 // bxx DEST 417 // SKIP: 418 // 419 420 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)), 421 opposite_bcond(inv_boint_bcond(boint))); 422 423 // We emit two branches. 424 // First, a conditional branch which jumps around the far branch. 425 const address not_taken_pc = pc() + 2 * BytesPerInstWord; 426 const address bc_pc = pc(); 427 bc(opposite_boint, biint, not_taken_pc); 428 429 const int bc_instr = *(int*)bc_pc; 430 assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition"); 431 assert(opposite_boint == inv_bo_field(bc_instr), "postcondition"); 432 assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))), 433 opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))), 434 "postcondition"); 435 assert(biint == inv_bi_field(bc_instr), "postcondition"); 436 437 // Second, an unconditional far branch which jumps to dest. 
438 // Note: target(dest) remembers the current pc (see CodeSection::target) 439 // and returns the current pc if the label is not bound yet; when 440 // the label gets bound, the unconditional far branch will be patched. 441 const address target_pc = target(dest); 442 const address b_pc = pc(); 443 b(target_pc); 444 445 assert(not_taken_pc == pc(), "postcondition"); 446 assert(dest.is_bound() || target_pc == b_pc, "postcondition"); 447 } 448 449 bool MacroAssembler::is_bc_far_at(address instruction_addr) { 450 return is_bc_far_variant1_at(instruction_addr) || 451 is_bc_far_variant2_at(instruction_addr) || 452 is_bc_far_variant3_at(instruction_addr); 453 } 454 455 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) { 456 if (is_bc_far_variant1_at(instruction_addr)) { 457 const address instruction_1_addr = instruction_addr; 458 const int instruction_1 = *(int*)instruction_1_addr; 459 return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr); 460 } else if (is_bc_far_variant2_at(instruction_addr)) { 461 const address instruction_2_addr = instruction_addr + 4; 462 return bxx_destination(instruction_2_addr); 463 } else if (is_bc_far_variant3_at(instruction_addr)) { 464 return instruction_addr + 8; 465 } 466 // variant 4 ??? 
467 ShouldNotReachHere(); 468 return NULL; 469 } 470 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) { 471 472 if (is_bc_far_variant3_at(instruction_addr)) { 473 // variant 3, far cond branch to the next instruction, already patched to nops: 474 // 475 // nop 476 // endgroup 477 // SKIP/DEST: 478 // 479 return; 480 } 481 482 // first, extract boint and biint from the current branch 483 int boint = 0; 484 int biint = 0; 485 486 ResourceMark rm; 487 const int code_size = 2 * BytesPerInstWord; 488 CodeBuffer buf(instruction_addr, code_size); 489 MacroAssembler masm(&buf); 490 if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) { 491 // Far branch to next instruction: Optimize it by patching nops (produce variant 3). 492 masm.nop(); 493 masm.endgroup(); 494 } else { 495 if (is_bc_far_variant1_at(instruction_addr)) { 496 // variant 1, the 1st instruction contains the destination address: 497 // 498 // bcxx DEST 499 // endgroup 500 // 501 const int instruction_1 = *(int*)(instruction_addr); 502 boint = inv_bo_field(instruction_1); 503 biint = inv_bi_field(instruction_1); 504 } else if (is_bc_far_variant2_at(instruction_addr)) { 505 // variant 2, the 2nd instruction contains the destination address: 506 // 507 // b!cxx SKIP 508 // bxx DEST 509 // SKIP: 510 // 511 const int instruction_1 = *(int*)(instruction_addr); 512 boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))), 513 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1)))); 514 biint = inv_bi_field(instruction_1); 515 } else { 516 // variant 4??? 517 ShouldNotReachHere(); 518 } 519 520 // second, set the new branch destination and optimize the code 521 if (dest != instruction_addr + 4 && // the bc_far is still unbound! 
522 masm.is_within_range_of_bcxx(dest, instruction_addr)) { 523 // variant 1: 524 // 525 // bcxx DEST 526 // endgroup 527 // 528 masm.bc(boint, biint, dest); 529 masm.endgroup(); 530 } else { 531 // variant 2: 532 // 533 // b!cxx SKIP 534 // bxx DEST 535 // SKIP: 536 // 537 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)), 538 opposite_bcond(inv_boint_bcond(boint))); 539 const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord; 540 masm.bc(opposite_boint, biint, not_taken_pc); 541 masm.b(dest); 542 } 543 } 544 ICache::ppc64_flush_icache_bytes(instruction_addr, code_size); 545 } 546 547 // Emit a NOT mt-safe patchable 64 bit absolute call/jump. 548 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) { 549 // get current pc 550 uint64_t start_pc = (uint64_t) pc(); 551 552 const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last 553 const address pc_of_b = (address) (start_pc + (0*BytesPerInstWord)); // b is first 554 555 // relocate here 556 if (rt != relocInfo::none) { 557 relocate(rt); 558 } 559 560 if ( ReoptimizeCallSequences && 561 (( link && is_within_range_of_b(dest, pc_of_bl)) || 562 (!link && is_within_range_of_b(dest, pc_of_b)))) { 563 // variant 2: 564 // Emit an optimized, pc-relative call/jump. 565 566 if (link) { 567 // some padding 568 nop(); 569 nop(); 570 nop(); 571 nop(); 572 nop(); 573 nop(); 574 575 // do the call 576 assert(pc() == pc_of_bl, "just checking"); 577 bl(dest, relocInfo::none); 578 } else { 579 // do the jump 580 assert(pc() == pc_of_b, "just checking"); 581 b(dest, relocInfo::none); 582 583 // some padding 584 nop(); 585 nop(); 586 nop(); 587 nop(); 588 nop(); 589 nop(); 590 } 591 592 // Assert that we can identify the emitted call/jump. 593 assert(is_bxx64_patchable_variant2_at((address)start_pc, link), 594 "can't identify emitted call"); 595 } else { 596 // variant 1: 597 mr(R0, R11); // spill R11 -> R0. 
598 599 // Load the destination address into CTR, 600 // calculate destination relative to global toc. 601 calculate_address_from_global_toc(R11, dest, true, true, false); 602 603 mtctr(R11); 604 mr(R11, R0); // spill R11 <- R0. 605 nop(); 606 607 // do the call/jump 608 if (link) { 609 bctrl(); 610 } else{ 611 bctr(); 612 } 613 // Assert that we can identify the emitted call/jump. 614 assert(is_bxx64_patchable_variant1b_at((address)start_pc, link), 615 "can't identify emitted call"); 616 } 617 618 // Assert that we can identify the emitted call/jump. 619 assert(is_bxx64_patchable_at((address)start_pc, link), 620 "can't identify emitted call"); 621 assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest, 622 "wrong encoding of dest address"); 623 } 624 625 // Identify a bxx64_patchable instruction. 626 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) { 627 return is_bxx64_patchable_variant1b_at(instruction_addr, link) 628 //|| is_bxx64_patchable_variant1_at(instruction_addr, link) 629 || is_bxx64_patchable_variant2_at(instruction_addr, link); 630 } 631 632 // Does the call64_patchable instruction use a pc-relative encoding of 633 // the call destination? 634 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) { 635 // variant 2 is pc-relative 636 return is_bxx64_patchable_variant2_at(instruction_addr, link); 637 } 638 639 // Identify variant 1. 640 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) { 641 unsigned int* instr = (unsigned int*) instruction_addr; 642 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l] 643 && is_mtctr(instr[5]) // mtctr 644 && is_load_const_at(instruction_addr); 645 } 646 647 // Identify variant 1b: load destination relative to global toc. 
648 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) { 649 unsigned int* instr = (unsigned int*) instruction_addr; 650 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l] 651 && is_mtctr(instr[3]) // mtctr 652 && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr); 653 } 654 655 // Identify variant 2. 656 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) { 657 unsigned int* instr = (unsigned int*) instruction_addr; 658 if (link) { 659 return is_bl (instr[6]) // bl dest is last 660 && is_nop(instr[0]) // nop 661 && is_nop(instr[1]) // nop 662 && is_nop(instr[2]) // nop 663 && is_nop(instr[3]) // nop 664 && is_nop(instr[4]) // nop 665 && is_nop(instr[5]); // nop 666 } else { 667 return is_b (instr[0]) // b dest is first 668 && is_nop(instr[1]) // nop 669 && is_nop(instr[2]) // nop 670 && is_nop(instr[3]) // nop 671 && is_nop(instr[4]) // nop 672 && is_nop(instr[5]) // nop 673 && is_nop(instr[6]); // nop 674 } 675 } 676 677 // Set dest address of a bxx64_patchable instruction. 678 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) { 679 ResourceMark rm; 680 int code_size = MacroAssembler::bxx64_patchable_size; 681 CodeBuffer buf(instruction_addr, code_size); 682 MacroAssembler masm(&buf); 683 masm.bxx64_patchable(dest, relocInfo::none, link); 684 ICache::ppc64_flush_icache_bytes(instruction_addr, code_size); 685 } 686 687 // Get dest address of a bxx64_patchable instruction. 
688 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) { 689 if (is_bxx64_patchable_variant1_at(instruction_addr, link)) { 690 return (address) (unsigned long) get_const(instruction_addr); 691 } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) { 692 unsigned int* instr = (unsigned int*) instruction_addr; 693 if (link) { 694 const int instr_idx = 6; // bl is last 695 int branchoffset = branch_destination(instr[instr_idx], 0); 696 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord; 697 } else { 698 const int instr_idx = 0; // b is first 699 int branchoffset = branch_destination(instr[instr_idx], 0); 700 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord; 701 } 702 // Load dest relative to global toc. 703 } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) { 704 return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, 705 instruction_addr); 706 } else { 707 ShouldNotReachHere(); 708 return NULL; 709 } 710 } 711 712 // Uses ordering which corresponds to ABI: 713 // _savegpr0_14: std r14,-144(r1) 714 // _savegpr0_15: std r15,-136(r1) 715 // _savegpr0_16: std r16,-128(r1) 716 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) { 717 std(R14, offset, dst); offset += 8; 718 std(R15, offset, dst); offset += 8; 719 std(R16, offset, dst); offset += 8; 720 std(R17, offset, dst); offset += 8; 721 std(R18, offset, dst); offset += 8; 722 std(R19, offset, dst); offset += 8; 723 std(R20, offset, dst); offset += 8; 724 std(R21, offset, dst); offset += 8; 725 std(R22, offset, dst); offset += 8; 726 std(R23, offset, dst); offset += 8; 727 std(R24, offset, dst); offset += 8; 728 std(R25, offset, dst); offset += 8; 729 std(R26, offset, dst); offset += 8; 730 std(R27, offset, dst); offset += 8; 731 std(R28, offset, dst); offset += 8; 732 std(R29, offset, dst); offset += 8; 733 std(R30, offset, dst); offset += 8; 734 std(R31, 
offset, dst); offset += 8; 735 736 stfd(F14, offset, dst); offset += 8; 737 stfd(F15, offset, dst); offset += 8; 738 stfd(F16, offset, dst); offset += 8; 739 stfd(F17, offset, dst); offset += 8; 740 stfd(F18, offset, dst); offset += 8; 741 stfd(F19, offset, dst); offset += 8; 742 stfd(F20, offset, dst); offset += 8; 743 stfd(F21, offset, dst); offset += 8; 744 stfd(F22, offset, dst); offset += 8; 745 stfd(F23, offset, dst); offset += 8; 746 stfd(F24, offset, dst); offset += 8; 747 stfd(F25, offset, dst); offset += 8; 748 stfd(F26, offset, dst); offset += 8; 749 stfd(F27, offset, dst); offset += 8; 750 stfd(F28, offset, dst); offset += 8; 751 stfd(F29, offset, dst); offset += 8; 752 stfd(F30, offset, dst); offset += 8; 753 stfd(F31, offset, dst); 754 } 755 756 // Uses ordering which corresponds to ABI: 757 // _restgpr0_14: ld r14,-144(r1) 758 // _restgpr0_15: ld r15,-136(r1) 759 // _restgpr0_16: ld r16,-128(r1) 760 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) { 761 ld(R14, offset, src); offset += 8; 762 ld(R15, offset, src); offset += 8; 763 ld(R16, offset, src); offset += 8; 764 ld(R17, offset, src); offset += 8; 765 ld(R18, offset, src); offset += 8; 766 ld(R19, offset, src); offset += 8; 767 ld(R20, offset, src); offset += 8; 768 ld(R21, offset, src); offset += 8; 769 ld(R22, offset, src); offset += 8; 770 ld(R23, offset, src); offset += 8; 771 ld(R24, offset, src); offset += 8; 772 ld(R25, offset, src); offset += 8; 773 ld(R26, offset, src); offset += 8; 774 ld(R27, offset, src); offset += 8; 775 ld(R28, offset, src); offset += 8; 776 ld(R29, offset, src); offset += 8; 777 ld(R30, offset, src); offset += 8; 778 ld(R31, offset, src); offset += 8; 779 780 // FP registers 781 lfd(F14, offset, src); offset += 8; 782 lfd(F15, offset, src); offset += 8; 783 lfd(F16, offset, src); offset += 8; 784 lfd(F17, offset, src); offset += 8; 785 lfd(F18, offset, src); offset += 8; 786 lfd(F19, offset, src); offset += 8; 787 lfd(F20, offset, src); 
offset += 8; 788 lfd(F21, offset, src); offset += 8; 789 lfd(F22, offset, src); offset += 8; 790 lfd(F23, offset, src); offset += 8; 791 lfd(F24, offset, src); offset += 8; 792 lfd(F25, offset, src); offset += 8; 793 lfd(F26, offset, src); offset += 8; 794 lfd(F27, offset, src); offset += 8; 795 lfd(F28, offset, src); offset += 8; 796 lfd(F29, offset, src); offset += 8; 797 lfd(F30, offset, src); offset += 8; 798 lfd(F31, offset, src); 799 } 800 801 // For verify_oops. 802 void MacroAssembler::save_volatile_gprs(Register dst, int offset) { 803 std(R2, offset, dst); offset += 8; 804 std(R3, offset, dst); offset += 8; 805 std(R4, offset, dst); offset += 8; 806 std(R5, offset, dst); offset += 8; 807 std(R6, offset, dst); offset += 8; 808 std(R7, offset, dst); offset += 8; 809 std(R8, offset, dst); offset += 8; 810 std(R9, offset, dst); offset += 8; 811 std(R10, offset, dst); offset += 8; 812 std(R11, offset, dst); offset += 8; 813 std(R12, offset, dst); 814 } 815 816 // For verify_oops. 817 void MacroAssembler::restore_volatile_gprs(Register src, int offset) { 818 ld(R2, offset, src); offset += 8; 819 ld(R3, offset, src); offset += 8; 820 ld(R4, offset, src); offset += 8; 821 ld(R5, offset, src); offset += 8; 822 ld(R6, offset, src); offset += 8; 823 ld(R7, offset, src); offset += 8; 824 ld(R8, offset, src); offset += 8; 825 ld(R9, offset, src); offset += 8; 826 ld(R10, offset, src); offset += 8; 827 ld(R11, offset, src); offset += 8; 828 ld(R12, offset, src); 829 } 830 831 void MacroAssembler::save_LR_CR(Register tmp) { 832 mfcr(tmp); 833 std(tmp, _abi(cr), R1_SP); 834 mflr(tmp); 835 std(tmp, _abi(lr), R1_SP); 836 // Tmp must contain lr on exit! 
(see return_addr and prolog in ppc64.ad) 837 } 838 839 void MacroAssembler::restore_LR_CR(Register tmp) { 840 assert(tmp != R1_SP, "must be distinct"); 841 ld(tmp, _abi(lr), R1_SP); 842 mtlr(tmp); 843 ld(tmp, _abi(cr), R1_SP); 844 mtcr(tmp); 845 } 846 847 address MacroAssembler::get_PC_trash_LR(Register result) { 848 Label L; 849 bl(L); 850 bind(L); 851 address lr_pc = pc(); 852 mflr(result); 853 return lr_pc; 854 } 855 856 void MacroAssembler::resize_frame(Register offset, Register tmp) { 857 #ifdef ASSERT 858 assert_different_registers(offset, tmp, R1_SP); 859 andi_(tmp, offset, frame::alignment_in_bytes-1); 860 asm_assert_eq("resize_frame: unaligned", 0x204); 861 #endif 862 863 // tmp <- *(SP) 864 ld(tmp, _abi(callers_sp), R1_SP); 865 // addr <- SP + offset; 866 // *(addr) <- tmp; 867 // SP <- addr 868 stdux(tmp, R1_SP, offset); 869 } 870 871 void MacroAssembler::resize_frame(int offset, Register tmp) { 872 assert(is_simm(offset, 16), "too big an offset"); 873 assert_different_registers(tmp, R1_SP); 874 assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned"); 875 // tmp <- *(SP) 876 ld(tmp, _abi(callers_sp), R1_SP); 877 // addr <- SP + offset; 878 // *(addr) <- tmp; 879 // SP <- addr 880 stdu(tmp, offset, R1_SP); 881 } 882 883 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) { 884 // (addr == tmp1) || (addr == tmp2) is allowed here! 885 assert(tmp1 != tmp2, "must be distinct"); 886 887 // compute offset w.r.t. current stack pointer 888 // tmp_1 <- addr - SP (!) 889 subf(tmp1, R1_SP, addr); 890 891 // atomically update SP keeping back link. 
892 resize_frame(tmp1/* offset */, tmp2/* tmp */); 893 } 894 895 void MacroAssembler::push_frame(Register bytes, Register tmp) { 896 #ifdef ASSERT 897 assert(bytes != R0, "r0 not allowed here"); 898 andi_(R0, bytes, frame::alignment_in_bytes-1); 899 asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203); 900 #endif 901 neg(tmp, bytes); 902 stdux(R1_SP, R1_SP, tmp); 903 } 904 905 // Push a frame of size `bytes'. 906 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) { 907 long offset = align_addr(bytes, frame::alignment_in_bytes); 908 if (is_simm(-offset, 16)) { 909 stdu(R1_SP, -offset, R1_SP); 910 } else { 911 load_const(tmp, -offset); 912 stdux(R1_SP, R1_SP, tmp); 913 } 914 } 915 916 // Push a frame of size `bytes' plus abi_reg_args on top. 917 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) { 918 push_frame(bytes + frame::abi_reg_args_size, tmp); 919 } 920 921 // Setup up a new C frame with a spill area for non-volatile GPRs and 922 // additional space for local variables. 923 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes, 924 Register tmp) { 925 push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp); 926 } 927 928 // Pop current C frame. 929 void MacroAssembler::pop_frame() { 930 ld(R1_SP, _abi(callers_sp), R1_SP); 931 } 932 933 #if defined(ABI_ELFv2) 934 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) { 935 // TODO(asmundak): make sure the caller uses R12 as function descriptor 936 // most of the times. 937 if (R12 != r_function_entry) { 938 mr(R12, r_function_entry); 939 } 940 mtctr(R12); 941 // Do a call or a branch. 942 if (and_link) { 943 bctrl(); 944 } else { 945 bctr(); 946 } 947 _last_calls_return_pc = pc(); 948 949 return _last_calls_return_pc; 950 } 951 952 // Call a C function via a function descriptor and use full C 953 // calling conventions. Updates and returns _last_calls_return_pc. 
954 address MacroAssembler::call_c(Register r_function_entry) { 955 return branch_to(r_function_entry, /*and_link=*/true); 956 } 957 958 // For tail calls: only branch, don't link, so callee returns to caller of this function. 959 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) { 960 return branch_to(r_function_entry, /*and_link=*/false); 961 } 962 963 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) { 964 load_const(R12, function_entry, R0); 965 return branch_to(R12, /*and_link=*/true); 966 } 967 968 #else 969 // Generic version of a call to C function via a function descriptor 970 // with variable support for C calling conventions (TOC, ENV, etc.). 971 // Updates and returns _last_calls_return_pc. 972 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call, 973 bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) { 974 // we emit standard ptrgl glue code here 975 assert((function_descriptor != R0), "function_descriptor cannot be R0"); 976 977 // retrieve necessary entries from the function descriptor 978 ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor); 979 mtctr(R0); 980 981 if (load_toc_of_callee) { 982 ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor); 983 } 984 if (load_env_of_callee) { 985 ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor); 986 } else if (load_toc_of_callee) { 987 li(R11, 0); 988 } 989 990 // do a call or a branch 991 if (and_link) { 992 bctrl(); 993 } else { 994 bctr(); 995 } 996 _last_calls_return_pc = pc(); 997 998 return _last_calls_return_pc; 999 } 1000 1001 // Call a C function via a function descriptor and use full C calling 1002 // conventions. 1003 // We don't use the TOC in generated code, so there is no need to save 1004 // and restore its value. 
1005 address MacroAssembler::call_c(Register fd) { 1006 return branch_to(fd, /*and_link=*/true, 1007 /*save toc=*/false, 1008 /*restore toc=*/false, 1009 /*load toc=*/true, 1010 /*load env=*/true); 1011 } 1012 1013 address MacroAssembler::call_c_and_return_to_caller(Register fd) { 1014 return branch_to(fd, /*and_link=*/false, 1015 /*save toc=*/false, 1016 /*restore toc=*/false, 1017 /*load toc=*/true, 1018 /*load env=*/true); 1019 } 1020 1021 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) { 1022 if (rt != relocInfo::none) { 1023 // this call needs to be relocatable 1024 if (!ReoptimizeCallSequences 1025 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none) 1026 || fd == NULL // support code-size estimation 1027 || !fd->is_friend_function() 1028 || fd->entry() == NULL) { 1029 // it's not a friend function as defined by class FunctionDescriptor, 1030 // so do a full call-c here. 1031 load_const(R11, (address)fd, R0); 1032 1033 bool has_env = (fd != NULL && fd->env() != NULL); 1034 return branch_to(R11, /*and_link=*/true, 1035 /*save toc=*/false, 1036 /*restore toc=*/false, 1037 /*load toc=*/true, 1038 /*load env=*/has_env); 1039 } else { 1040 // It's a friend function. Load the entry point and don't care about 1041 // toc and env. Use an optimizable call instruction, but ensure the 1042 // same code-size as in the case of a non-friend function. 1043 nop(); 1044 nop(); 1045 nop(); 1046 bl64_patchable(fd->entry(), rt); 1047 _last_calls_return_pc = pc(); 1048 return _last_calls_return_pc; 1049 } 1050 } else { 1051 // This call does not need to be relocatable, do more aggressive 1052 // optimizations. 1053 if (!ReoptimizeCallSequences 1054 || !fd->is_friend_function()) { 1055 // It's not a friend function as defined by class FunctionDescriptor, 1056 // so do a full call-c here. 
1057 load_const(R11, (address)fd, R0); 1058 return branch_to(R11, /*and_link=*/true, 1059 /*save toc=*/false, 1060 /*restore toc=*/false, 1061 /*load toc=*/true, 1062 /*load env=*/true); 1063 } else { 1064 // it's a friend function, load the entry point and don't care about 1065 // toc and env. 1066 address dest = fd->entry(); 1067 if (is_within_range_of_b(dest, pc())) { 1068 bl(dest); 1069 } else { 1070 bl64_patchable(dest, rt); 1071 } 1072 _last_calls_return_pc = pc(); 1073 return _last_calls_return_pc; 1074 } 1075 } 1076 } 1077 1078 // Call a C function. All constants needed reside in TOC. 1079 // 1080 // Read the address to call from the TOC. 1081 // Read env from TOC, if fd specifies an env. 1082 // Read new TOC from TOC. 1083 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd, 1084 relocInfo::relocType rt, Register toc) { 1085 if (!ReoptimizeCallSequences 1086 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none) 1087 || !fd->is_friend_function()) { 1088 // It's not a friend function as defined by class FunctionDescriptor, 1089 // so do a full call-c here. 1090 assert(fd->entry() != NULL, "function must be linked"); 1091 1092 AddressLiteral fd_entry(fd->entry()); 1093 load_const_from_method_toc(R11, fd_entry, toc); 1094 mtctr(R11); 1095 if (fd->env() == NULL) { 1096 li(R11, 0); 1097 nop(); 1098 } else { 1099 AddressLiteral fd_env(fd->env()); 1100 load_const_from_method_toc(R11, fd_env, toc); 1101 } 1102 AddressLiteral fd_toc(fd->toc()); 1103 load_toc_from_toc(R2_TOC, fd_toc, toc); 1104 // R2_TOC is killed. 1105 bctrl(); 1106 _last_calls_return_pc = pc(); 1107 } else { 1108 // It's a friend function, load the entry point and don't care about 1109 // toc and env. Use an optimizable call instruction, but ensure the 1110 // same code-size as in the case of a non-friend function. 
1111 nop(); 1112 bl64_patchable(fd->entry(), rt); 1113 _last_calls_return_pc = pc(); 1114 } 1115 return _last_calls_return_pc; 1116 } 1117 #endif // ABI_ELFv2 1118 1119 void MacroAssembler::call_VM_base(Register oop_result, 1120 Register last_java_sp, 1121 address entry_point, 1122 bool check_exceptions) { 1123 BLOCK_COMMENT("call_VM {"); 1124 // Determine last_java_sp register. 1125 if (!last_java_sp->is_valid()) { 1126 last_java_sp = R1_SP; 1127 } 1128 set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1); 1129 1130 // ARG1 must hold thread address. 1131 mr(R3_ARG1, R16_thread); 1132 #if defined(ABI_ELFv2) 1133 address return_pc = call_c(entry_point, relocInfo::none); 1134 #else 1135 address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none); 1136 #endif 1137 1138 reset_last_Java_frame(); 1139 1140 // Check for pending exceptions. 1141 if (check_exceptions) { 1142 // We don't check for exceptions here. 1143 ShouldNotReachHere(); 1144 } 1145 1146 // Get oop result if there is one and reset the value in the thread. 1147 if (oop_result->is_valid()) { 1148 get_vm_result(oop_result); 1149 } 1150 1151 _last_calls_return_pc = return_pc; 1152 BLOCK_COMMENT("} call_VM"); 1153 } 1154 1155 void MacroAssembler::call_VM_leaf_base(address entry_point) { 1156 BLOCK_COMMENT("call_VM_leaf {"); 1157 #if defined(ABI_ELFv2) 1158 call_c(entry_point, relocInfo::none); 1159 #else 1160 call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none); 1161 #endif 1162 BLOCK_COMMENT("} call_VM_leaf"); 1163 } 1164 1165 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) { 1166 call_VM_base(oop_result, noreg, entry_point, check_exceptions); 1167 } 1168 1169 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, 1170 bool check_exceptions) { 1171 // R3_ARG1 is reserved for the thread. 
1172 mr_if_needed(R4_ARG2, arg_1); 1173 call_VM(oop_result, entry_point, check_exceptions); 1174 } 1175 1176 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, 1177 bool check_exceptions) { 1178 // R3_ARG1 is reserved for the thread 1179 mr_if_needed(R4_ARG2, arg_1); 1180 assert(arg_2 != R4_ARG2, "smashed argument"); 1181 mr_if_needed(R5_ARG3, arg_2); 1182 call_VM(oop_result, entry_point, check_exceptions); 1183 } 1184 1185 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3, 1186 bool check_exceptions) { 1187 // R3_ARG1 is reserved for the thread 1188 mr_if_needed(R4_ARG2, arg_1); 1189 assert(arg_2 != R4_ARG2, "smashed argument"); 1190 mr_if_needed(R5_ARG3, arg_2); 1191 mr_if_needed(R6_ARG4, arg_3); 1192 call_VM(oop_result, entry_point, check_exceptions); 1193 } 1194 1195 void MacroAssembler::call_VM_leaf(address entry_point) { 1196 call_VM_leaf_base(entry_point); 1197 } 1198 1199 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) { 1200 mr_if_needed(R3_ARG1, arg_1); 1201 call_VM_leaf(entry_point); 1202 } 1203 1204 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) { 1205 mr_if_needed(R3_ARG1, arg_1); 1206 assert(arg_2 != R3_ARG1, "smashed argument"); 1207 mr_if_needed(R4_ARG2, arg_2); 1208 call_VM_leaf(entry_point); 1209 } 1210 1211 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) { 1212 mr_if_needed(R3_ARG1, arg_1); 1213 assert(arg_2 != R3_ARG1, "smashed argument"); 1214 mr_if_needed(R4_ARG2, arg_2); 1215 assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument"); 1216 mr_if_needed(R5_ARG3, arg_3); 1217 call_VM_leaf(entry_point); 1218 } 1219 1220 // Check whether instruction is a read access to the polling page 1221 // which was emitted by load_from_polling_page(..). 
1222 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext, 1223 address* polling_address_ptr) { 1224 if (!is_ld(instruction)) 1225 return false; // It's not a ld. Fail. 1226 1227 int rt = inv_rt_field(instruction); 1228 int ra = inv_ra_field(instruction); 1229 int ds = inv_ds_field(instruction); 1230 if (!(ds == 0 && ra != 0 && rt == 0)) { 1231 return false; // It's not a ld(r0, X, ra). Fail. 1232 } 1233 1234 if (!ucontext) { 1235 // Set polling address. 1236 if (polling_address_ptr != NULL) { 1237 *polling_address_ptr = NULL; 1238 } 1239 return true; // No ucontext given. Can't check value of ra. Assume true. 1240 } 1241 1242 #ifdef LINUX 1243 // Ucontext given. Check that register ra contains the address of 1244 // the safepoing polling page. 1245 ucontext_t* uc = (ucontext_t*) ucontext; 1246 // Set polling address. 1247 address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds; 1248 if (polling_address_ptr != NULL) { 1249 *polling_address_ptr = addr; 1250 } 1251 return os::is_poll_address(addr); 1252 #else 1253 // Not on Linux, ucontext must be NULL. 
1254 ShouldNotReachHere(); 1255 return false; 1256 #endif 1257 } 1258 1259 bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) { 1260 #ifdef LINUX 1261 ucontext_t* uc = (ucontext_t*) ucontext; 1262 1263 if (is_stwx(instruction) || is_stwux(instruction)) { 1264 int ra = inv_ra_field(instruction); 1265 int rb = inv_rb_field(instruction); 1266 1267 // look up content of ra and rb in ucontext 1268 address ra_val=(address)uc->uc_mcontext.regs->gpr[ra]; 1269 long rb_val=(long)uc->uc_mcontext.regs->gpr[rb]; 1270 return os::is_memory_serialize_page(thread, ra_val+rb_val); 1271 } else if (is_stw(instruction) || is_stwu(instruction)) { 1272 int ra = inv_ra_field(instruction); 1273 int d1 = inv_d1_field(instruction); 1274 1275 // look up content of ra in ucontext 1276 address ra_val=(address)uc->uc_mcontext.regs->gpr[ra]; 1277 return os::is_memory_serialize_page(thread, ra_val+d1); 1278 } else { 1279 return false; 1280 } 1281 #else 1282 // workaround not needed on !LINUX :-) 1283 ShouldNotCallThis(); 1284 return false; 1285 #endif 1286 } 1287 1288 void MacroAssembler::bang_stack_with_offset(int offset) { 1289 // When increasing the stack, the old stack pointer will be written 1290 // to the new top of stack according to the PPC64 abi. 1291 // Therefore, stack banging is not necessary when increasing 1292 // the stack by <= os::vm_page_size() bytes. 1293 // When increasing the stack by a larger amount, this method is 1294 // called repeatedly to bang the intermediate pages. 1295 1296 // Stack grows down, caller passes positive offset. 1297 assert(offset > 0, "must bang with positive offset"); 1298 1299 long stdoffset = -offset; 1300 1301 if (is_simm(stdoffset, 16)) { 1302 // Signed 16 bit offset, a simple std is ok. 
1303 if (UseLoadInstructionsForStackBangingPPC64) { 1304 ld(R0, (int)(signed short)stdoffset, R1_SP); 1305 } else { 1306 std(R0,(int)(signed short)stdoffset, R1_SP); 1307 } 1308 } else if (is_simm(stdoffset, 31)) { 1309 const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset); 1310 const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset); 1311 1312 Register tmp = R11; 1313 addis(tmp, R1_SP, hi); 1314 if (UseLoadInstructionsForStackBangingPPC64) { 1315 ld(R0, lo, tmp); 1316 } else { 1317 std(R0, lo, tmp); 1318 } 1319 } else { 1320 ShouldNotReachHere(); 1321 } 1322 } 1323 1324 // If instruction is a stack bang of the form 1325 // std R0, x(Ry), (see bang_stack_with_offset()) 1326 // stdu R1_SP, x(R1_SP), (see push_frame(), resize_frame()) 1327 // or stdux R1_SP, Rx, R1_SP (see push_frame(), resize_frame()) 1328 // return the banged address. Otherwise, return 0. 1329 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) { 1330 #ifdef LINUX 1331 ucontext_t* uc = (ucontext_t*) ucontext; 1332 int rs = inv_rs_field(instruction); 1333 int ra = inv_ra_field(instruction); 1334 if ( (is_ld(instruction) && rs == 0 && UseLoadInstructionsForStackBangingPPC64) 1335 || (is_std(instruction) && rs == 0 && !UseLoadInstructionsForStackBangingPPC64) 1336 || (is_stdu(instruction) && rs == 1)) { 1337 int ds = inv_ds_field(instruction); 1338 // return banged address 1339 return ds+(address)uc->uc_mcontext.regs->gpr[ra]; 1340 } else if (is_stdux(instruction) && rs == 1) { 1341 int rb = inv_rb_field(instruction); 1342 address sp = (address)uc->uc_mcontext.regs->gpr[1]; 1343 long rb_val = (long)uc->uc_mcontext.regs->gpr[rb]; 1344 return ra != 1 || rb_val >= 0 ? 
NULL // not a stack bang 1345 : sp + rb_val; // banged address 1346 } 1347 return NULL; // not a stack bang 1348 #else 1349 // workaround not needed on !LINUX :-) 1350 ShouldNotCallThis(); 1351 return NULL; 1352 #endif 1353 } 1354 1355 // CmpxchgX sets condition register to cmpX(current, compare). 1356 void MacroAssembler::cmpxchgw(ConditionRegister flag, Register dest_current_value, 1357 Register compare_value, Register exchange_value, 1358 Register addr_base, int semantics, bool cmpxchgx_hint, 1359 Register int_flag_success, bool contention_hint) { 1360 Label retry; 1361 Label failed; 1362 Label done; 1363 1364 // Save one branch if result is returned via register and 1365 // result register is different from the other ones. 1366 bool use_result_reg = (int_flag_success != noreg); 1367 bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value && 1368 int_flag_success != exchange_value && int_flag_success != addr_base); 1369 1370 // release/fence semantics 1371 if (semantics & MemBarRel) { 1372 release(); 1373 } 1374 1375 if (use_result_reg && preset_result_reg) { 1376 li(int_flag_success, 0); // preset (assume cas failed) 1377 } 1378 1379 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1380 if (contention_hint) { // Don't try to reserve if cmp fails. 
1381 lwz(dest_current_value, 0, addr_base); 1382 cmpw(flag, dest_current_value, compare_value); 1383 bne(flag, failed); 1384 } 1385 1386 // atomic emulation loop 1387 bind(retry); 1388 1389 lwarx(dest_current_value, addr_base, cmpxchgx_hint); 1390 cmpw(flag, dest_current_value, compare_value); 1391 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1392 bne_predict_not_taken(flag, failed); 1393 } else { 1394 bne( flag, failed); 1395 } 1396 // branch to done => (flag == ne), (dest_current_value != compare_value) 1397 // fall through => (flag == eq), (dest_current_value == compare_value) 1398 1399 stwcx_(exchange_value, addr_base); 1400 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1401 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1402 } else { 1403 bne( CCR0, retry); // StXcx_ sets CCR0. 1404 } 1405 // fall through => (flag == eq), (dest_current_value == compare_value), (swapped) 1406 1407 // Result in register (must do this at the end because int_flag_success can be the 1408 // same register as one above). 
1409 if (use_result_reg) { 1410 li(int_flag_success, 1); 1411 } 1412 1413 if (semantics & MemBarFenceAfter) { 1414 fence(); 1415 } else if (semantics & MemBarAcq) { 1416 isync(); 1417 } 1418 1419 if (use_result_reg && !preset_result_reg) { 1420 b(done); 1421 } 1422 1423 bind(failed); 1424 if (use_result_reg && !preset_result_reg) { 1425 li(int_flag_success, 0); 1426 } 1427 1428 bind(done); 1429 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1430 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1431 } 1432 1433 // Preforms atomic compare exchange: 1434 // if (compare_value == *addr_base) 1435 // *addr_base = exchange_value 1436 // int_flag_success = 1; 1437 // else 1438 // int_flag_success = 0; 1439 // 1440 // ConditionRegister flag = cmp(compare_value, *addr_base) 1441 // Register dest_current_value = *addr_base 1442 // Register compare_value Used to compare with value in memory 1443 // Register exchange_value Written to memory if compare_value == *addr_base 1444 // Register addr_base The memory location to compareXChange 1445 // Register int_flag_success Set to 1 if exchange_value was written to *addr_base 1446 // 1447 // To avoid the costly compare exchange the value is tested beforehand. 1448 // Several special cases exist to avoid that unnecessary information is generated. 1449 // 1450 void MacroAssembler::cmpxchgd(ConditionRegister flag, 1451 Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value, 1452 Register addr_base, int semantics, bool cmpxchgx_hint, 1453 Register int_flag_success, Label* failed_ext, bool contention_hint) { 1454 Label retry; 1455 Label failed_int; 1456 Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int; 1457 Label done; 1458 1459 // Save one branch if result is returned via register and result register is different from the other ones. 
1460 bool use_result_reg = (int_flag_success!=noreg); 1461 bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() && 1462 int_flag_success!=exchange_value && int_flag_success!=addr_base); 1463 assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both"); 1464 1465 // release/fence semantics 1466 if (semantics & MemBarRel) { 1467 release(); 1468 } 1469 1470 if (use_result_reg && preset_result_reg) { 1471 li(int_flag_success, 0); // preset (assume cas failed) 1472 } 1473 1474 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1475 if (contention_hint) { // Don't try to reserve if cmp fails. 1476 ld(dest_current_value, 0, addr_base); 1477 cmpd(flag, compare_value, dest_current_value); 1478 bne(flag, failed); 1479 } 1480 1481 // atomic emulation loop 1482 bind(retry); 1483 1484 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1485 cmpd(flag, compare_value, dest_current_value); 1486 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1487 bne_predict_not_taken(flag, failed); 1488 } else { 1489 bne( flag, failed); 1490 } 1491 1492 stdcx_(exchange_value, addr_base); 1493 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1494 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 1495 } else { 1496 bne( CCR0, retry); // stXcx_ sets CCR0 1497 } 1498 1499 // result in register (must do this at the end because int_flag_success can be the same register as one above) 1500 if (use_result_reg) { 1501 li(int_flag_success, 1); 1502 } 1503 1504 // POWER6 doesn't need isync in CAS. 1505 // Always emit isync to be on the safe side. 
1506 if (semantics & MemBarFenceAfter) { 1507 fence(); 1508 } else if (semantics & MemBarAcq) { 1509 isync(); 1510 } 1511 1512 if (use_result_reg && !preset_result_reg) { 1513 b(done); 1514 } 1515 1516 bind(failed_int); 1517 if (use_result_reg && !preset_result_reg) { 1518 li(int_flag_success, 0); 1519 } 1520 1521 bind(done); 1522 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1523 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1524 } 1525 1526 // Look up the method for a megamorphic invokeinterface call. 1527 // The target method is determined by <intf_klass, itable_index>. 1528 // The receiver klass is in recv_klass. 1529 // On success, the result will be in method_result, and execution falls through. 1530 // On failure, execution transfers to the given label. 1531 void MacroAssembler::lookup_interface_method(Register recv_klass, 1532 Register intf_klass, 1533 RegisterOrConstant itable_index, 1534 Register method_result, 1535 Register scan_temp, 1536 Register sethi_temp, 1537 Label& L_no_such_interface) { 1538 assert_different_registers(recv_klass, intf_klass, method_result, scan_temp); 1539 assert(itable_index.is_constant() || itable_index.as_register() == method_result, 1540 "caller must use same register for non-constant itable index as for method"); 1541 1542 // Compute start of first itableOffsetEntry (which is at the end of the vtable). 1543 int vtable_base = InstanceKlass::vtable_start_offset() * wordSize; 1544 int itentry_off = itableMethodEntry::method_offset_in_bytes(); 1545 int logMEsize = exact_log2(itableMethodEntry::size() * wordSize); 1546 int scan_step = itableOffsetEntry::size() * wordSize; 1547 int log_vte_size= exact_log2(vtableEntry::size() * wordSize); 1548 1549 lwz(scan_temp, InstanceKlass::vtable_length_offset() * wordSize, recv_klass); 1550 // %%% We should store the aligned, prescaled offset in the klassoop. 1551 // Then the next several instructions would fold away. 
1552 1553 sldi(scan_temp, scan_temp, log_vte_size); 1554 addi(scan_temp, scan_temp, vtable_base); 1555 add(scan_temp, recv_klass, scan_temp); 1556 1557 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 1558 if (itable_index.is_register()) { 1559 Register itable_offset = itable_index.as_register(); 1560 sldi(itable_offset, itable_offset, logMEsize); 1561 if (itentry_off) addi(itable_offset, itable_offset, itentry_off); 1562 add(recv_klass, itable_offset, recv_klass); 1563 } else { 1564 long itable_offset = (long)itable_index.as_constant(); 1565 load_const_optimized(sethi_temp, (itable_offset<<logMEsize)+itentry_off); // static address, no relocation 1566 add(recv_klass, sethi_temp, recv_klass); 1567 } 1568 1569 // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) { 1570 // if (scan->interface() == intf) { 1571 // result = (klass + scan->offset() + itable_index); 1572 // } 1573 // } 1574 Label search, found_method; 1575 1576 for (int peel = 1; peel >= 0; peel--) { 1577 // %%%% Could load both offset and interface in one ldx, if they were 1578 // in the opposite order. This would save a load. 1579 ld(method_result, itableOffsetEntry::interface_offset_in_bytes(), scan_temp); 1580 1581 // Check that this entry is non-null. A null entry means that 1582 // the receiver class doesn't implement the interface, and wasn't the 1583 // same as when the caller was compiled. 1584 cmpd(CCR0, method_result, intf_klass); 1585 1586 if (peel) { 1587 beq(CCR0, found_method); 1588 } else { 1589 bne(CCR0, search); 1590 // (invert the test to fall through to found_method...) 1591 } 1592 1593 if (!peel) break; 1594 1595 bind(search); 1596 1597 cmpdi(CCR0, method_result, 0); 1598 beq(CCR0, L_no_such_interface); 1599 addi(scan_temp, scan_temp, scan_step); 1600 } 1601 1602 bind(found_method); 1603 1604 // Got a hit. 
1605 int ito_offset = itableOffsetEntry::offset_offset_in_bytes(); 1606 lwz(scan_temp, ito_offset, scan_temp); 1607 ldx(method_result, scan_temp, recv_klass); 1608 } 1609 1610 // virtual method calling 1611 void MacroAssembler::lookup_virtual_method(Register recv_klass, 1612 RegisterOrConstant vtable_index, 1613 Register method_result) { 1614 1615 assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg()); 1616 1617 const int base = InstanceKlass::vtable_start_offset() * wordSize; 1618 assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 1619 1620 if (vtable_index.is_register()) { 1621 sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord); 1622 add(recv_klass, vtable_index.as_register(), recv_klass); 1623 } else { 1624 addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord); 1625 } 1626 ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass); 1627 } 1628 1629 /////////////////////////////////////////// subtype checking //////////////////////////////////////////// 1630 1631 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 1632 Register super_klass, 1633 Register temp1_reg, 1634 Register temp2_reg, 1635 Label& L_success, 1636 Label& L_failure) { 1637 1638 const Register check_cache_offset = temp1_reg; 1639 const Register cached_super = temp2_reg; 1640 1641 assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super); 1642 1643 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1644 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1645 1646 // If the pointers are equal, we are done (e.g., String[] elements). 1647 // This self-check enables sharing of secondary supertype arrays among 1648 // non-primary types such as array-of-interface. Otherwise, each such 1649 // type would need its own customized SSA. 
1650 // We move this check to the front of the fast path because many 1651 // type checks are in fact trivially successful in this manner, 1652 // so we get a nicely predicted branch right at the start of the check. 1653 cmpd(CCR0, sub_klass, super_klass); 1654 beq(CCR0, L_success); 1655 1656 // Check the supertype display: 1657 lwz(check_cache_offset, sco_offset, super_klass); 1658 // The loaded value is the offset from KlassOopDesc. 1659 1660 ldx(cached_super, check_cache_offset, sub_klass); 1661 cmpd(CCR0, cached_super, super_klass); 1662 beq(CCR0, L_success); 1663 1664 // This check has worked decisively for primary supers. 1665 // Secondary supers are sought in the super_cache ('super_cache_addr'). 1666 // (Secondary supers are interfaces and very deeply nested subtypes.) 1667 // This works in the same check above because of a tricky aliasing 1668 // between the super_cache and the primary super display elements. 1669 // (The 'super_check_addr' can address either, as the case requires.) 1670 // Note that the cache is updated below if it does not help us find 1671 // what we need immediately. 1672 // So if it was a primary super, we can just fail immediately. 1673 // Otherwise, it's the slow path for us (no success at this point). 
1674 1675 cmpwi(CCR0, check_cache_offset, sc_offset); 1676 bne(CCR0, L_failure); 1677 // bind(slow_path); // fallthru 1678 } 1679 1680 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 1681 Register super_klass, 1682 Register temp1_reg, 1683 Register temp2_reg, 1684 Label* L_success, 1685 Register result_reg) { 1686 const Register array_ptr = temp1_reg; // current value from cache array 1687 const Register temp = temp2_reg; 1688 1689 assert_different_registers(sub_klass, super_klass, array_ptr, temp); 1690 1691 int source_offset = in_bytes(Klass::secondary_supers_offset()); 1692 int target_offset = in_bytes(Klass::secondary_super_cache_offset()); 1693 1694 int length_offset = Array<Klass*>::length_offset_in_bytes(); 1695 int base_offset = Array<Klass*>::base_offset_in_bytes(); 1696 1697 Label hit, loop, failure, fallthru; 1698 1699 ld(array_ptr, source_offset, sub_klass); 1700 1701 //assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated."); 1702 lwz(temp, length_offset, array_ptr); 1703 cmpwi(CCR0, temp, 0); 1704 beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0 1705 1706 mtctr(temp); // load ctr 1707 1708 bind(loop); 1709 // Oops in table are NO MORE compressed. 
1710 ld(temp, base_offset, array_ptr); 1711 cmpd(CCR0, temp, super_klass); 1712 beq(CCR0, hit); 1713 addi(array_ptr, array_ptr, BytesPerWord); 1714 bdnz(loop); 1715 1716 bind(failure); 1717 if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss) 1718 b(fallthru); 1719 1720 bind(hit); 1721 std(super_klass, target_offset, sub_klass); // save result to cache 1722 if (result_reg != noreg) li(result_reg, 0); // load zero result (indicates a hit) 1723 if (L_success != NULL) b(*L_success); 1724 1725 bind(fallthru); 1726 } 1727 1728 // Try fast path, then go to slow one if not successful 1729 void MacroAssembler::check_klass_subtype(Register sub_klass, 1730 Register super_klass, 1731 Register temp1_reg, 1732 Register temp2_reg, 1733 Label& L_success) { 1734 Label L_failure; 1735 check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, L_success, L_failure); 1736 check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success); 1737 bind(L_failure); // Fallthru if not successful. 1738 } 1739 1740 void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg, 1741 Register temp_reg, 1742 Label& wrong_method_type) { 1743 assert_different_registers(mtype_reg, mh_reg, temp_reg); 1744 // Compare method type against that of the receiver. 1745 load_heap_oop_not_null(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg); 1746 cmpd(CCR0, temp_reg, mtype_reg); 1747 bne(CCR0, wrong_method_type); 1748 } 1749 1750 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot, 1751 Register temp_reg, 1752 int extra_slot_offset) { 1753 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 
1754 int stackElementSize = Interpreter::stackElementSize; 1755 int offset = extra_slot_offset * stackElementSize; 1756 if (arg_slot.is_constant()) { 1757 offset += arg_slot.as_constant() * stackElementSize; 1758 return offset; 1759 } else { 1760 assert(temp_reg != noreg, "must specify"); 1761 sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize)); 1762 if (offset != 0) 1763 addi(temp_reg, temp_reg, offset); 1764 return temp_reg; 1765 } 1766 } 1767 1768 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg, 1769 Register mark_reg, Register temp_reg, 1770 Register temp2_reg, Label& done, Label* slow_case) { 1771 assert(UseBiasedLocking, "why call this otherwise?"); 1772 1773 #ifdef ASSERT 1774 assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg); 1775 #endif 1776 1777 Label cas_label; 1778 1779 // Branch to done if fast path fails and no slow_case provided. 1780 Label *slow_case_int = (slow_case != NULL) ? slow_case : &done; 1781 1782 // Biased locking 1783 // See whether the lock is currently biased toward our thread and 1784 // whether the epoch is still valid 1785 // Note that the runtime guarantees sufficient alignment of JavaThread 1786 // pointers to allow age to be placed into low bits 1787 assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, 1788 "biased locking makes assumptions about bit layout"); 1789 1790 if (PrintBiasedLockingStatistics) { 1791 load_const(temp_reg, (address) BiasedLocking::total_entry_count_addr(), temp2_reg); 1792 lwz(temp2_reg, 0, temp_reg); 1793 addi(temp2_reg, temp2_reg, 1); 1794 stw(temp2_reg, 0, temp_reg); 1795 } 1796 1797 andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place); 1798 cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern); 1799 bne(cr_reg, cas_label); 1800 1801 load_klass(temp_reg, obj_reg); 1802 1803 load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place)); 1804 ld(temp_reg, 
in_bytes(Klass::prototype_header_offset()), temp_reg); 1805 orr(temp_reg, R16_thread, temp_reg); 1806 xorr(temp_reg, mark_reg, temp_reg); 1807 andr(temp_reg, temp_reg, temp2_reg); 1808 cmpdi(cr_reg, temp_reg, 0); 1809 if (PrintBiasedLockingStatistics) { 1810 Label l; 1811 bne(cr_reg, l); 1812 load_const(mark_reg, (address) BiasedLocking::biased_lock_entry_count_addr()); 1813 lwz(temp2_reg, 0, mark_reg); 1814 addi(temp2_reg, temp2_reg, 1); 1815 stw(temp2_reg, 0, mark_reg); 1816 // restore mark_reg 1817 ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg); 1818 bind(l); 1819 } 1820 beq(cr_reg, done); 1821 1822 Label try_revoke_bias; 1823 Label try_rebias; 1824 1825 // At this point we know that the header has the bias pattern and 1826 // that we are not the bias owner in the current epoch. We need to 1827 // figure out more details about the state of the header in order to 1828 // know what operations can be legally performed on the object's 1829 // header. 1830 1831 // If the low three bits in the xor result aren't clear, that means 1832 // the prototype header is no longer biased and we have to revoke 1833 // the bias on this object. 1834 andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 1835 cmpwi(cr_reg, temp2_reg, 0); 1836 bne(cr_reg, try_revoke_bias); 1837 1838 // Biasing is still enabled for this data type. See whether the 1839 // epoch of the current bias is still valid, meaning that the epoch 1840 // bits of the mark word are equal to the epoch bits of the 1841 // prototype header. (Note that the prototype header's epoch bits 1842 // only change at a safepoint.) If not, attempt to rebias the object 1843 // toward the current thread. Note that we must be absolutely sure 1844 // that the current epoch is invalid in order to do this because 1845 // otherwise the manipulations it performs on the mark word are 1846 // illegal. 
1847 1848 int shift_amount = 64 - markOopDesc::epoch_shift; 1849 // rotate epoch bits to right (little) end and set other bits to 0 1850 // [ big part | epoch | little part ] -> [ 0..0 | epoch ] 1851 rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits); 1852 // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented 1853 bne(CCR0, try_rebias); 1854 1855 // The epoch of the current bias is still valid but we know nothing 1856 // about the owner; it might be set or it might be clear. Try to 1857 // acquire the bias of the object using an atomic operation. If this 1858 // fails we will go in to the runtime to revoke the object's bias. 1859 // Note that we first construct the presumed unbiased header so we 1860 // don't accidentally blow away another thread's valid bias. 1861 andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place | 1862 markOopDesc::age_mask_in_place | 1863 markOopDesc::epoch_mask_in_place)); 1864 orr(temp_reg, R16_thread, mark_reg); 1865 1866 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 1867 1868 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg). 1869 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 1870 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 1871 /*where=*/obj_reg, 1872 MacroAssembler::MemBarAcq, 1873 MacroAssembler::cmpxchgx_hint_acquire_lock(), 1874 noreg, slow_case_int); // bail out if failed 1875 1876 // If the biasing toward our thread failed, this means that 1877 // another thread succeeded in biasing it toward itself and we 1878 // need to revoke that bias. The revocation will occur in the 1879 // interpreter runtime in the slow case. 
1880 if (PrintBiasedLockingStatistics) { 1881 load_const(temp_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp2_reg); 1882 lwz(temp2_reg, 0, temp_reg); 1883 addi(temp2_reg, temp2_reg, 1); 1884 stw(temp2_reg, 0, temp_reg); 1885 } 1886 b(done); 1887 1888 bind(try_rebias); 1889 // At this point we know the epoch has expired, meaning that the 1890 // current "bias owner", if any, is actually invalid. Under these 1891 // circumstances _only_, we are allowed to use the current header's 1892 // value as the comparison value when doing the cas to acquire the 1893 // bias in the current epoch. In other words, we allow transfer of 1894 // the bias from one thread to another directly in this situation. 1895 andi(temp_reg, mark_reg, markOopDesc::age_mask_in_place); 1896 orr(temp_reg, R16_thread, temp_reg); 1897 load_klass(temp2_reg, obj_reg); 1898 ld(temp2_reg, in_bytes(Klass::prototype_header_offset()), temp2_reg); 1899 orr(temp_reg, temp_reg, temp2_reg); 1900 1901 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 1902 1903 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg). 1904 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 1905 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 1906 /*where=*/obj_reg, 1907 MacroAssembler::MemBarAcq, 1908 MacroAssembler::cmpxchgx_hint_acquire_lock(), 1909 noreg, slow_case_int); // bail out if failed 1910 1911 // If the biasing toward our thread failed, this means that 1912 // another thread succeeded in biasing it toward itself and we 1913 // need to revoke that bias. The revocation will occur in the 1914 // interpreter runtime in the slow case. 
1915 if (PrintBiasedLockingStatistics) { 1916 load_const(temp_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp2_reg); 1917 lwz(temp2_reg, 0, temp_reg); 1918 addi(temp2_reg, temp2_reg, 1); 1919 stw(temp2_reg, 0, temp_reg); 1920 } 1921 b(done); 1922 1923 bind(try_revoke_bias); 1924 // The prototype mark in the klass doesn't have the bias bit set any 1925 // more, indicating that objects of this data type are not supposed 1926 // to be biased any more. We are going to try to reset the mark of 1927 // this object to the prototype value and fall through to the 1928 // CAS-based locking scheme. Note that if our CAS fails, it means 1929 // that another thread raced us for the privilege of revoking the 1930 // bias of this particular object, so it's okay to continue in the 1931 // normal locking code. 1932 load_klass(temp_reg, obj_reg); 1933 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 1934 andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place); 1935 orr(temp_reg, temp_reg, temp2_reg); 1936 1937 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 1938 1939 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg). 1940 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 1941 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 1942 /*where=*/obj_reg, 1943 MacroAssembler::MemBarAcq, 1944 MacroAssembler::cmpxchgx_hint_acquire_lock()); 1945 1946 // reload markOop in mark_reg before continuing with lightweight locking 1947 ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg); 1948 1949 // Fall through to the normal CAS-based lock, because no matter what 1950 // the result of the above CAS, some thread must have succeeded in 1951 // removing the bias bit from the object's header. 
1952 if (PrintBiasedLockingStatistics) { 1953 Label l; 1954 bne(cr_reg, l); 1955 load_const(temp_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp2_reg); 1956 lwz(temp2_reg, 0, temp_reg); 1957 addi(temp2_reg, temp2_reg, 1); 1958 stw(temp2_reg, 0, temp_reg); 1959 bind(l); 1960 } 1961 1962 bind(cas_label); 1963 } 1964 1965 void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) { 1966 // Check for biased locking unlock case, which is a no-op 1967 // Note: we do not have to check the thread ID for two reasons. 1968 // First, the interpreter checks for IllegalMonitorStateException at 1969 // a higher level. Second, if the bias was revoked while we held the 1970 // lock, the object could not be rebiased toward another thread, so 1971 // the bias bit would be clear. 1972 1973 ld(temp_reg, 0, mark_addr); 1974 andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 1975 1976 cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern); 1977 beq(cr_reg, done); 1978 } 1979 1980 // TM on PPC64. 
1981 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) { 1982 Label retry; 1983 bind(retry); 1984 ldarx(result, addr, /*hint*/ false); 1985 addi(result, result, simm16); 1986 stdcx_(result, addr); 1987 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1988 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 1989 } else { 1990 bne( CCR0, retry); // stXcx_ sets CCR0 1991 } 1992 } 1993 1994 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) { 1995 Label retry; 1996 bind(retry); 1997 lwarx(result, addr, /*hint*/ false); 1998 ori(result, result, uimm16); 1999 stwcx_(result, addr); 2000 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2001 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2002 } else { 2003 bne( CCR0, retry); // stXcx_ sets CCR0 2004 } 2005 } 2006 2007 #if INCLUDE_RTM_OPT 2008 2009 // Update rtm_counters based on abort status 2010 // input: abort_status 2011 // rtm_counters (RTMLockingCounters*) 2012 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) { 2013 // Mapping to keep PreciseRTMLockingStatistics similar to x86. 2014 // x86 ppc (! means inverted, ? means not the same) 2015 // 0 31 Set if abort caused by XABORT instruction. 2016 // 1 ! 7 If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set. 2017 // 2 13 Set if another logical processor conflicted with a memory address that was part of the transaction that aborted. 2018 // 3 10 Set if an internal buffer overflowed. 2019 // 4 ?12 Set if a debug breakpoint was hit. 2020 // 5 ?32 Set if an abort occurred during execution of a nested transaction. 2021 const int tm_failure_bit[] = {Assembler::tm_tabort, // Note: Seems like signal handler sets this, too. 
2022 Assembler::tm_failure_persistent, // inverted: transient 2023 Assembler::tm_trans_cf, 2024 Assembler::tm_footprint_of, 2025 Assembler::tm_non_trans_cf, 2026 Assembler::tm_suspended}; 2027 const bool tm_failure_inv[] = {false, true, false, false, false, false}; 2028 assert(sizeof(tm_failure_bit)/sizeof(int) == RTMLockingCounters::ABORT_STATUS_LIMIT, "adapt mapping!"); 2029 2030 const Register addr_Reg = R0; 2031 // Keep track of offset to where rtm_counters_Reg had pointed to. 2032 int counters_offs = RTMLockingCounters::abort_count_offset(); 2033 addi(addr_Reg, rtm_counters_Reg, counters_offs); 2034 const Register temp_Reg = rtm_counters_Reg; 2035 2036 //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically 2037 ldx(temp_Reg, addr_Reg); 2038 addi(temp_Reg, temp_Reg, 1); 2039 stdx(temp_Reg, addr_Reg); 2040 2041 if (PrintPreciseRTMLockingStatistics) { 2042 int counters_offs_delta = RTMLockingCounters::abortX_count_offset() - counters_offs; 2043 2044 //mftexasr(abort_status); done by caller 2045 for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) { 2046 counters_offs += counters_offs_delta; 2047 li(temp_Reg, counters_offs_delta); // can't use addi with R0 2048 add(addr_Reg, addr_Reg, temp_Reg); // point to next counter 2049 counters_offs_delta = sizeof(uintx); 2050 2051 Label check_abort; 2052 rldicr_(temp_Reg, abort_status, tm_failure_bit[i], 0); 2053 if (tm_failure_inv[i]) { 2054 bne(CCR0, check_abort); 2055 } else { 2056 beq(CCR0, check_abort); 2057 } 2058 //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically 2059 ldx(temp_Reg, addr_Reg); 2060 addi(temp_Reg, temp_Reg, 1); 2061 stdx(temp_Reg, addr_Reg); 2062 bind(check_abort); 2063 } 2064 } 2065 li(temp_Reg, -counters_offs); // can't use addi with R0 2066 add(rtm_counters_Reg, addr_Reg, temp_Reg); // restore 2067 } 2068 2069 // Branch if (random & (count-1) != 0), count is 2^n 2070 // tmp and CR0 are killed 2071 void MacroAssembler::branch_on_random_using_tb(Register tmp, 
int count, Label& brLabel) { 2072 mftb(tmp); 2073 andi_(tmp, tmp, count-1); 2074 bne(CCR0, brLabel); 2075 } 2076 2077 // Perform abort ratio calculation, set no_rtm bit if high ratio. 2078 // input: rtm_counters_Reg (RTMLockingCounters* address) - KILLED 2079 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg, 2080 RTMLockingCounters* rtm_counters, 2081 Metadata* method_data) { 2082 Label L_done, L_check_always_rtm1, L_check_always_rtm2; 2083 2084 if (RTMLockingCalculationDelay > 0) { 2085 // Delay calculation. 2086 ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr()); 2087 cmpdi(CCR0, rtm_counters_Reg, 0); 2088 beq(CCR0, L_done); 2089 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2090 } 2091 // Abort ratio calculation only if abort_count > RTMAbortThreshold. 2092 // Aborted transactions = abort_count * 100 2093 // All transactions = total_count * RTMTotalCountIncrRate 2094 // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio) 2095 ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg); 2096 cmpdi(CCR0, R0, RTMAbortThreshold); 2097 blt(CCR0, L_check_always_rtm2); 2098 mulli(R0, R0, 100); 2099 2100 const Register tmpReg = rtm_counters_Reg; 2101 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2102 mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); 2103 mulli(tmpReg, tmpReg, RTMAbortRatio); 2104 cmpd(CCR0, R0, tmpReg); 2105 blt(CCR0, L_check_always_rtm1); // jump to reload 2106 if (method_data != NULL) { 2107 // Set rtm_state to "no rtm" in MDO. 2108 // Not using a metadata relocation. Method and Class Loader are kept alive anyway. 2109 // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.) 
2110 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2111 atomic_ori_int(R0, tmpReg, NoRTM); 2112 } 2113 b(L_done); 2114 2115 bind(L_check_always_rtm1); 2116 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2117 bind(L_check_always_rtm2); 2118 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2119 cmpdi(CCR0, tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate); 2120 blt(CCR0, L_done); 2121 if (method_data != NULL) { 2122 // Set rtm_state to "always rtm" in MDO. 2123 // Not using a metadata relocation. See above. 2124 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2125 atomic_ori_int(R0, tmpReg, UseRTM); 2126 } 2127 bind(L_done); 2128 } 2129 2130 // Update counters and perform abort ratio calculation. 2131 // input: abort_status_Reg 2132 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg, 2133 RTMLockingCounters* rtm_counters, 2134 Metadata* method_data, 2135 bool profile_rtm) { 2136 2137 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2138 // Update rtm counters based on state at abort. 2139 // Reads abort_status_Reg, updates flags. 2140 assert_different_registers(abort_status_Reg, temp_Reg); 2141 load_const_optimized(temp_Reg, (address)rtm_counters, R0); 2142 rtm_counters_update(abort_status_Reg, temp_Reg); 2143 if (profile_rtm) { 2144 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2145 rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data); 2146 } 2147 } 2148 2149 // Retry on abort if abort's status indicates non-persistent failure. 
2150 // inputs: retry_count_Reg 2151 // : abort_status_Reg 2152 // output: retry_count_Reg decremented by 1 2153 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, 2154 Label& retryLabel, Label* checkRetry) { 2155 Label doneRetry; 2156 rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0); 2157 bne(CCR0, doneRetry); 2158 if (checkRetry) { bind(*checkRetry); } 2159 addic_(retry_count_Reg, retry_count_Reg, -1); 2160 blt(CCR0, doneRetry); 2161 smt_yield(); // Can't use wait(). No permission (SIGILL). 2162 b(retryLabel); 2163 bind(doneRetry); 2164 } 2165 2166 // Spin and retry if lock is busy. 2167 // inputs: box_Reg (monitor address) 2168 // : retry_count_Reg 2169 // output: retry_count_Reg decremented by 1 2170 // CTR is killed 2171 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) { 2172 Label SpinLoop, doneRetry; 2173 addic_(retry_count_Reg, retry_count_Reg, -1); 2174 blt(CCR0, doneRetry); 2175 li(R0, RTMSpinLoopCount); 2176 mtctr(R0); 2177 2178 bind(SpinLoop); 2179 smt_yield(); // Can't use waitrsv(). No permission (SIGILL). 2180 bdz(retryLabel); 2181 ld(R0, 0, owner_addr_Reg); 2182 cmpdi(CCR0, R0, 0); 2183 bne(CCR0, SpinLoop); 2184 b(retryLabel); 2185 2186 bind(doneRetry); 2187 } 2188 2189 // Use RTM for normal stack locks. 
2190 // Input: objReg (object to lock) 2191 void MacroAssembler::rtm_stack_locking(ConditionRegister flag, 2192 Register obj, Register mark_word, Register tmp, 2193 Register retry_on_abort_count_Reg, 2194 RTMLockingCounters* stack_rtm_counters, 2195 Metadata* method_data, bool profile_rtm, 2196 Label& DONE_LABEL, Label& IsInflated) { 2197 assert(UseRTMForStackLocks, "why call this otherwise?"); 2198 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); 2199 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2200 2201 if (RTMRetryCount > 0) { 2202 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 2203 bind(L_rtm_retry); 2204 } 2205 andi_(R0, mark_word, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased 2206 bne(CCR0, IsInflated); 2207 2208 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2209 Label L_noincrement; 2210 if (RTMTotalCountIncrRate > 1) { 2211 branch_on_random_using_tb(tmp, (int)RTMTotalCountIncrRate, L_noincrement); 2212 } 2213 assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM"); 2214 load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0); 2215 //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically 2216 ldx(mark_word, tmp); 2217 addi(mark_word, mark_word, 1); 2218 stdx(mark_word, tmp); 2219 bind(L_noincrement); 2220 } 2221 tbegin_(); 2222 beq(CCR0, L_on_abort); 2223 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // Reload in transaction, conflicts need to be tracked. 
2224 andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits 2225 cmpwi(flag, R0, markOopDesc::unlocked_value); // bits = 001 unlocked 2226 beq(flag, DONE_LABEL); // all done if unlocked 2227 2228 if (UseRTMXendForLockBusy) { 2229 tend_(); 2230 b(L_decrement_retry); 2231 } else { 2232 tabort_(); 2233 } 2234 bind(L_on_abort); 2235 const Register abort_status_Reg = tmp; 2236 mftexasr(abort_status_Reg); 2237 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2238 rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm); 2239 } 2240 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload 2241 if (RTMRetryCount > 0) { 2242 // Retry on lock abort if abort status is not permanent. 2243 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry); 2244 } else { 2245 bind(L_decrement_retry); 2246 } 2247 } 2248 2249 // Use RTM for inflating locks 2250 // inputs: obj (object to lock) 2251 // mark_word (current header - KILLED) 2252 // boxReg (on-stack box address (displaced header location) - KILLED) 2253 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag, 2254 Register obj, Register mark_word, Register boxReg, 2255 Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg, 2256 RTMLockingCounters* rtm_counters, 2257 Metadata* method_data, bool profile_rtm, 2258 Label& DONE_LABEL) { 2259 assert(UseRTMLocking, "why call this otherwise?"); 2260 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2261 // Clean monitor_value bit to get valid pointer. 2262 int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value; 2263 2264 // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark(). 
2265 std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg); 2266 const Register tmpReg = boxReg; 2267 const Register owner_addr_Reg = mark_word; 2268 addi(owner_addr_Reg, mark_word, owner_offset); 2269 2270 if (RTMRetryCount > 0) { 2271 load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy. 2272 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort. 2273 bind(L_rtm_retry); 2274 } 2275 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2276 Label L_noincrement; 2277 if (RTMTotalCountIncrRate > 1) { 2278 branch_on_random_using_tb(R0, (int)RTMTotalCountIncrRate, L_noincrement); 2279 } 2280 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2281 load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg); 2282 //atomic_inc_ptr(R0, tmpReg); We don't increment atomically 2283 ldx(tmpReg, R0); 2284 addi(tmpReg, tmpReg, 1); 2285 stdx(tmpReg, R0); 2286 bind(L_noincrement); 2287 } 2288 tbegin_(); 2289 beq(CCR0, L_on_abort); 2290 // We don't reload mark word. Will only be reset at safepoint. 2291 ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked. 2292 cmpdi(flag, R0, 0); 2293 beq(flag, DONE_LABEL); 2294 2295 if (UseRTMXendForLockBusy) { 2296 tend_(); 2297 b(L_decrement_retry); 2298 } else { 2299 tabort_(); 2300 } 2301 bind(L_on_abort); 2302 const Register abort_status_Reg = tmpReg; 2303 mftexasr(abort_status_Reg); 2304 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2305 rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm); 2306 // Restore owner_addr_Reg 2307 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); 2308 #ifdef ASSERT 2309 andi_(R0, mark_word, markOopDesc::monitor_value); 2310 asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint. 
2311 #endif 2312 addi(owner_addr_Reg, mark_word, owner_offset); 2313 } 2314 if (RTMRetryCount > 0) { 2315 // Retry on lock abort if abort status is not permanent. 2316 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); 2317 } 2318 2319 // Appears unlocked - try to swing _owner from null to non-null. 2320 cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg, 2321 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2322 MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true); 2323 2324 if (RTMRetryCount > 0) { 2325 // success done else retry 2326 b(DONE_LABEL); 2327 bind(L_decrement_retry); 2328 // Spin and retry if lock is busy. 2329 rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry); 2330 } else { 2331 bind(L_decrement_retry); 2332 } 2333 } 2334 2335 #endif // INCLUDE_RTM_OPT 2336 2337 // "The box" is the space on the stack where we copy the object mark. 2338 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box, 2339 Register temp, Register displaced_header, Register current_header, 2340 bool try_bias, 2341 RTMLockingCounters* rtm_counters, 2342 RTMLockingCounters* stack_rtm_counters, 2343 Metadata* method_data, 2344 bool use_rtm, bool profile_rtm) { 2345 assert_different_registers(oop, box, temp, displaced_header, current_header); 2346 assert(flag != CCR0, "bad condition register"); 2347 Label cont; 2348 Label object_has_monitor; 2349 Label cas_failed; 2350 2351 // Load markOop from object into displaced_header. 2352 ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop); 2353 2354 2355 // Always do locking in runtime. 2356 if (EmitSync & 0x01) { 2357 cmpdi(flag, oop, 0); // Oop can't be 0 here => always false. 
2358 return; 2359 } 2360 2361 if (try_bias) { 2362 biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont); 2363 } 2364 2365 #if INCLUDE_RTM_OPT 2366 if (UseRTMForStackLocks && use_rtm) { 2367 rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header, 2368 stack_rtm_counters, method_data, profile_rtm, 2369 cont, object_has_monitor); 2370 } 2371 #endif // INCLUDE_RTM_OPT 2372 2373 // Handle existing monitor. 2374 if ((EmitSync & 0x02) == 0) { 2375 // The object has an existing monitor iff (mark & monitor_value) != 0. 2376 andi_(temp, displaced_header, markOopDesc::monitor_value); 2377 bne(CCR0, object_has_monitor); 2378 } 2379 2380 // Set displaced_header to be (markOop of object | UNLOCK_VALUE). 2381 ori(displaced_header, displaced_header, markOopDesc::unlocked_value); 2382 2383 // Load Compare Value application register. 2384 2385 // Initialize the box. (Must happen before we update the object mark!) 2386 std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2387 2388 // Must fence, otherwise, preceding store(s) may float below cmpxchg. 2389 // Compare object markOop with mark and if equal exchange scratch1 with object markOop. 2390 // CmpxchgX sets cr_reg to cmpX(current, displaced). 2391 membar(Assembler::StoreStore); 2392 cmpxchgd(/*flag=*/flag, 2393 /*current_value=*/current_header, 2394 /*compare_value=*/displaced_header, 2395 /*exchange_value=*/box, 2396 /*where=*/oop, 2397 MacroAssembler::MemBarAcq, 2398 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2399 noreg, 2400 &cas_failed); 2401 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2402 2403 // If the compare-and-exchange succeeded, then we found an unlocked 2404 // object and we have now locked it. 2405 b(cont); 2406 2407 bind(cas_failed); 2408 // We did not see an unlocked object so try the fast recursive case. 
2409 2410 // Check if the owner is self by comparing the value in the markOop of object 2411 // (current_header) with the stack pointer. 2412 sub(current_header, current_header, R1_SP); 2413 load_const_optimized(temp, (address) (~(os::vm_page_size()-1) | 2414 markOopDesc::lock_mask_in_place)); 2415 2416 and_(R0/*==0?*/, current_header, temp); 2417 // If condition is true we are cont and hence we can store 0 as the 2418 // displaced header in the box, which indicates that it is a recursive lock. 2419 mcrf(flag,CCR0); 2420 std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box); 2421 2422 // Handle existing monitor. 2423 if ((EmitSync & 0x02) == 0) { 2424 b(cont); 2425 2426 bind(object_has_monitor); 2427 // The object's monitor m is unlocked iff m->owner == NULL, 2428 // otherwise m->owner may contain a thread or a stack address. 2429 2430 #if INCLUDE_RTM_OPT 2431 // Use the same RTM locking code in 32- and 64-bit VM. 2432 if (use_rtm) { 2433 rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header, 2434 rtm_counters, method_data, profile_rtm, cont); 2435 } else { 2436 #endif // INCLUDE_RTM_OPT 2437 2438 // Try to CAS m->owner from NULL to current thread. 2439 addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value); 2440 li(displaced_header, 0); 2441 // CmpxchgX sets flag to cmpX(current, displaced). 2442 cmpxchgd(/*flag=*/flag, 2443 /*current_value=*/current_header, 2444 /*compare_value=*/(intptr_t)0, 2445 /*exchange_value=*/R16_thread, 2446 /*where=*/temp, 2447 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2448 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2449 2450 // Store a non-null value into the box. 2451 std(box, BasicLock::displaced_header_offset_in_bytes(), box); 2452 2453 # ifdef ASSERT 2454 bne(flag, cont); 2455 // We have acquired the monitor, check some invariants. 
2456 addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes()); 2457 // Invariant 1: _recursions should be 0. 2458 //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size"); 2459 asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp, 2460 "monitor->_recursions should be 0", -1); 2461 // Invariant 2: OwnerIsThread shouldn't be 0. 2462 //assert(ObjectMonitor::OwnerIsThread_size_in_bytes() == 4, "unexpected size"); 2463 //asm_assert_mem4_isnot_zero(ObjectMonitor::OwnerIsThread_offset_in_bytes(), temp, 2464 // "monitor->OwnerIsThread shouldn't be 0", -1); 2465 # endif 2466 2467 #if INCLUDE_RTM_OPT 2468 } // use_rtm() 2469 #endif 2470 } 2471 2472 bind(cont); 2473 // flag == EQ indicates success 2474 // flag == NE indicates failure 2475 } 2476 2477 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box, 2478 Register temp, Register displaced_header, Register current_header, 2479 bool try_bias, bool use_rtm) { 2480 assert_different_registers(oop, box, temp, displaced_header, current_header); 2481 assert(flag != CCR0, "bad condition register"); 2482 Label cont; 2483 Label object_has_monitor; 2484 2485 // Always do locking in runtime. 2486 if (EmitSync & 0x01) { 2487 cmpdi(flag, oop, 0); // Oop can't be 0 here => always false. 2488 return; 2489 } 2490 2491 if (try_bias) { 2492 biased_locking_exit(flag, oop, current_header, cont); 2493 } 2494 2495 #if INCLUDE_RTM_OPT 2496 if (UseRTMForStackLocks && use_rtm) { 2497 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); 2498 Label L_regular_unlock; 2499 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); // fetch markword 2500 andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits 2501 cmpwi(flag, R0, markOopDesc::unlocked_value); // bits = 001 unlocked 2502 bne(flag, L_regular_unlock); // else RegularLock 2503 tend_(); // otherwise end... 2504 b(cont); // ... 
and we're done 2505 bind(L_regular_unlock); 2506 } 2507 #endif 2508 2509 // Find the lock address and load the displaced header from the stack. 2510 ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2511 2512 // If the displaced header is 0, we have a recursive unlock. 2513 cmpdi(flag, displaced_header, 0); 2514 beq(flag, cont); 2515 2516 // Handle existing monitor. 2517 if ((EmitSync & 0x02) == 0) { 2518 // The object has an existing monitor iff (mark & monitor_value) != 0. 2519 RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done 2520 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); 2521 andi_(R0, current_header, markOopDesc::monitor_value); 2522 bne(CCR0, object_has_monitor); 2523 } 2524 2525 // Check if it is still a light weight lock, this is is true if we see 2526 // the stack address of the basicLock in the markOop of the object. 2527 // Cmpxchg sets flag to cmpd(current_header, box). 2528 cmpxchgd(/*flag=*/flag, 2529 /*current_value=*/current_header, 2530 /*compare_value=*/box, 2531 /*exchange_value=*/displaced_header, 2532 /*where=*/oop, 2533 MacroAssembler::MemBarRel, 2534 MacroAssembler::cmpxchgx_hint_release_lock(), 2535 noreg, 2536 &cont); 2537 2538 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2539 2540 // Handle existing monitor. 2541 if ((EmitSync & 0x02) == 0) { 2542 b(cont); 2543 2544 bind(object_has_monitor); 2545 addi(current_header, current_header, -markOopDesc::monitor_value); // monitor 2546 ld(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); 2547 2548 // It's inflated. 
2549 #if INCLUDE_RTM_OPT 2550 if (use_rtm) { 2551 Label L_regular_inflated_unlock; 2552 // Clean monitor_value bit to get valid pointer 2553 cmpdi(flag, temp, 0); 2554 bne(flag, L_regular_inflated_unlock); 2555 tend_(); 2556 b(cont); 2557 bind(L_regular_inflated_unlock); 2558 } 2559 #endif 2560 2561 ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header); 2562 xorr(temp, R16_thread, temp); // Will be 0 if we are the owner. 2563 orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions. 2564 cmpdi(flag, temp, 0); 2565 bne(flag, cont); 2566 2567 ld(temp, ObjectMonitor::EntryList_offset_in_bytes(), current_header); 2568 ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header); 2569 orr(temp, temp, displaced_header); // Will be 0 if both are 0. 2570 cmpdi(flag, temp, 0); 2571 bne(flag, cont); 2572 release(); 2573 std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); 2574 } 2575 2576 bind(cont); 2577 // flag == EQ indicates success 2578 // flag == NE indicates failure 2579 } 2580 2581 // Write serialization page so VM thread can do a pseudo remote membar. 2582 // We use the current thread pointer to calculate a thread specific 2583 // offset to write to within the page. This minimizes bus traffic 2584 // due to cache line collision. 2585 void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) { 2586 srdi(tmp2, thread, os::get_serialize_page_shift_count()); 2587 2588 int mask = os::vm_page_size() - sizeof(int); 2589 if (Assembler::is_simm(mask, 16)) { 2590 andi(tmp2, tmp2, mask); 2591 } else { 2592 lis(tmp1, (int)((signed short) (mask >> 16))); 2593 ori(tmp1, tmp1, mask & 0x0000ffff); 2594 andr(tmp2, tmp2, tmp1); 2595 } 2596 2597 load_const(tmp1, (long) os::get_memory_serialize_page()); 2598 release(); 2599 stwx(R0, tmp1, tmp2); 2600 } 2601 2602 2603 // GC barrier helper macros 2604 2605 // Write the card table byte if needed. 
2606 void MacroAssembler::card_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp) { 2607 CardTableModRefBS* bs = 2608 barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set()); 2609 assert(bs->kind() == BarrierSet::CardTableForRS || 2610 bs->kind() == BarrierSet::CardTableExtension, "wrong barrier"); 2611 #ifdef ASSERT 2612 cmpdi(CCR0, Rnew_val, 0); 2613 asm_assert_ne("null oop not allowed", 0x321); 2614 #endif 2615 card_table_write(bs->byte_map_base, Rtmp, Rstore_addr); 2616 } 2617 2618 // Write the card table byte. 2619 void MacroAssembler::card_table_write(jbyte* byte_map_base, Register Rtmp, Register Robj) { 2620 assert_different_registers(Robj, Rtmp, R0); 2621 load_const_optimized(Rtmp, (address)byte_map_base, R0); 2622 srdi(Robj, Robj, CardTableModRefBS::card_shift); 2623 li(R0, 0); // dirty 2624 if (UseConcMarkSweepGC) membar(Assembler::StoreStore); 2625 stbx(R0, Rtmp, Robj); 2626 } 2627 2628 #if INCLUDE_ALL_GCS 2629 // General G1 pre-barrier generator. 2630 // Goal: record the previous value if it is not null. 2631 void MacroAssembler::g1_write_barrier_pre(Register Robj, RegisterOrConstant offset, Register Rpre_val, 2632 Register Rtmp1, Register Rtmp2, bool needs_frame) { 2633 Label runtime, filtered; 2634 2635 // Is marking active? 2636 if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) { 2637 lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread); 2638 } else { 2639 guarantee(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption"); 2640 lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread); 2641 } 2642 cmpdi(CCR0, Rtmp1, 0); 2643 beq(CCR0, filtered); 2644 2645 // Do we need to load the previous value? 2646 if (Robj != noreg) { 2647 // Load the previous value... 
2648 if (UseCompressedOops) { 2649 lwz(Rpre_val, offset, Robj); 2650 } else { 2651 ld(Rpre_val, offset, Robj); 2652 } 2653 // Previous value has been loaded into Rpre_val. 2654 } 2655 assert(Rpre_val != noreg, "must have a real register"); 2656 2657 // Is the previous value null? 2658 cmpdi(CCR0, Rpre_val, 0); 2659 beq(CCR0, filtered); 2660 2661 if (Robj != noreg && UseCompressedOops) { 2662 decode_heap_oop_not_null(Rpre_val); 2663 } 2664 2665 // OK, it's not filtered, so we'll need to call enqueue. In the normal 2666 // case, pre_val will be a scratch G-reg, but there are some cases in 2667 // which it's an O-reg. In the first case, do a normal call. In the 2668 // latter, do a save here and call the frameless version. 2669 2670 // Can we store original value in the thread's buffer? 2671 // Is index == 0? 2672 // (The index field is typed as size_t.) 2673 const Register Rbuffer = Rtmp1, Rindex = Rtmp2; 2674 2675 ld(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread); 2676 cmpdi(CCR0, Rindex, 0); 2677 beq(CCR0, runtime); // If index == 0, goto runtime. 2678 ld(Rbuffer, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_buf()), R16_thread); 2679 2680 addi(Rindex, Rindex, -wordSize); // Decrement index. 2681 std(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread); 2682 2683 // Record the previous value. 2684 stdx(Rpre_val, Rbuffer, Rindex); 2685 b(filtered); 2686 2687 bind(runtime); 2688 2689 // VM call need frame to access(write) O register. 2690 if (needs_frame) { 2691 save_LR_CR(Rtmp1); 2692 push_frame_reg_args(0, Rtmp2); 2693 } 2694 2695 if (Rpre_val->is_volatile() && Robj == noreg) mr(R31, Rpre_val); // Save pre_val across C call if it was preloaded. 
2696 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), Rpre_val, R16_thread); 2697 if (Rpre_val->is_volatile() && Robj == noreg) mr(Rpre_val, R31); // restore 2698 2699 if (needs_frame) { 2700 pop_frame(); 2701 restore_LR_CR(Rtmp1); 2702 } 2703 2704 bind(filtered); 2705 } 2706 2707 // General G1 post-barrier generator 2708 // Store cross-region card. 2709 void MacroAssembler::g1_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp1, Register Rtmp2, Register Rtmp3, Label *filtered_ext) { 2710 Label runtime, filtered_int; 2711 Label& filtered = (filtered_ext != NULL) ? *filtered_ext : filtered_int; 2712 assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2); 2713 2714 G1SATBCardTableLoggingModRefBS* bs = 2715 barrier_set_cast<G1SATBCardTableLoggingModRefBS>(Universe::heap()->barrier_set()); 2716 2717 // Does store cross heap regions? 2718 if (G1RSBarrierRegionFilter) { 2719 xorr(Rtmp1, Rstore_addr, Rnew_val); 2720 srdi_(Rtmp1, Rtmp1, HeapRegion::LogOfHRGrainBytes); 2721 beq(CCR0, filtered); 2722 } 2723 2724 // Crosses regions, storing NULL? 2725 #ifdef ASSERT 2726 cmpdi(CCR0, Rnew_val, 0); 2727 asm_assert_ne("null oop not allowed (G1)", 0x322); // Checked by caller on PPC64, so following branch is obsolete: 2728 //beq(CCR0, filtered); 2729 #endif 2730 2731 // Storing region crossing non-NULL, is card already dirty? 2732 assert(sizeof(*bs->byte_map_base) == sizeof(jbyte), "adjust this code"); 2733 const Register Rcard_addr = Rtmp1; 2734 Register Rbase = Rtmp2; 2735 load_const_optimized(Rbase, (address)bs->byte_map_base, /*temp*/ Rtmp3); 2736 2737 srdi(Rcard_addr, Rstore_addr, CardTableModRefBS::card_shift); 2738 2739 // Get the address of the card. 2740 lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr); 2741 cmpwi(CCR0, Rtmp3, (int)G1SATBCardTableModRefBS::g1_young_card_val()); 2742 beq(CCR0, filtered); 2743 2744 membar(Assembler::StoreLoad); 2745 lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr); // Reload after membar. 
2746 cmpwi(CCR0, Rtmp3 /* card value */, CardTableModRefBS::dirty_card_val()); 2747 beq(CCR0, filtered); 2748 2749 // Storing a region crossing, non-NULL oop, card is clean. 2750 // Dirty card and log. 2751 li(Rtmp3, CardTableModRefBS::dirty_card_val()); 2752 //release(); // G1: oops are allowed to get visible after dirty marking. 2753 stbx(Rtmp3, Rbase, Rcard_addr); 2754 2755 add(Rcard_addr, Rbase, Rcard_addr); // This is the address which needs to get enqueued. 2756 Rbase = noreg; // end of lifetime 2757 2758 const Register Rqueue_index = Rtmp2, 2759 Rqueue_buf = Rtmp3; 2760 ld(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread); 2761 cmpdi(CCR0, Rqueue_index, 0); 2762 beq(CCR0, runtime); // index == 0 then jump to runtime 2763 ld(Rqueue_buf, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_buf()), R16_thread); 2764 2765 addi(Rqueue_index, Rqueue_index, -wordSize); // decrement index 2766 std(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread); 2767 2768 stdx(Rcard_addr, Rqueue_buf, Rqueue_index); // store card 2769 b(filtered); 2770 2771 bind(runtime); 2772 2773 // Save the live input values. 2774 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), Rcard_addr, R16_thread); 2775 2776 bind(filtered_int); 2777 } 2778 #endif // INCLUDE_ALL_GCS 2779 2780 // Values for last_Java_pc, and last_Java_sp must comply to the rules 2781 // in frame_ppc.hpp. 2782 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) { 2783 // Always set last_Java_pc and flags first because once last_Java_sp 2784 // is visible has_last_Java_frame is true and users will look at the 2785 // rest of the fields. (Note: flags should always be zero before we 2786 // get here so doesn't need to be set.) 
2787 2788 // Verify that last_Java_pc was zeroed on return to Java 2789 asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread, 2790 "last_Java_pc not zeroed before leaving Java", 0x200); 2791 2792 // When returning from calling out from Java mode the frame anchor's 2793 // last_Java_pc will always be set to NULL. It is set here so that 2794 // if we are doing a call to native (not VM) that we capture the 2795 // known pc and don't have to rely on the native call having a 2796 // standard frame linkage where we can find the pc. 2797 if (last_Java_pc != noreg) 2798 std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 2799 2800 // Set last_Java_sp last. 2801 std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 2802 } 2803 2804 void MacroAssembler::reset_last_Java_frame(void) { 2805 asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()), 2806 R16_thread, "SP was not set, still zero", 0x202); 2807 2808 BLOCK_COMMENT("reset_last_Java_frame {"); 2809 li(R0, 0); 2810 2811 // _last_Java_sp = 0 2812 std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 2813 2814 // _last_Java_pc = 0 2815 std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 2816 BLOCK_COMMENT("} reset_last_Java_frame"); 2817 } 2818 2819 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) { 2820 assert_different_registers(sp, tmp1); 2821 2822 // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via 2823 // TOP_IJAVA_FRAME_ABI. 2824 // FIXME: assert that we really have a TOP_IJAVA_FRAME here! 
2825 address entry = pc(); 2826 load_const_optimized(tmp1, entry); 2827 2828 set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1); 2829 } 2830 2831 void MacroAssembler::get_vm_result(Register oop_result) { 2832 // Read: 2833 // R16_thread 2834 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 2835 // 2836 // Updated: 2837 // oop_result 2838 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 2839 2840 verify_thread(); 2841 2842 ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread); 2843 li(R0, 0); 2844 std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread); 2845 2846 verify_oop(oop_result); 2847 } 2848 2849 void MacroAssembler::get_vm_result_2(Register metadata_result) { 2850 // Read: 2851 // R16_thread 2852 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 2853 // 2854 // Updated: 2855 // metadata_result 2856 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 2857 2858 ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 2859 li(R0, 0); 2860 std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 2861 } 2862 2863 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) { 2864 Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided. 2865 if (Universe::narrow_klass_base() != 0) { 2866 // Use dst as temp if it is free. 
2867 sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0); 2868 current = dst; 2869 } 2870 if (Universe::narrow_klass_shift() != 0) { 2871 srdi(dst, current, Universe::narrow_klass_shift()); 2872 current = dst; 2873 } 2874 return current; 2875 } 2876 2877 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) { 2878 if (UseCompressedClassPointers) { 2879 Register compressedKlass = encode_klass_not_null(ck, klass); 2880 stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop); 2881 } else { 2882 std(klass, oopDesc::klass_offset_in_bytes(), dst_oop); 2883 } 2884 } 2885 2886 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) { 2887 if (UseCompressedClassPointers) { 2888 if (val == noreg) { 2889 val = R0; 2890 li(val, 0); 2891 } 2892 stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed 2893 } 2894 } 2895 2896 int MacroAssembler::instr_size_for_decode_klass_not_null() { 2897 if (!UseCompressedClassPointers) return 0; 2898 int num_instrs = 1; // shift or move 2899 if (Universe::narrow_klass_base() != 0) num_instrs = 7; // shift + load const + add 2900 return num_instrs * BytesPerInstWord; 2901 } 2902 2903 void MacroAssembler::decode_klass_not_null(Register dst, Register src) { 2904 assert(dst != R0, "Dst reg may not be R0, as R0 is used here."); 2905 if (src == noreg) src = dst; 2906 Register shifted_src = src; 2907 if (Universe::narrow_klass_shift() != 0 || 2908 Universe::narrow_klass_base() == 0 && src != dst) { // Move required. 2909 shifted_src = dst; 2910 sldi(shifted_src, src, Universe::narrow_klass_shift()); 2911 } 2912 if (Universe::narrow_klass_base() != 0) { 2913 add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0); 2914 } 2915 } 2916 2917 void MacroAssembler::load_klass(Register dst, Register src) { 2918 if (UseCompressedClassPointers) { 2919 lwz(dst, oopDesc::klass_offset_in_bytes(), src); 2920 // Attention: no null check here! 
2921 decode_klass_not_null(dst, dst); 2922 } else { 2923 ld(dst, oopDesc::klass_offset_in_bytes(), src); 2924 } 2925 } 2926 2927 void MacroAssembler::load_klass_with_trap_null_check(Register dst, Register src) { 2928 if (!os::zero_page_read_protected()) { 2929 if (TrapBasedNullChecks) { 2930 trap_null_check(src); 2931 } 2932 } 2933 load_klass(dst, src); 2934 } 2935 2936 void MacroAssembler::reinit_heapbase(Register d, Register tmp) { 2937 if (Universe::heap() != NULL) { 2938 load_const_optimized(R30, Universe::narrow_ptrs_base(), tmp); 2939 } else { 2940 // Heap not yet allocated. Load indirectly. 2941 int simm16_offset = load_const_optimized(R30, Universe::narrow_ptrs_base_addr(), tmp, true); 2942 ld(R30, simm16_offset, R30); 2943 } 2944 } 2945 2946 // Clear Array 2947 // Kills both input registers. tmp == R0 is allowed. 2948 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp) { 2949 // Procedure for large arrays (uses data cache block zero instruction). 2950 Label startloop, fast, fastloop, small_rest, restloop, done; 2951 const int cl_size = VM_Version::get_cache_line_size(), 2952 cl_dwords = cl_size>>3, 2953 cl_dw_addr_bits = exact_log2(cl_dwords), 2954 dcbz_min = 1; // Min count of dcbz executions, needs to be >0. 2955 2956 //2: 2957 cmpdi(CCR1, cnt_dwords, ((dcbz_min+1)<<cl_dw_addr_bits)-1); // Big enough? (ensure >=dcbz_min lines included). 2958 blt(CCR1, small_rest); // Too small. 2959 rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line. 2960 beq(CCR0, fast); // Already 128byte aligned. 2961 2962 subfic(tmp, tmp, cl_dwords); 2963 mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords). 2964 subf(cnt_dwords, tmp, cnt_dwords); // rest. 2965 li(tmp, 0); 2966 //10: 2967 bind(startloop); // Clear at the beginning to reach 128byte boundary. 2968 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 
2969 addi(base_ptr, base_ptr, 8); 2970 bdnz(startloop); 2971 //13: 2972 bind(fast); // Clear 128byte blocks. 2973 srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0). 2974 andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords. 2975 mtctr(tmp); // Load counter. 2976 //16: 2977 bind(fastloop); 2978 dcbz(base_ptr); // Clear 128byte aligned block. 2979 addi(base_ptr, base_ptr, cl_size); 2980 bdnz(fastloop); 2981 if (InsertEndGroupPPC64) { endgroup(); } else { nop(); } 2982 //20: 2983 bind(small_rest); 2984 cmpdi(CCR0, cnt_dwords, 0); // size 0? 2985 beq(CCR0, done); // rest == 0 2986 li(tmp, 0); 2987 mtctr(cnt_dwords); // Load counter. 2988 //24: 2989 bind(restloop); // Clear rest. 2990 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 2991 addi(base_ptr, base_ptr, 8); 2992 bdnz(restloop); 2993 //27: 2994 bind(done); 2995 } 2996 2997 /////////////////////////////////////////// String intrinsics //////////////////////////////////////////// 2998 2999 // Search for a single jchar in an jchar[]. 3000 // 3001 // Assumes that result differs from all other registers. 3002 // 3003 // Haystack, needle are the addresses of jchar-arrays. 3004 // NeedleChar is needle[0] if it is known at compile time. 3005 // Haycnt is the length of the haystack. We assume haycnt >=1. 3006 // 3007 // Preserves haystack, haycnt, kills all other registers. 3008 // 3009 // If needle == R0, we search for the constant needleChar. 3010 void MacroAssembler::string_indexof_1(Register result, Register haystack, Register haycnt, 3011 Register needle, jchar needleChar, 3012 Register tmp1, Register tmp2) { 3013 3014 assert_different_registers(result, haystack, haycnt, needle, tmp1, tmp2); 3015 3016 Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_Found3, L_NotFound, L_End; 3017 Register needle0 = needle, // Contains needle[0]. 
3018 addr = tmp1, 3019 ch1 = tmp2, 3020 ch2 = R0; 3021 3022 //2 (variable) or 3 (const): 3023 if (needle != R0) lhz(needle0, 0, needle); // Preload needle character, needle has len==1. 3024 dcbtct(haystack, 0x00); // Indicate R/O access to haystack. 3025 3026 srwi_(tmp2, haycnt, 1); // Shift right by exact_log2(UNROLL_FACTOR). 3027 mr(addr, haystack); 3028 beq(CCR0, L_FinalCheck); 3029 mtctr(tmp2); // Move to count register. 3030 //8: 3031 bind(L_InnerLoop); // Main work horse (2x unrolled search loop). 3032 lhz(ch1, 0, addr); // Load characters from haystack. 3033 lhz(ch2, 2, addr); 3034 (needle != R0) ? cmpw(CCR0, ch1, needle0) : cmplwi(CCR0, ch1, needleChar); 3035 (needle != R0) ? cmpw(CCR1, ch2, needle0) : cmplwi(CCR1, ch2, needleChar); 3036 beq(CCR0, L_Found1); // Did we find the needle? 3037 beq(CCR1, L_Found2); 3038 addi(addr, addr, 4); 3039 bdnz(L_InnerLoop); 3040 //16: 3041 bind(L_FinalCheck); 3042 andi_(R0, haycnt, 1); 3043 beq(CCR0, L_NotFound); 3044 lhz(ch1, 0, addr); // One position left at which we have to compare. 3045 (needle != R0) ? cmpw(CCR1, ch1, needle0) : cmplwi(CCR1, ch1, needleChar); 3046 beq(CCR1, L_Found3); 3047 //21: 3048 bind(L_NotFound); 3049 li(result, -1); // Not found. 3050 b(L_End); 3051 3052 bind(L_Found2); 3053 addi(addr, addr, 2); 3054 //24: 3055 bind(L_Found1); 3056 bind(L_Found3); // Return index ... 3057 subf(addr, haystack, addr); // relative to haystack, 3058 srdi(result, addr, 1); // in characters. 3059 bind(L_End); 3060 } 3061 3062 3063 // Implementation of IndexOf for jchar arrays. 3064 // 3065 // The length of haystack and needle are not constant, i.e. passed in a register. 3066 // 3067 // Preserves registers haystack, needle. 3068 // Kills registers haycnt, needlecnt. 3069 // Assumes that result differs from all other registers. 3070 // Haystack, needle are the addresses of jchar-arrays. 3071 // Haycnt, needlecnt are the lengths of them, respectively. 
3072 // 3073 // Needlecntval must be zero or 15-bit unsigned immediate and > 1. 3074 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt, 3075 Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval, 3076 Register tmp1, Register tmp2, Register tmp3, Register tmp4) { 3077 3078 // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite! 3079 Label L_TooShort, L_Found, L_NotFound, L_End; 3080 Register last_addr = haycnt, // Kill haycnt at the beginning. 3081 addr = tmp1, 3082 n_start = tmp2, 3083 ch1 = tmp3, 3084 ch2 = R0; 3085 3086 // ************************************************************************************************** 3087 // Prepare for main loop: optimized for needle count >=2, bail out otherwise. 3088 // ************************************************************************************************** 3089 3090 //1 (variable) or 3 (const): 3091 dcbtct(needle, 0x00); // Indicate R/O access to str1. 3092 dcbtct(haystack, 0x00); // Indicate R/O access to str2. 3093 3094 // Compute last haystack addr to use if no match gets found. 3095 if (needlecntval == 0) { // variable needlecnt 3096 //3: 3097 subf(ch1, needlecnt, haycnt); // Last character index to compare is haycnt-needlecnt. 3098 addi(addr, haystack, -2); // Accesses use pre-increment. 3099 cmpwi(CCR6, needlecnt, 2); 3100 blt(CCR6, L_TooShort); // Variable needlecnt: handle short needle separately. 3101 slwi(ch1, ch1, 1); // Scale to number of bytes. 3102 lwz(n_start, 0, needle); // Load first 2 characters of needle. 3103 add(last_addr, haystack, ch1); // Point to last address to compare (haystack+2*(haycnt-needlecnt)). 3104 addi(needlecnt, needlecnt, -2); // Rest of needle. 
3105 } else { // constant needlecnt 3106 guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately"); 3107 assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate"); 3108 //5: 3109 addi(ch1, haycnt, -needlecntval); // Last character index to compare is haycnt-needlecnt. 3110 lwz(n_start, 0, needle); // Load first 2 characters of needle. 3111 addi(addr, haystack, -2); // Accesses use pre-increment. 3112 slwi(ch1, ch1, 1); // Scale to number of bytes. 3113 add(last_addr, haystack, ch1); // Point to last address to compare (haystack+2*(haycnt-needlecnt)). 3114 li(needlecnt, needlecntval-2); // Rest of needle. 3115 } 3116 3117 // Main Loop (now we have at least 3 characters). 3118 //11: 3119 Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2, L_Comp3; 3120 bind(L_OuterLoop); // Search for 1st 2 characters. 3121 Register addr_diff = tmp4; 3122 subf(addr_diff, addr, last_addr); // Difference between already checked address and last address to check. 3123 addi(addr, addr, 2); // This is the new address we want to use for comparing. 3124 srdi_(ch2, addr_diff, 2); 3125 beq(CCR0, L_FinalCheck); // 2 characters left? 3126 mtctr(ch2); // addr_diff/4 3127 //16: 3128 bind(L_InnerLoop); // Main work horse (2x unrolled search loop) 3129 lwz(ch1, 0, addr); // Load 2 characters of haystack (ignore alignment). 3130 lwz(ch2, 2, addr); 3131 cmpw(CCR0, ch1, n_start); // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop). 3132 cmpw(CCR1, ch2, n_start); 3133 beq(CCR0, L_Comp1); // Did we find the needle start? 3134 beq(CCR1, L_Comp2); 3135 addi(addr, addr, 4); 3136 bdnz(L_InnerLoop); 3137 //24: 3138 bind(L_FinalCheck); 3139 rldicl_(addr_diff, addr_diff, 64-1, 63); // Remaining characters not covered by InnerLoop: (addr_diff>>1)&1. 3140 beq(CCR0, L_NotFound); 3141 lwz(ch1, 0, addr); // One position left at which we have to compare. 
3142 cmpw(CCR1, ch1, n_start); 3143 beq(CCR1, L_Comp3); 3144 //29: 3145 bind(L_NotFound); 3146 li(result, -1); // not found 3147 b(L_End); 3148 3149 3150 // ************************************************************************************************** 3151 // Special Case: unfortunately, the variable needle case can be called with needlecnt<2 3152 // ************************************************************************************************** 3153 //31: 3154 if ((needlecntval>>1) !=1 ) { // Const needlecnt is 2 or 3? Reduce code size. 3155 int nopcnt = 5; 3156 if (needlecntval !=0 ) ++nopcnt; // Balance alignment (other case: see below). 3157 if (needlecntval == 0) { // We have to handle these cases separately. 3158 Label L_OneCharLoop; 3159 bind(L_TooShort); 3160 mtctr(haycnt); 3161 lhz(n_start, 0, needle); // First character of needle 3162 bind(L_OneCharLoop); 3163 lhzu(ch1, 2, addr); 3164 cmpw(CCR1, ch1, n_start); 3165 beq(CCR1, L_Found); // Did we find the one character needle? 3166 bdnz(L_OneCharLoop); 3167 li(result, -1); // Not found. 3168 b(L_End); 3169 } // 8 instructions, so no impact on alignment. 3170 for (int x = 0; x < nopcnt; ++x) nop(); 3171 } 3172 3173 // ************************************************************************************************** 3174 // Regular Case Part II: compare rest of needle (first 2 characters have been compared already) 3175 // ************************************************************************************************** 3176 3177 // Compare the rest 3178 //36 if needlecntval==0, else 37: 3179 bind(L_Comp2); 3180 addi(addr, addr, 2); // First comparison has failed, 2nd one hit. 3181 bind(L_Comp1); // Addr points to possible needle start. 3182 bind(L_Comp3); // Could have created a copy and use a different return address but saving code size here. 3183 if (needlecntval != 2) { // Const needlecnt==2? 3184 if (needlecntval != 3) { 3185 if (needlecntval == 0) beq(CCR6, L_Found); // Variable needlecnt==2? 
3186 Register ind_reg = tmp4; 3187 li(ind_reg, 2*2); // First 2 characters are already compared, use index 2. 3188 mtctr(needlecnt); // Decremented by 2, still > 0. 3189 //40: 3190 Label L_CompLoop; 3191 bind(L_CompLoop); 3192 lhzx(ch2, needle, ind_reg); 3193 lhzx(ch1, addr, ind_reg); 3194 cmpw(CCR1, ch1, ch2); 3195 bne(CCR1, L_OuterLoop); 3196 addi(ind_reg, ind_reg, 2); 3197 bdnz(L_CompLoop); 3198 } else { // No loop required if there's only one needle character left. 3199 lhz(ch2, 2*2, needle); 3200 lhz(ch1, 2*2, addr); 3201 cmpw(CCR1, ch1, ch2); 3202 bne(CCR1, L_OuterLoop); 3203 } 3204 } 3205 // Return index ... 3206 //46: 3207 bind(L_Found); 3208 subf(addr, haystack, addr); // relative to haystack, ... 3209 srdi(result, addr, 1); // in characters. 3210 //48: 3211 bind(L_End); 3212 } 3213 3214 // Implementation of Compare for jchar arrays. 3215 // 3216 // Kills the registers str1, str2, cnt1, cnt2. 3217 // Kills cr0, ctr. 3218 // Assumes that result differes from the input registers. 3219 void MacroAssembler::string_compare(Register str1_reg, Register str2_reg, Register cnt1_reg, Register cnt2_reg, 3220 Register result_reg, Register tmp_reg) { 3221 assert_different_registers(result_reg, str1_reg, str2_reg, cnt1_reg, cnt2_reg, tmp_reg); 3222 3223 Label Ldone, Lslow_case, Lslow_loop, Lfast_loop; 3224 Register cnt_diff = R0, 3225 limit_reg = cnt1_reg, 3226 chr1_reg = result_reg, 3227 chr2_reg = cnt2_reg, 3228 addr_diff = str2_reg; 3229 3230 // Offset 0 should be 32 byte aligned. 3231 //-4: 3232 dcbtct(str1_reg, 0x00); // Indicate R/O access to str1. 3233 dcbtct(str2_reg, 0x00); // Indicate R/O access to str2. 3234 //-2: 3235 // Compute min(cnt1, cnt2) and check if 0 (bail out if we don't need to compare characters). 3236 subf(result_reg, cnt2_reg, cnt1_reg); // difference between cnt1/2 3237 subf_(addr_diff, str1_reg, str2_reg); // alias? 
3238 beq(CCR0, Ldone); // return cnt difference if both ones are identical 3239 srawi(limit_reg, result_reg, 31); // generate signmask (cnt1/2 must be non-negative so cnt_diff can't overflow) 3240 mr(cnt_diff, result_reg); 3241 andr(limit_reg, result_reg, limit_reg); // difference or zero (negative): cnt1<cnt2 ? cnt1-cnt2 : 0 3242 add_(limit_reg, cnt2_reg, limit_reg); // min(cnt1, cnt2)==0? 3243 beq(CCR0, Ldone); // return cnt difference if one has 0 length 3244 3245 lhz(chr1_reg, 0, str1_reg); // optional: early out if first characters mismatch 3246 lhzx(chr2_reg, str1_reg, addr_diff); // optional: early out if first characters mismatch 3247 addi(tmp_reg, limit_reg, -1); // min(cnt1, cnt2)-1 3248 subf_(result_reg, chr2_reg, chr1_reg); // optional: early out if first characters mismatch 3249 bne(CCR0, Ldone); // optional: early out if first characters mismatch 3250 3251 // Set loop counter by scaling down tmp_reg 3252 srawi_(chr2_reg, tmp_reg, exact_log2(4)); // (min(cnt1, cnt2)-1)/4 3253 ble(CCR0, Lslow_case); // need >4 characters for fast loop 3254 andi(limit_reg, tmp_reg, 4-1); // remaining characters 3255 3256 // Adapt str1_reg str2_reg for the first loop iteration 3257 mtctr(chr2_reg); // (min(cnt1, cnt2)-1)/4 3258 addi(limit_reg, limit_reg, 4+1); // compare last 5-8 characters in slow_case if mismatch found in fast_loop 3259 //16: 3260 // Compare the rest of the characters 3261 bind(Lfast_loop); 3262 ld(chr1_reg, 0, str1_reg); 3263 ldx(chr2_reg, str1_reg, addr_diff); 3264 cmpd(CCR0, chr2_reg, chr1_reg); 3265 bne(CCR0, Lslow_case); // return chr1_reg 3266 addi(str1_reg, str1_reg, 4*2); 3267 bdnz(Lfast_loop); 3268 addi(limit_reg, limit_reg, -4); // no mismatch found in fast_loop, only 1-4 characters missing 3269 //23: 3270 bind(Lslow_case); 3271 mtctr(limit_reg); 3272 //24: 3273 bind(Lslow_loop); 3274 lhz(chr1_reg, 0, str1_reg); 3275 lhzx(chr2_reg, str1_reg, addr_diff); 3276 subf_(result_reg, chr2_reg, chr1_reg); 3277 bne(CCR0, Ldone); // return chr1_reg 3278 
addi(str1_reg, str1_reg, 1*2); 3279 bdnz(Lslow_loop); 3280 //30: 3281 // If strings are equal up to min length, return the length difference. 3282 mr(result_reg, cnt_diff); 3283 nop(); // alignment 3284 //32: 3285 // Otherwise, return the difference between the first mismatched chars. 3286 bind(Ldone); 3287 } 3288 3289 3290 // Compare char[] arrays. 3291 // 3292 // str1_reg USE only 3293 // str2_reg USE only 3294 // cnt_reg USE_DEF, due to tmp reg shortage 3295 // result_reg DEF only, might compromise USE only registers 3296 void MacroAssembler::char_arrays_equals(Register str1_reg, Register str2_reg, Register cnt_reg, Register result_reg, 3297 Register tmp1_reg, Register tmp2_reg, Register tmp3_reg, Register tmp4_reg, 3298 Register tmp5_reg) { 3299 3300 // Str1 may be the same register as str2 which can occur e.g. after scalar replacement. 3301 assert_different_registers(result_reg, str1_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg); 3302 assert_different_registers(result_reg, str2_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg); 3303 3304 // Offset 0 should be 32 byte aligned. 3305 Label Linit_cbc, Lcbc, Lloop, Ldone_true, Ldone_false; 3306 Register index_reg = tmp5_reg; 3307 Register cbc_iter = tmp4_reg; 3308 3309 //-1: 3310 dcbtct(str1_reg, 0x00); // Indicate R/O access to str1. 3311 dcbtct(str2_reg, 0x00); // Indicate R/O access to str2. 3312 //1: 3313 andi(cbc_iter, cnt_reg, 4-1); // Remaining iterations after 4 java characters per iteration loop. 3314 li(index_reg, 0); // init 3315 li(result_reg, 0); // assume false 3316 srwi_(tmp2_reg, cnt_reg, exact_log2(4)); // Div: 4 java characters per iteration (main loop). 
3317 3318 cmpwi(CCR1, cbc_iter, 0); // CCR1 = (cbc_iter==0) 3319 beq(CCR0, Linit_cbc); // too short 3320 mtctr(tmp2_reg); 3321 //8: 3322 bind(Lloop); 3323 ldx(tmp1_reg, str1_reg, index_reg); 3324 ldx(tmp2_reg, str2_reg, index_reg); 3325 cmpd(CCR0, tmp1_reg, tmp2_reg); 3326 bne(CCR0, Ldone_false); // Unequal char pair found -> done. 3327 addi(index_reg, index_reg, 4*sizeof(jchar)); 3328 bdnz(Lloop); 3329 //14: 3330 bind(Linit_cbc); 3331 beq(CCR1, Ldone_true); 3332 mtctr(cbc_iter); 3333 //16: 3334 bind(Lcbc); 3335 lhzx(tmp1_reg, str1_reg, index_reg); 3336 lhzx(tmp2_reg, str2_reg, index_reg); 3337 cmpw(CCR0, tmp1_reg, tmp2_reg); 3338 bne(CCR0, Ldone_false); // Unequal char pair found -> done. 3339 addi(index_reg, index_reg, 1*sizeof(jchar)); 3340 bdnz(Lcbc); 3341 nop(); 3342 bind(Ldone_true); 3343 li(result_reg, 1); 3344 //24: 3345 bind(Ldone_false); 3346 } 3347 3348 3349 void MacroAssembler::char_arrays_equalsImm(Register str1_reg, Register str2_reg, int cntval, Register result_reg, 3350 Register tmp1_reg, Register tmp2_reg) { 3351 // Str1 may be the same register as str2 which can occur e.g. after scalar replacement. 
3352 assert_different_registers(result_reg, str1_reg, tmp1_reg, tmp2_reg); 3353 assert_different_registers(result_reg, str2_reg, tmp1_reg, tmp2_reg); 3354 assert(sizeof(jchar) == 2, "must be"); 3355 assert(cntval >= 0 && ((cntval & 0x7fff) == cntval), "wrong immediate"); 3356 3357 Label Ldone_false; 3358 3359 if (cntval < 16) { // short case 3360 if (cntval != 0) li(result_reg, 0); // assume false 3361 3362 const int num_bytes = cntval*sizeof(jchar); 3363 int index = 0; 3364 for (int next_index; (next_index = index + 8) <= num_bytes; index = next_index) { 3365 ld(tmp1_reg, index, str1_reg); 3366 ld(tmp2_reg, index, str2_reg); 3367 cmpd(CCR0, tmp1_reg, tmp2_reg); 3368 bne(CCR0, Ldone_false); 3369 } 3370 if (cntval & 2) { 3371 lwz(tmp1_reg, index, str1_reg); 3372 lwz(tmp2_reg, index, str2_reg); 3373 cmpw(CCR0, tmp1_reg, tmp2_reg); 3374 bne(CCR0, Ldone_false); 3375 index += 4; 3376 } 3377 if (cntval & 1) { 3378 lhz(tmp1_reg, index, str1_reg); 3379 lhz(tmp2_reg, index, str2_reg); 3380 cmpw(CCR0, tmp1_reg, tmp2_reg); 3381 bne(CCR0, Ldone_false); 3382 } 3383 // fallthrough: true 3384 } else { 3385 Label Lloop; 3386 Register index_reg = tmp1_reg; 3387 const int loopcnt = cntval/4; 3388 assert(loopcnt > 0, "must be"); 3389 // Offset 0 should be 32 byte aligned. 3390 //2: 3391 dcbtct(str1_reg, 0x00); // Indicate R/O access to str1. 3392 dcbtct(str2_reg, 0x00); // Indicate R/O access to str2. 3393 li(tmp2_reg, loopcnt); 3394 li(index_reg, 0); // init 3395 li(result_reg, 0); // assume false 3396 mtctr(tmp2_reg); 3397 //8: 3398 bind(Lloop); 3399 ldx(R0, str1_reg, index_reg); 3400 ldx(tmp2_reg, str2_reg, index_reg); 3401 cmpd(CCR0, R0, tmp2_reg); 3402 bne(CCR0, Ldone_false); // Unequal char pair found -> done. 
3403 addi(index_reg, index_reg, 4*sizeof(jchar)); 3404 bdnz(Lloop); 3405 //14: 3406 if (cntval & 2) { 3407 lwzx(R0, str1_reg, index_reg); 3408 lwzx(tmp2_reg, str2_reg, index_reg); 3409 cmpw(CCR0, R0, tmp2_reg); 3410 bne(CCR0, Ldone_false); 3411 if (cntval & 1) addi(index_reg, index_reg, 2*sizeof(jchar)); 3412 } 3413 if (cntval & 1) { 3414 lhzx(R0, str1_reg, index_reg); 3415 lhzx(tmp2_reg, str2_reg, index_reg); 3416 cmpw(CCR0, R0, tmp2_reg); 3417 bne(CCR0, Ldone_false); 3418 } 3419 // fallthru: true 3420 } 3421 li(result_reg, 1); 3422 bind(Ldone_false); 3423 } 3424 3425 // Helpers for Intrinsic Emitters 3426 // 3427 // Revert the byte order of a 32bit value in a register 3428 // src: 0x44556677 3429 // dst: 0x77665544 3430 // Three steps to obtain the result: 3431 // 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word 3432 // into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared. 3433 // This value initializes dst. 3434 // 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost 3435 // byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go. 3436 // This value is mask inserted into dst with a [0..23] mask of 1s. 3437 // 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position. 3438 // This value is mask inserted into dst with a [8..15] mask of 1s. 3439 void MacroAssembler::load_reverse_32(Register dst, Register src) { 3440 assert_different_registers(dst, src); 3441 3442 rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left. 3443 rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone. 3444 rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone. 3445 } 3446 3447 // Calculate the column addresses of the crc32 lookup table into distinct registers. 
3448 // This loop-invariant calculation is moved out of the loop body, reducing the loop 3449 // body size from 20 to 16 instructions. 3450 // Returns the offset that was used to calculate the address of column tc3. 3451 // Due to register shortage, setting tc3 may overwrite table. With the return offset 3452 // at hand, the original table address can be easily reconstructed. 3453 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) { 3454 3455 #ifdef VM_LITTLE_ENDIAN 3456 // This is what we implement (the DOLIT4 part): 3457 // ========================================================================= */ 3458 // #define DOLIT4 c ^= *buf4++; \ 3459 // c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \ 3460 // crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24] 3461 // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4 3462 // ========================================================================= */ 3463 const int ix0 = 3*(4*CRC32_COLUMN_SIZE); 3464 const int ix1 = 2*(4*CRC32_COLUMN_SIZE); 3465 const int ix2 = 1*(4*CRC32_COLUMN_SIZE); 3466 const int ix3 = 0*(4*CRC32_COLUMN_SIZE); 3467 #else 3468 // This is what we implement (the DOBIG4 part): 3469 // ========================================================================= 3470 // #define DOBIG4 c ^= *++buf4; \ 3471 // c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \ 3472 // crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24] 3473 // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4 3474 // ========================================================================= 3475 const int ix0 = 4*(4*CRC32_COLUMN_SIZE); 3476 const int ix1 = 5*(4*CRC32_COLUMN_SIZE); 3477 const int ix2 = 6*(4*CRC32_COLUMN_SIZE); 3478 const int ix3 = 7*(4*CRC32_COLUMN_SIZE); 3479 #endif 3480 assert_different_registers(table, tc0, tc1, tc2); 3481 assert(table == tc3, "must be!"); 3482 3483 if (ix0 != 
0) addi(tc0, table, ix0); 3484 if (ix1 != 0) addi(tc1, table, ix1); 3485 if (ix2 != 0) addi(tc2, table, ix2); 3486 if (ix3 != 0) addi(tc3, table, ix3); 3487 3488 return ix3; 3489 } 3490 3491 /** 3492 * uint32_t crc; 3493 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8); 3494 */ 3495 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) { 3496 assert_different_registers(crc, table, tmp); 3497 assert_different_registers(val, table); 3498 3499 if (crc == val) { // Must rotate first to use the unmodified value. 3500 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 3501 // As we use a word (4-byte) instruction, we have to adapt the mask bit positions. 3502 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 3503 } else { 3504 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 3505 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 3506 } 3507 lwzx(tmp, table, tmp); 3508 xorr(crc, crc, tmp); 3509 } 3510 3511 /** 3512 * uint32_t crc; 3513 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8); 3514 */ 3515 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) { 3516 fold_byte_crc32(crc, crc, table, tmp); 3517 } 3518 3519 /** 3520 * Emits code to update CRC-32 with a byte value according to constants in table. 3521 * 3522 * @param [in,out]crc Register containing the crc. 3523 * @param [in]val Register containing the byte to fold into the CRC. 3524 * @param [in]table Register containing the table of crc constants. 
3525 * 3526 * uint32_t crc; 3527 * val = crc_table[(val ^ crc) & 0xFF]; 3528 * crc = val ^ (crc >> 8); 3529 */ 3530 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 3531 BLOCK_COMMENT("update_byte_crc32:"); 3532 xorr(val, val, crc); 3533 fold_byte_crc32(crc, val, table, val); 3534 } 3535 3536 /** 3537 * @param crc register containing existing CRC (32-bit) 3538 * @param buf register pointing to input byte buffer (byte*) 3539 * @param len register containing number of bytes 3540 * @param table register pointing to CRC table 3541 */ 3542 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table, 3543 Register data, bool loopAlignment, bool invertCRC) { 3544 assert_different_registers(crc, buf, len, table, data); 3545 3546 Label L_mainLoop, L_done; 3547 const int mainLoop_stepping = 1; 3548 const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4; 3549 3550 // Process all bytes in a single-byte loop. 3551 cmpdi(CCR0, len, 0); // Anything to do? 3552 mtctr(len); 3553 beq(CCR0, L_done); 3554 3555 if (invertCRC) { 3556 nand(crc, crc, crc); // ~c 3557 } 3558 3559 align(mainLoop_alignment); 3560 BIND(L_mainLoop); 3561 lbz(data, 0, buf); // Byte from buffer, zero-extended. 3562 addi(buf, buf, mainLoop_stepping); // Advance buffer position. 3563 update_byte_crc32(crc, data, table); 3564 bdnz(L_mainLoop); // Iterate. 3565 3566 if (invertCRC) { 3567 nand(crc, crc, crc); // ~c 3568 } 3569 3570 bind(L_done); 3571 } 3572 3573 /** 3574 * Emits code to update CRC-32 with a 4-byte value according to constants in table 3575 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c 3576 */ 3577 // A not on the lookup table address(es): 3578 // The lookup table consists of two sets of four columns each. 3579 // The columns {0..3} are used for little-endian machines. 3580 // The columns {4..7} are used for big-endian machines. 
3581 // To save the effort of adding the column offset to the table address each time 3582 // a table element is looked up, it is possible to pass the pre-calculated 3583 // column addresses. 3584 // Uses R9..R12 as work register. Must be saved/restored by caller, if necessary. 3585 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc, 3586 Register t0, Register t1, Register t2, Register t3, 3587 Register tc0, Register tc1, Register tc2, Register tc3) { 3588 assert_different_registers(crc, t3); 3589 3590 // XOR crc with next four bytes of buffer. 3591 lwz(t3, bufDisp, buf); 3592 if (bufInc != 0) { 3593 addi(buf, buf, bufInc); 3594 } 3595 xorr(t3, t3, crc); 3596 3597 // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices. 3598 rlwinm(t0, t3, 2, 24-2, 31-2); // ((t1 >> 0) & 0xff) << 2 3599 rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t1 >> 8) & 0xff) << 2 3600 rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t1 >> 16) & 0xff) << 2 3601 rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t1 >> 24) & 0xff) << 2 3602 3603 // Use the pre-calculated column addresses. 3604 // Load pre-calculated table values. 3605 lwzx(t0, tc0, t0); 3606 lwzx(t1, tc1, t1); 3607 lwzx(t2, tc2, t2); 3608 lwzx(t3, tc3, t3); 3609 3610 // Calculate new crc from table values. 3611 xorr(t0, t0, t1); 3612 xorr(t2, t2, t3); 3613 xorr(crc, t0, t2); // Now crc contains the final checksum value. 3614 } 3615 3616 /** 3617 * @param crc register containing existing CRC (32-bit) 3618 * @param buf register pointing to input byte buffer (byte*) 3619 * @param len register containing number of bytes 3620 * @param table register pointing to CRC table 3621 * 3622 * Uses R9..R12 as work register. Must be saved/restored by caller! 
3623 */ 3624 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table, 3625 Register t0, Register t1, Register t2, Register t3, 3626 Register tc0, Register tc1, Register tc2, Register tc3) { 3627 assert_different_registers(crc, buf, len, table); 3628 3629 Label L_mainLoop, L_tail; 3630 Register tmp = t0; 3631 Register data = t0; 3632 Register tmp2 = t1; 3633 const int mainLoop_stepping = 8; 3634 const int tailLoop_stepping = 1; 3635 const int log_stepping = exact_log2(mainLoop_stepping); 3636 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32; 3637 const int complexThreshold = 2*mainLoop_stepping; 3638 3639 // Don't test for len <= 0 here. This pathological case should not occur anyway. 3640 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles. 3641 // The situation itself is detected and handled correctly by the conditional branches 3642 // following aghi(len, -stepping) and aghi(len, +stepping). 3643 assert(tailLoop_stepping == 1, "check tailLoop_stepping!"); 3644 3645 BLOCK_COMMENT("kernel_crc32_2word {"); 3646 3647 nand(crc, crc, crc); // ~c 3648 3649 // Check for short (<mainLoop_stepping) buffer. 3650 cmpdi(CCR0, len, complexThreshold); 3651 blt(CCR0, L_tail); 3652 3653 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance. 3654 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions. 3655 { 3656 // Align buf addr to mainLoop_stepping boundary. 3657 neg(tmp2, buf); // Calculate # preLoop iterations for alignment. 3658 rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63. 3659 3660 if (complexThreshold > mainLoop_stepping) { 3661 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 3662 } else { 3663 sub(tmp, len, tmp2); // Remaining bytes for main loop. 
3664 cmpdi(CCR0, tmp, mainLoop_stepping); 3665 blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing 3666 mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 3667 } 3668 update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false); 3669 } 3670 3671 srdi(tmp2, len, log_stepping); // #iterations for mainLoop 3672 andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop 3673 mtctr(tmp2); 3674 3675 #ifdef VM_LITTLE_ENDIAN 3676 Register crc_rv = crc; 3677 #else 3678 Register crc_rv = tmp; // Load_reverse needs separate registers to work on. 3679 // Occupies tmp, but frees up crc. 3680 load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data. 3681 tmp = crc; 3682 #endif 3683 3684 int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3); 3685 3686 align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement. 3687 BIND(L_mainLoop); 3688 update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3); 3689 update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3); 3690 bdnz(L_mainLoop); 3691 3692 #ifndef VM_LITTLE_ENDIAN 3693 load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data. 3694 tmp = crc_rv; // Tmp uses it's original register again. 3695 #endif 3696 3697 // Restore original table address for tailLoop. 3698 if (reconstructTableOffset != 0) { 3699 addi(table, table, -reconstructTableOffset); 3700 } 3701 3702 // Process last few (<complexThreshold) bytes of buffer. 
3703 BIND(L_tail); 3704 update_byteLoop_crc32(crc, buf, len, table, data, false, false); 3705 3706 nand(crc, crc, crc); // ~c 3707 BLOCK_COMMENT("} kernel_crc32_2word"); 3708 } 3709 3710 /** 3711 * @param crc register containing existing CRC (32-bit) 3712 * @param buf register pointing to input byte buffer (byte*) 3713 * @param len register containing number of bytes 3714 * @param table register pointing to CRC table 3715 * 3716 * uses R9..R12 as work register. Must be saved/restored by caller! 3717 */ 3718 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table, 3719 Register t0, Register t1, Register t2, Register t3, 3720 Register tc0, Register tc1, Register tc2, Register tc3) { 3721 assert_different_registers(crc, buf, len, table); 3722 3723 Label L_mainLoop, L_tail; 3724 Register tmp = t0; 3725 Register data = t0; 3726 Register tmp2 = t1; 3727 const int mainLoop_stepping = 4; 3728 const int tailLoop_stepping = 1; 3729 const int log_stepping = exact_log2(mainLoop_stepping); 3730 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32; 3731 const int complexThreshold = 2*mainLoop_stepping; 3732 3733 // Don't test for len <= 0 here. This pathological case should not occur anyway. 3734 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles. 3735 // The situation itself is detected and handled correctly by the conditional branches 3736 // following aghi(len, -stepping) and aghi(len, +stepping). 3737 assert(tailLoop_stepping == 1, "check tailLoop_stepping!"); 3738 3739 BLOCK_COMMENT("kernel_crc32_1word {"); 3740 3741 nand(crc, crc, crc); // ~c 3742 3743 // Check for short (<mainLoop_stepping) buffer. 3744 cmpdi(CCR0, len, complexThreshold); 3745 blt(CCR0, L_tail); 3746 3747 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance. 3748 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions. 
3749 { 3750 // Align buf addr to mainLoop_stepping boundary. 3751 neg(tmp2, buf); // Calculate # preLoop iterations for alignment. 3752 rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63. 3753 3754 if (complexThreshold > mainLoop_stepping) { 3755 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 3756 } else { 3757 sub(tmp, len, tmp2); // Remaining bytes for main loop. 3758 cmpdi(CCR0, tmp, mainLoop_stepping); 3759 blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing 3760 mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 3761 } 3762 update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false); 3763 } 3764 3765 srdi(tmp2, len, log_stepping); // #iterations for mainLoop 3766 andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop 3767 mtctr(tmp2); 3768 3769 #ifdef VM_LITTLE_ENDIAN 3770 Register crc_rv = crc; 3771 #else 3772 Register crc_rv = tmp; // Load_reverse needs separate registers to work on. 3773 // Occupies tmp, but frees up crc. 3774 load_reverse_32(crc_rv, crc); // evert byte order because we are dealing with big-endian data. 3775 tmp = crc; 3776 #endif 3777 3778 int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3); 3779 3780 align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement. 3781 BIND(L_mainLoop); 3782 update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3); 3783 bdnz(L_mainLoop); 3784 3785 #ifndef VM_LITTLE_ENDIAN 3786 load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data. 3787 tmp = crc_rv; // Tmp uses it's original register again. 3788 #endif 3789 3790 // Restore original table address for tailLoop. 
3791 if (reconstructTableOffset != 0) { 3792 addi(table, table, -reconstructTableOffset); 3793 } 3794 3795 // Process last few (<complexThreshold) bytes of buffer. 3796 BIND(L_tail); 3797 update_byteLoop_crc32(crc, buf, len, table, data, false, false); 3798 3799 nand(crc, crc, crc); // ~c 3800 BLOCK_COMMENT("} kernel_crc32_1word"); 3801 } 3802 3803 /** 3804 * @param crc register containing existing CRC (32-bit) 3805 * @param buf register pointing to input byte buffer (byte*) 3806 * @param len register containing number of bytes 3807 * @param table register pointing to CRC table 3808 * 3809 * Uses R7_ARG5, R8_ARG6 as work registers. 3810 */ 3811 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table, 3812 Register t0, Register t1, Register t2, Register t3) { 3813 assert_different_registers(crc, buf, len, table); 3814 3815 Register data = t0; // Holds the current byte to be folded into crc. 3816 3817 BLOCK_COMMENT("kernel_crc32_1byte {"); 3818 3819 // Process all bytes in a single-byte loop. 3820 update_byteLoop_crc32(crc, buf, len, table, data, true, true); 3821 3822 BLOCK_COMMENT("} kernel_crc32_1byte"); 3823 } 3824 3825 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp) { 3826 assert_different_registers(crc, buf, /* len, not used!! */ table, tmp); 3827 3828 BLOCK_COMMENT("kernel_crc32_singleByte:"); 3829 nand(crc, crc, crc); // ~c 3830 3831 lbz(tmp, 0, buf); // Byte from buffer, zero-extended. 
3832 update_byte_crc32(crc, tmp, table); 3833 3834 nand(crc, crc, crc); // ~c 3835 } 3836 3837 // dest_lo += src1 + src2 3838 // dest_hi += carry1 + carry2 3839 void MacroAssembler::add2_with_carry(Register dest_hi, 3840 Register dest_lo, 3841 Register src1, Register src2) { 3842 li(R0, 0); 3843 addc(dest_lo, dest_lo, src1); 3844 adde(dest_hi, dest_hi, R0); 3845 addc(dest_lo, dest_lo, src2); 3846 adde(dest_hi, dest_hi, R0); 3847 } 3848 3849 // Multiply 64 bit by 64 bit first loop. 3850 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, 3851 Register x_xstart, 3852 Register y, Register y_idx, 3853 Register z, 3854 Register carry, 3855 Register product_high, Register product, 3856 Register idx, Register kdx, 3857 Register tmp) { 3858 // jlong carry, x[], y[], z[]; 3859 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) { 3860 // huge_128 product = y[idx] * x[xstart] + carry; 3861 // z[kdx] = (jlong)product; 3862 // carry = (jlong)(product >>> 64); 3863 // } 3864 // z[xstart] = carry; 3865 3866 Label L_first_loop, L_first_loop_exit; 3867 Label L_one_x, L_one_y, L_multiply; 3868 3869 addic_(xstart, xstart, -1); 3870 blt(CCR0, L_one_x); // Special case: length of x is 1. 3871 3872 // Load next two integers of x. 3873 sldi(tmp, xstart, LogBytesPerInt); 3874 ldx(x_xstart, x, tmp); 3875 #ifdef VM_LITTLE_ENDIAN 3876 rldicl(x_xstart, x_xstart, 32, 0); 3877 #endif 3878 3879 align(32, 16); 3880 bind(L_first_loop); 3881 3882 cmpdi(CCR0, idx, 1); 3883 blt(CCR0, L_first_loop_exit); 3884 addi(idx, idx, -2); 3885 beq(CCR0, L_one_y); 3886 3887 // Load next two integers of y. 3888 sldi(tmp, idx, LogBytesPerInt); 3889 ldx(y_idx, y, tmp); 3890 #ifdef VM_LITTLE_ENDIAN 3891 rldicl(y_idx, y_idx, 32, 0); 3892 #endif 3893 3894 3895 bind(L_multiply); 3896 multiply64(product_high, product, x_xstart, y_idx); 3897 3898 li(tmp, 0); 3899 addc(product, product, carry); // Add carry to result. 
3900 adde(product_high, product_high, tmp); // Add carry of the last addition. 3901 addi(kdx, kdx, -2); 3902 3903 // Store result. 3904 #ifdef VM_LITTLE_ENDIAN 3905 rldicl(product, product, 32, 0); 3906 #endif 3907 sldi(tmp, kdx, LogBytesPerInt); 3908 stdx(product, z, tmp); 3909 mr_if_needed(carry, product_high); 3910 b(L_first_loop); 3911 3912 3913 bind(L_one_y); // Load one 32 bit portion of y as (0,value). 3914 3915 lwz(y_idx, 0, y); 3916 b(L_multiply); 3917 3918 3919 bind(L_one_x); // Load one 32 bit portion of x as (0,value). 3920 3921 lwz(x_xstart, 0, x); 3922 b(L_first_loop); 3923 3924 bind(L_first_loop_exit); 3925 } 3926 3927 // Multiply 64 bit by 64 bit and add 128 bit. 3928 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, 3929 Register z, Register yz_idx, 3930 Register idx, Register carry, 3931 Register product_high, Register product, 3932 Register tmp, int offset) { 3933 3934 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry; 3935 // z[kdx] = (jlong)product; 3936 3937 sldi(tmp, idx, LogBytesPerInt); 3938 if (offset) { 3939 addi(tmp, tmp, offset); 3940 } 3941 ldx(yz_idx, y, tmp); 3942 #ifdef VM_LITTLE_ENDIAN 3943 rldicl(yz_idx, yz_idx, 32, 0); 3944 #endif 3945 3946 multiply64(product_high, product, x_xstart, yz_idx); 3947 ldx(yz_idx, z, tmp); 3948 #ifdef VM_LITTLE_ENDIAN 3949 rldicl(yz_idx, yz_idx, 32, 0); 3950 #endif 3951 3952 add2_with_carry(product_high, product, carry, yz_idx); 3953 3954 sldi(tmp, idx, LogBytesPerInt); 3955 if (offset) { 3956 addi(tmp, tmp, offset); 3957 } 3958 #ifdef VM_LITTLE_ENDIAN 3959 rldicl(product, product, 32, 0); 3960 #endif 3961 stdx(product, z, tmp); 3962 } 3963 3964 // Multiply 128 bit by 128 bit. Unrolled inner loop. 
3965 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, 3966 Register y, Register z, 3967 Register yz_idx, Register idx, Register carry, 3968 Register product_high, Register product, 3969 Register carry2, Register tmp) { 3970 3971 // jlong carry, x[], y[], z[]; 3972 // int kdx = ystart+1; 3973 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 3974 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry; 3975 // z[kdx+idx+1] = (jlong)product; 3976 // jlong carry2 = (jlong)(product >>> 64); 3977 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2; 3978 // z[kdx+idx] = (jlong)product; 3979 // carry = (jlong)(product >>> 64); 3980 // } 3981 // idx += 2; 3982 // if (idx > 0) { 3983 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry; 3984 // z[kdx+idx] = (jlong)product; 3985 // carry = (jlong)(product >>> 64); 3986 // } 3987 3988 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 3989 const Register jdx = R0; 3990 3991 // Scale the index. 3992 srdi_(jdx, idx, 2); 3993 beq(CCR0, L_third_loop_exit); 3994 mtctr(jdx); 3995 3996 align(32, 16); 3997 bind(L_third_loop); 3998 3999 addi(idx, idx, -4); 4000 4001 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8); 4002 mr_if_needed(carry2, product_high); 4003 4004 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0); 4005 mr_if_needed(carry, product_high); 4006 bdnz(L_third_loop); 4007 4008 bind(L_third_loop_exit); // Handle any left-over operand parts. 
4009 4010 andi_(idx, idx, 0x3); 4011 beq(CCR0, L_post_third_loop_done); 4012 4013 Label L_check_1; 4014 4015 addic_(idx, idx, -2); 4016 blt(CCR0, L_check_1); 4017 4018 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0); 4019 mr_if_needed(carry, product_high); 4020 4021 bind(L_check_1); 4022 4023 addi(idx, idx, 0x2); 4024 andi_(idx, idx, 0x1) ; 4025 addic_(idx, idx, -1); 4026 blt(CCR0, L_post_third_loop_done); 4027 4028 sldi(tmp, idx, LogBytesPerInt); 4029 lwzx(yz_idx, y, tmp); 4030 multiply64(product_high, product, x_xstart, yz_idx); 4031 lwzx(yz_idx, z, tmp); 4032 4033 add2_with_carry(product_high, product, yz_idx, carry); 4034 4035 sldi(tmp, idx, LogBytesPerInt); 4036 stwx(product, z, tmp); 4037 srdi(product, product, 32); 4038 4039 sldi(product_high, product_high, 32); 4040 orr(product, product, product_high); 4041 mr_if_needed(carry, product); 4042 4043 bind(L_post_third_loop_done); 4044 } // multiply_128_x_128_loop 4045 4046 void MacroAssembler::multiply_to_len(Register x, Register xlen, 4047 Register y, Register ylen, 4048 Register z, Register zlen, 4049 Register tmp1, Register tmp2, 4050 Register tmp3, Register tmp4, 4051 Register tmp5, Register tmp6, 4052 Register tmp7, Register tmp8, 4053 Register tmp9, Register tmp10, 4054 Register tmp11, Register tmp12, 4055 Register tmp13) { 4056 4057 ShortBranchVerifier sbv(this); 4058 4059 assert_different_registers(x, xlen, y, ylen, z, zlen, 4060 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6); 4061 assert_different_registers(x, xlen, y, ylen, z, zlen, 4062 tmp1, tmp2, tmp3, tmp4, tmp5, tmp7); 4063 assert_different_registers(x, xlen, y, ylen, z, zlen, 4064 tmp1, tmp2, tmp3, tmp4, tmp5, tmp8); 4065 4066 const Register idx = tmp1; 4067 const Register kdx = tmp2; 4068 const Register xstart = tmp3; 4069 4070 const Register y_idx = tmp4; 4071 const Register carry = tmp5; 4072 const Register product = tmp6; 4073 const Register product_high = tmp7; 4074 const Register x_xstart = tmp8; 4075 const 
Register tmp = tmp9; 4076 4077 // First Loop. 4078 // 4079 // final static long LONG_MASK = 0xffffffffL; 4080 // int xstart = xlen - 1; 4081 // int ystart = ylen - 1; 4082 // long carry = 0; 4083 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) { 4084 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry; 4085 // z[kdx] = (int)product; 4086 // carry = product >>> 32; 4087 // } 4088 // z[xstart] = (int)carry; 4089 4090 mr_if_needed(idx, ylen); // idx = ylen 4091 mr_if_needed(kdx, zlen); // kdx = xlen + ylen 4092 li(carry, 0); // carry = 0 4093 4094 Label L_done; 4095 4096 addic_(xstart, xlen, -1); 4097 blt(CCR0, L_done); 4098 4099 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, 4100 carry, product_high, product, idx, kdx, tmp); 4101 4102 Label L_second_loop; 4103 4104 cmpdi(CCR0, kdx, 0); 4105 beq(CCR0, L_second_loop); 4106 4107 Label L_carry; 4108 4109 addic_(kdx, kdx, -1); 4110 beq(CCR0, L_carry); 4111 4112 // Store lower 32 bits of carry. 4113 sldi(tmp, kdx, LogBytesPerInt); 4114 stwx(carry, z, tmp); 4115 srdi(carry, carry, 32); 4116 addi(kdx, kdx, -1); 4117 4118 4119 bind(L_carry); 4120 4121 // Store upper 32 bits of carry. 4122 sldi(tmp, kdx, LogBytesPerInt); 4123 stwx(carry, z, tmp); 4124 4125 // Second and third (nested) loops. 
4126 // 4127 // for (int i = xstart-1; i >= 0; i--) { // Second loop 4128 // carry = 0; 4129 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop 4130 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) + 4131 // (z[k] & LONG_MASK) + carry; 4132 // z[k] = (int)product; 4133 // carry = product >>> 32; 4134 // } 4135 // z[i] = (int)carry; 4136 // } 4137 // 4138 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx 4139 4140 bind(L_second_loop); 4141 4142 li(carry, 0); // carry = 0; 4143 4144 addic_(xstart, xstart, -1); // i = xstart-1; 4145 blt(CCR0, L_done); 4146 4147 Register zsave = tmp10; 4148 4149 mr(zsave, z); 4150 4151 4152 Label L_last_x; 4153 4154 sldi(tmp, xstart, LogBytesPerInt); 4155 add(z, z, tmp); // z = z + k - j 4156 addi(z, z, 4); 4157 addic_(xstart, xstart, -1); // i = xstart-1; 4158 blt(CCR0, L_last_x); 4159 4160 sldi(tmp, xstart, LogBytesPerInt); 4161 ldx(x_xstart, x, tmp); 4162 #ifdef VM_LITTLE_ENDIAN 4163 rldicl(x_xstart, x_xstart, 32, 0); 4164 #endif 4165 4166 4167 Label L_third_loop_prologue; 4168 4169 bind(L_third_loop_prologue); 4170 4171 Register xsave = tmp11; 4172 Register xlensave = tmp12; 4173 Register ylensave = tmp13; 4174 4175 mr(xsave, x); 4176 mr(xlensave, xstart); 4177 mr(ylensave, ylen); 4178 4179 4180 multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen, 4181 carry, product_high, product, x, tmp); 4182 4183 mr(z, zsave); 4184 mr(x, xsave); 4185 mr(xlen, xlensave); // This is the decrement of the loop counter! 4186 mr(ylen, ylensave); 4187 4188 addi(tmp3, xlen, 1); 4189 sldi(tmp, tmp3, LogBytesPerInt); 4190 stwx(carry, z, tmp); 4191 addic_(tmp3, tmp3, -1); 4192 blt(CCR0, L_done); 4193 4194 srdi(carry, carry, 32); 4195 sldi(tmp, tmp3, LogBytesPerInt); 4196 stwx(carry, z, tmp); 4197 b(L_second_loop); 4198 4199 // Next infrequent code is moved outside loops. 
// Out-of-line path bound at L_last_x: load one 32-bit word from x and rejoin at L_third_loop_prologue.
  bind(L_last_x);

  lwz(x_xstart, 0, x);
  b(L_third_loop_prologue);

  bind(L_done);
}   // multiply_to_len

// Emit a debug-only check of the condition currently held in CCR0.
//
// check_equal: if true, the emitted branch skips the stop when CCR0 is "equal" (beq);
//              if false, it skips the stop when CCR0 is "not equal" (bne).
// msg, id:     forwarded to stop() for the failing path.
//
// Nothing is emitted unless ASSERT is defined.
void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
#ifdef ASSERT
  Label ok;
  if (check_equal) {
    beq(CCR0, ok);
  } else {
    bne(CCR0, ok);
  }
  stop(msg, id);
  bind(ok);
#endif
}

// Emit a debug-only check of a memory cell against zero.
//
// Loads `size` bytes (4 or 8) from mem_offset(mem_base) into R0, compares the value
// with 0 into CCR0 and then hands over to asm_assert(check_equal, msg, id).
// Any other size ends in ShouldNotReachHere() at code-generation time.
// The emitted code overwrites R0 and CCR0.
//
// Nothing is emitted unless ASSERT is defined.
void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
                                          Register mem_base, const char* msg, int id) {
#ifdef ASSERT
  switch (size) {
    case 4:
      // 32-bit cell: word load, word compare.
      lwz(R0, mem_offset, mem_base);
      cmpwi(CCR0, R0, 0);
      break;
    case 8:
      // 64-bit cell: doubleword load, doubleword compare.
      ld(R0, mem_offset, mem_base);
      cmpdi(CCR0, R0, 0);
      break;
    default:
      ShouldNotReachHere();
  }
  asm_assert(check_equal, msg, id);
#endif // ASSERT
}

// Thread verification is not available on this platform: if the VerifyThread
// flag is set, report it via unimplemented().
void MacroAssembler::verify_thread() {
  if (VerifyThread) {
    unimplemented("'VerifyThread' currently not implemented on PPC");
  }
}

// Emit a call to the verify-oop stub (only when VerifyOops is set).
//
// oop: register holding the value to verify; passed to the stub in R4_ARG2.
// msg: message string; its address is passed to the stub in R3_ARG1.
//
// READ: oop. KILL: R0. Volatile floats perhaps.
void MacroAssembler::verify_oop(Register oop, const char* msg) {
  if (!VerifyOops) {
    return;
  }

  // Address of the location from which the stub's entry (descriptor) is loaded below.
  address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
  const Register tmp = R11; // Will be preserved.
  const int nbytes_save = 11*8; // Volatile gprs except R0.
  save_volatile_gprs(R1_SP, -nbytes_save); // except R0

  // tmp is used as scratch from here on and later receives the call target,
  // so an oop that lives in tmp has to be copied to its argument register first.
  if (oop == tmp) mr(R4_ARG2, oop);
  save_LR_CR(tmp); // save in old frame
  push_frame_reg_args(nbytes_save, tmp);
  // load FunctionDescriptor** / entry_address *
  load_const_optimized(tmp, fd, R0);
  // load FunctionDescriptor* / entry_address
  ld(tmp, 0, tmp);
  // The oop == tmp case was already handled above, before tmp was overwritten.
  if (oop != tmp) mr_if_needed(R4_ARG2, oop);
  load_const_optimized(R3_ARG1, (address)msg, R0);
  // Call destination for its side effect.
  call_c(tmp);

  // Undo the frame and register saves in reverse order.
  pop_frame();
  restore_LR_CR(tmp);
  restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
}

// Printable names for the stop kinds; indexed by (type % number of entries)
// in stop_on_request() and stop().
const char* stop_types[] = {
  "stop",
  "untested",
  "unimplemented",
  "shouldnotreachhere"
};

// Runtime target of the code emitted by MacroAssembler::stop(): prints the stop
// kind and message to tty and then fails via guarantee(false, ...).
// The literal 4 is the number of entries in stop_types (annotated as stop_end).
static void stop_on_request(int tp, const char* msg) {
  tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg);
  guarantee(false, "PPC assembly code requires stop: %s", msg);
}

// Call a C-function that prints output.
//
// Emits: load of `type` into R3_ARG1 and of the address of `msg` into R4_ARG2
// (R0 is passed as temp), a leaf call to stop_on_request, an illtrap, and finally
// `id` as a raw 32-bit word in the instruction stream.
void MacroAssembler::stop(int type, const char* msg, int id) {
#ifndef PRODUCT
  // Non-product builds put the stop kind and message into the block comment.
  block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));
#else
  block_comment("stop {");
#endif

  // setup arguments
  load_const_optimized(R3_ARG1, type);
  load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
  illtrap();
  emit_int32(id);
  block_comment("} stop;");
}

#ifndef PRODUCT
// Write pattern 0x0101010101010101 in memory region [low-before, high+after].
// before and after are counts of words (they are scaled by BytesPerWord).
// Val, addr are temp registers.
// If low == addr, addr is killed.
// High is preserved.
4308 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) { 4309 if (!ZapMemory) return; 4310 4311 assert_different_registers(low, val); 4312 4313 BLOCK_COMMENT("zap memory region {"); 4314 load_const_optimized(val, 0x0101010101010101); 4315 int size = before + after; 4316 if (low == high && size < 5 && size > 0) { 4317 int offset = -before*BytesPerWord; 4318 for (int i = 0; i < size; ++i) { 4319 std(val, offset, low); 4320 offset += (1*BytesPerWord); 4321 } 4322 } else { 4323 addi(addr, low, -before*BytesPerWord); 4324 assert_different_registers(high, val); 4325 if (after) addi(high, high, after * BytesPerWord); 4326 Label loop; 4327 bind(loop); 4328 std(val, 0, addr); 4329 addi(addr, addr, 8); 4330 cmpd(CCR6, addr, high); 4331 ble(CCR6, loop); 4332 if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value. 4333 } 4334 BLOCK_COMMENT("} zap memory region"); 4335 } 4336 4337 #endif // !PRODUCT 4338 4339 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() { 4340 int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true); 4341 assert(sizeof(bool) == 1, "PowerPC ABI"); 4342 masm->lbz(temp, simm16_offset, temp); 4343 masm->cmpwi(CCR0, temp, 0); 4344 masm->beq(CCR0, _label); 4345 } 4346 4347 SkipIfEqualZero::~SkipIfEqualZero() { 4348 _masm->bind(_label); 4349 }