/*
 * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2012, 2015 SAP AG. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "compiler/disassembler.hpp"
#include "gc/shared/cardTableModRefBS.hpp"
#include "gc/shared/collectedHeap.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/resourceArea.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/os.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/macros.hpp"
#if INCLUDE_ALL_GCS
#include "gc/g1/g1CollectedHeap.inline.hpp"
#include "gc/g1/g1SATBCardTableModRefBS.hpp"
#include "gc/g1/heapRegion.hpp"
#endif // INCLUDE_ALL_GCS

#ifdef PRODUCT
#define BLOCK_COMMENT(str) // nothing
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef ASSERT
// On RISC, there's no benefit to verifying instruction boundaries.
bool AbstractAssembler::pd_check_instruction_mark() { return false; }
#endif

void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  if (Assembler::is_simm(si31, 16)) {
    ld(d, si31, a);
    if (emit_filler_nop) nop();
  } else {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
    addis(d, a, hi);
    ld(d, lo, d);
  }
}

void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  assert_different_registers(d, a);
  ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
}

void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                      size_t size_in_bytes, bool is_signed) {
  switch (size_in_bytes) {
  case  8: ld(dst, offs, base);                                     break;
  case  4: is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  case  2: is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  case  1: lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  default: ShouldNotReachHere();
  }
}

void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                       size_t size_in_bytes) {
  switch (size_in_bytes) {
  case  8: std(dst, offs, base); break;
  case  4: stw(dst, offs, base); break;
  case  2: sth(dst, offs, base); break;
  case  1: stb(dst, offs, base); break;
  default: ShouldNotReachHere();
  }
}
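// Illustrative use of the sized accessors (a sketch, not original code; the
// offsets and registers are hypothetical): loading a signed Java short and
// storing a byte through the same interface.
//
//   __ load_sized_value(R5, some_short_offset, R3, 2, /*is_signed=*/true); // emits lha
//   __ store_sized_value(R5, some_byte_offset, R3, 1);                     // emits stb
//
// Note that the signed byte case of load_sized_value needs lbz+extsb because
// PPC has no load-byte-algebraic instruction.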
void MacroAssembler::align(int modulus, int max, int rem) {
  int padding = (rem + modulus - (offset() % modulus)) % modulus;
  if (padding > max) return;
  for (int c = (padding >> 2); c > 0; --c) { nop(); }
}

// Issue instructions that calculate given TOC from global TOC.
void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
                                                       bool add_relocation, bool emit_dummy_addr) {
  int offset = -1;
  if (emit_dummy_addr) {
    offset = -128; // dummy address
  } else if (addr != (address)(intptr_t)-1) {
    offset = MacroAssembler::offset_to_global_toc(addr);
  }

  if (hi16) {
    addis(dst, R29, MacroAssembler::largeoffset_si16_si16_hi(offset));
  }
  if (lo16) {
    if (add_relocation) {
      // Relocate at the addi to avoid confusion with a load from the method's TOC.
      relocate(internal_word_Relocation::spec(addr));
    }
    addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
  }
}

int MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
  const int offset = MacroAssembler::offset_to_global_toc(addr);

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
  set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
  set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
  return (int)((intptr_t)addr - (intptr_t)inst1_addr);
}
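// The sequence being patched above is the two-instruction form emitted by
// calculate_address_from_global_toc (a sketch; dst and the split offset are
// illustrative):
//
//   addis dst, R29, offset_hi   // R29 holds the global TOC
//   addi  dst, dst, offset_lo   // the relocation points here
//
// largeoffset_si16_si16_hi compensates for the sign of the low half, so
// offset_hi*0x10000 + (signed short)offset_lo reconstructs the full offset.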
address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");

  int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
  // -1 is a special case
  if (offset == -1) {
    return (address)(intptr_t)-1;
  } else {
    return global_toc() + offset;
  }
}

#ifdef _LP64
// Patch compressed oops or klass constants.
// Assembler sequence is
// 1) compressed oops:
//    lis  rx = const.hi
//    ori  rx = rx | const.lo
// 2) compressed klass:
//    lis    rx = const.hi
//    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
//    ori    rx = rx | const.lo
// A clrldi instruction is skipped over when searching for the lis.
int MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;
  while (inst1_addr >= bound) {
    inst1 = *(int *)inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  int xc = (data >> 16) & 0xffff;
  int xd = (data >>  0) & 0xffff;

  set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
  set_imm((int *)inst2_addr, (xd)); // unsigned int
  return (int)((intptr_t)inst2_addr - (intptr_t)inst1_addr);
}
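// Worked example (illustrative values): for narrowOop data == 0x789abcde the
// patch above sets xc == 0x789a into the lis and xd == 0xbcde into the ori,
// reassembling to (0x789a << 16) | 0xbcde in rx. The optional clrldi between
// the two instructions is left untouched; it only clears the upper 32 bits.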
// Get compressed oop or klass constant.
narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;

  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
  uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);

  return (int) (xl | xh);
}
#endif // _LP64

void MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a, Register toc) {
  int toc_offset = 0;
  // Use RelocationHolder::none for the constant pool entry, otherwise
  // we will end up with a failing NativeCall::verify(x) where x is
  // the address of the constant pool entry.
  // FIXME: We should insert relocation information for oops at the constant
  // pool entries instead of inserting it at the loads; patching of a constant
  // pool entry should be less expensive.
  address oop_address = address_constant((address)a.value(), RelocationHolder::none);
  // Relocate at the pc of the load.
  relocate(a.rspec());
  toc_offset = (int)(oop_address - code()->consts()->start());
  ld_largeoffset_unchecked(dst, toc_offset, toc, true);
}

bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  // The relocation points to the ld or the addis.
  return (is_ld(inst1)) ||
         (is_addis(inst1) && inv_ra_field(inst1) != 0);
}

int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
  assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");

  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  if (is_ld(inst1)) {
    return inv_d1_field(inst1);
  } else if (is_addis(inst1)) {
    const int dst = inv_rt_field(inst1);

    // Now, find the succeeding ld which reads and writes to dst.
    address inst2_addr = inst1_addr + BytesPerInstWord;
    int inst2 = 0;
    while (true) {
      inst2 = *(int *) inst2_addr;
      if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
        // Stop, found the ld which reads and writes dst.
        break;
      }
      inst2_addr += BytesPerInstWord;
    }
    return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
  }
  ShouldNotReachHere();
  return 0;
}

// Get the constant from a `load_const' sequence.
long MacroAssembler::get_const(address a) {
  assert(is_load_const_at(a), "not a load of a constant");
  const int *p = (const int*) a;
  unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
  if (is_ori(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
  } else if (is_lis(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
  } else {
    ShouldNotReachHere();
    return (long) 0;
  }
  return (long) x;
}
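// Layout decoded by get_const above, for the ori variant (a sketch; the
// shift instruction at index 2 carries no immediate and is never read):
//
//   0: lis  d, x>>48              1: ori  d, d, (x>>32) & 0xffff
//   2: sldi d, d, 32              3: oris d, d, (x>>16) & 0xffff
//   4: ori  d, d, x & 0xffff
//
// In the lis variant (note that instruction 1 is a second lis) the high and
// low 32-bit halves are built in parallel in two registers and merged
// afterwards, which is why its immediates sit at indices 0, 2, 1 and 3.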
// Patch the 64 bit constant of a `load_const' sequence. This is a low
// level procedure. It neither flushes the instruction cache nor is it
// mt safe.
void MacroAssembler::patch_const(address a, long x) {
  assert(is_load_const_at(a), "not a load of a constant");
  int *p = (int*) a;
  if (is_ori(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(1 + p, (x >> 32) & 0xffff);
    set_imm(3 + p, (x >> 16) & 0xffff);
    set_imm(4 + p, x & 0xffff);
  } else if (is_lis(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(2 + p, (x >> 32) & 0xffff);
    set_imm(1 + p, (x >> 16) & 0xffff);
    set_imm(3 + p, x & 0xffff);
  } else {
    ShouldNotReachHere();
  }
}
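// Because patch_const is neither MT-safe nor icache-flushing, a caller has
// to take care of both itself. A minimal sketch (assuming the code is not
// concurrently executed; `sequence_size' stands in for the byte size of the
// five-instruction sequence):
//
//   MacroAssembler::patch_const(code_addr, (long)new_value);
//   ICache::ppc64_flush_icache_bytes(code_addr, sequence_size);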
AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->allocate_metadata_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->allocate_oop_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp, int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0) {
    return RegisterOrConstant(value + offset);
  }

  // Load indirectly to solve generation ordering problem.
  // static address, no relocation
  int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
  ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)

  if (offset != 0) {
    addi(tmp, tmp, offset);
  }

  return RegisterOrConstant(tmp);
}

#ifndef PRODUCT
void MacroAssembler::pd_print_patched_instruction(address branch) {
  Unimplemented(); // TODO: PPC port
}
#endif // ndef PRODUCT

// Conditional far branch for destinations encodable in 24+2 bits.
void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {

  // If requested by flag optimize, relocate the bc_far as a
  // runtime_call and prepare for optimizing it when the code gets
  // relocated.
  if (optimize == bc_far_optimize_on_relocate) {
    relocate(relocInfo::runtime_call_type);
  }

  // variant 2:
  //
  //    b!cxx SKIP
  //    bxx   DEST
  //  SKIP:
  //

  const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                opposite_bcond(inv_boint_bcond(boint)));

  // We emit two branches.
  // First, a conditional branch which jumps around the far branch.
  const address not_taken_pc = pc() + 2 * BytesPerInstWord;
  const address bc_pc = pc();
  bc(opposite_boint, biint, not_taken_pc);

  const int bc_instr = *(int*)bc_pc;
  assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
  assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
  assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
                                     opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
         "postcondition");
  assert(biint == inv_bi_field(bc_instr), "postcondition");

  // Second, an unconditional far branch which jumps to dest.
  // Note: target(dest) remembers the current pc (see CodeSection::target)
  // and returns the current pc if the label is not bound yet; when
  // the label gets bound, the unconditional far branch will be patched.
  const address target_pc = target(dest);
  const address b_pc = pc();
  b(target_pc);

  assert(not_taken_pc == pc(),                 "postcondition");
  assert(dest.is_bound() || target_pc == b_pc, "postcondition");
}
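// The three bc_far code shapes that the variant predicates used below
// distinguish (summarized from the comments in set_dest_of_bc_far_at):
//
//   variant 1:  bcxx  DEST      // destination fits the bc displacement
//               endgroup
//
//   variant 2:  b!cxx SKIP      // inverted condition jumps over...
//               bxx   DEST      // ...an unconditional far branch
//             SKIP:
//
//   variant 3:  nop             // far cond branch to the next instruction,
//               endgroup        // patched away entirely
//             SKIP/DEST: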
bool MacroAssembler::is_bc_far_at(address instruction_addr) {
  return is_bc_far_variant1_at(instruction_addr) ||
         is_bc_far_variant2_at(instruction_addr) ||
         is_bc_far_variant3_at(instruction_addr);
}

address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
  if (is_bc_far_variant1_at(instruction_addr)) {
    const address instruction_1_addr = instruction_addr;
    const int instruction_1 = *(int*)instruction_1_addr;
    return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
  } else if (is_bc_far_variant2_at(instruction_addr)) {
    const address instruction_2_addr = instruction_addr + 4;
    return bxx_destination(instruction_2_addr);
  } else if (is_bc_far_variant3_at(instruction_addr)) {
    return instruction_addr + 8;
  }
  // variant 4 ???
  ShouldNotReachHere();
  return NULL;
}

void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {

  if (is_bc_far_variant3_at(instruction_addr)) {
    // variant 3, far cond branch to the next instruction, already patched to nops:
    //
    //    nop
    //    endgroup
    //  SKIP/DEST:
    //
    return;
  }

  // first, extract boint and biint from the current branch
  int boint = 0;
  int biint = 0;

  ResourceMark rm;
  const int code_size = 2 * BytesPerInstWord;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
    // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
    masm.nop();
    masm.endgroup();
  } else {
    if (is_bc_far_variant1_at(instruction_addr)) {
      // variant 1, the 1st instruction contains the destination address:
      //
      //    bcxx  DEST
      //    endgroup
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = inv_bo_field(instruction_1);
      biint = inv_bi_field(instruction_1);
    } else if (is_bc_far_variant2_at(instruction_addr)) {
      // variant 2, the 2nd instruction contains the destination address:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
                                 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
      biint = inv_bi_field(instruction_1);
    } else {
      // variant 4???
      ShouldNotReachHere();
    }

    // second, set the new branch destination and optimize the code
    if (dest != instruction_addr + 4 && // the bc_far is still unbound!
        masm.is_within_range_of_bcxx(dest, instruction_addr)) {
      // variant 1:
      //
      //    bcxx  DEST
      //    endgroup
      //
      masm.bc(boint, biint, dest);
      masm.endgroup();
    } else {
      // variant 2:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                    opposite_bcond(inv_boint_bcond(boint)));
      const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
      masm.bc(opposite_boint, biint, not_taken_pc);
      masm.b(dest);
    }
  }
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Emit a NOT mt-safe patchable 64 bit absolute call/jump.
void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
  // get current pc
  uint64_t start_pc = (uint64_t) pc();

  const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
  const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first

  // relocate here
  if (rt != relocInfo::none) {
    relocate(rt);
  }

  if ( ReoptimizeCallSequences &&
       (( link && is_within_range_of_b(dest, pc_of_bl)) ||
        (!link && is_within_range_of_b(dest, pc_of_b)))) {
    // variant 2:
    // Emit an optimized, pc-relative call/jump.

    if (link) {
      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();

      // do the call
      assert(pc() == pc_of_bl, "just checking");
      bl(dest, relocInfo::none);
    } else {
      // do the jump
      assert(pc() == pc_of_b, "just checking");
      b(dest, relocInfo::none);

      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();
    }

    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
           "can't identify emitted call");
  } else {
    // variant 1:
    mr(R0, R11);  // spill R11 -> R0.

    // Load the destination address into CTR,
    // calculate destination relative to global toc.
    calculate_address_from_global_toc(R11, dest, true, true, false);

    mtctr(R11);
    mr(R11, R0);  // spill R11 <- R0.
    nop();

    // do the call/jump
    if (link) {
      bctrl();
    } else {
      bctr();
    }
    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
           "can't identify emitted call");
  }

  // Assert that we can identify the emitted call/jump.
  assert(is_bxx64_patchable_at((address)start_pc, link),
         "can't identify emitted call");
  assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
         "wrong encoding of dest address");
}
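// Shape of the 7-instruction patchable call site emitted above (illustrative
// summary; both shapes occupy the same number of instructions so one can be
// repatched into the other):
//
//   variant 1b (absolute, via global TOC):   variant 2 (pc-relative, link):
//     mr    R0,  R11                           nop      (x6)
//     addis R11, R29, dest_hi                  bl   dest
//     addi  R11, R11, dest_lo
//     mtctr R11
//     mr    R11, R0
//     nop
//     bctr[l]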
// Identify a bxx64_patchable instruction.
bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
  return is_bxx64_patchable_variant1b_at(instruction_addr, link)
    //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
      || is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Does the call64_patchable instruction use a pc-relative encoding of
// the call destination?
bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
  // variant 2 is pc-relative
  return is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Identify variant 1.
bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[5]) // mtctr
      && is_load_const_at(instruction_addr);
}

// Identify variant 1b: load destination relative to global toc.
bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[3]) // mtctr
      && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
}

// Identify variant 2.
bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  if (link) {
    return is_bl (instr[6])  // bl dest is last
        && is_nop(instr[0])  // nop
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5]); // nop
  } else {
    return is_b  (instr[0])  // b dest is first
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5])  // nop
        && is_nop(instr[6]); // nop
  }
}

// Set dest address of a bxx64_patchable instruction.
void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
  ResourceMark rm;
  int code_size = MacroAssembler::bxx64_patchable_size;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  masm.bxx64_patchable(dest, relocInfo::none, link);
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}
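// Illustrative repatch of such a call site (a sketch; `call_site' and
// `new_dest' are hypothetical): the helper re-emits the whole 7-instruction
// sequence over the old one and flushes the icache itself, so the caller
// needs no separate flush. It is not MT-safe, though (see bxx64_patchable
// above), so concurrent execution of the affected code must be ruled out:
//
//   MacroAssembler::set_dest_of_bxx64_patchable_at(call_site, new_dest, /*link=*/true);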
// Get dest address of a bxx64_patchable instruction.
address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
  if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
    return (address) (unsigned long) get_const(instruction_addr);
  } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
    unsigned int* instr = (unsigned int*) instruction_addr;
    if (link) {
      const int instr_idx = 6; // bl is last
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    } else {
      const int instr_idx = 0; // b is first
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    }
  // Load dest relative to global toc.
  } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
    return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
                                                               instruction_addr);
  } else {
    ShouldNotReachHere();
    return NULL;
  }
}

// Uses ordering which corresponds to ABI:
//    _savegpr0_14:  std  r14,-144(r1)
//    _savegpr0_15:  std  r15,-136(r1)
//    _savegpr0_16:  std  r16,-128(r1)
void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
  std(R14, offset, dst);   offset += 8;
  std(R15, offset, dst);   offset += 8;
  std(R16, offset, dst);   offset += 8;
  std(R17, offset, dst);   offset += 8;
  std(R18, offset, dst);   offset += 8;
  std(R19, offset, dst);   offset += 8;
  std(R20, offset, dst);   offset += 8;
  std(R21, offset, dst);   offset += 8;
  std(R22, offset, dst);   offset += 8;
  std(R23, offset, dst);   offset += 8;
  std(R24, offset, dst);   offset += 8;
  std(R25, offset, dst);   offset += 8;
  std(R26, offset, dst);   offset += 8;
  std(R27, offset, dst);   offset += 8;
  std(R28, offset, dst);   offset += 8;
  std(R29, offset, dst);   offset += 8;
  std(R30, offset, dst);   offset += 8;
  std(R31, offset, dst);   offset += 8;

  stfd(F14, offset, dst);   offset += 8;
  stfd(F15, offset, dst);   offset += 8;
  stfd(F16, offset, dst);   offset += 8;
  stfd(F17, offset, dst);   offset += 8;
  stfd(F18, offset, dst);   offset += 8;
  stfd(F19, offset, dst);   offset += 8;
  stfd(F20, offset, dst);   offset += 8;
  stfd(F21, offset, dst);   offset += 8;
  stfd(F22, offset, dst);   offset += 8;
  stfd(F23, offset, dst);   offset += 8;
  stfd(F24, offset, dst);   offset += 8;
  stfd(F25, offset, dst);   offset += 8;
  stfd(F26, offset, dst);   offset += 8;
  stfd(F27, offset, dst);   offset += 8;
  stfd(F28, offset, dst);   offset += 8;
  stfd(F29, offset, dst);   offset += 8;
  stfd(F30, offset, dst);   offset += 8;
  stfd(F31, offset, dst);
}
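// A size sketch for callers reserving the spill area: the block above stores
// 18 GPRs (R14-R31) and 18 FPRs (F14-F31) at 8 bytes each, i.e. 36 * 8 = 288
// contiguous bytes starting at `offset'. restore_nonvolatile_gprs below walks
// the same layout, so the two must be kept in sync.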
// Uses ordering which corresponds to ABI:
//    _restgpr0_14:  ld   r14,-144(r1)
//    _restgpr0_15:  ld   r15,-136(r1)
//    _restgpr0_16:  ld   r16,-128(r1)
void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
  ld(R14, offset, src);   offset += 8;
  ld(R15, offset, src);   offset += 8;
  ld(R16, offset, src);   offset += 8;
  ld(R17, offset, src);   offset += 8;
  ld(R18, offset, src);   offset += 8;
  ld(R19, offset, src);   offset += 8;
  ld(R20, offset, src);   offset += 8;
  ld(R21, offset, src);   offset += 8;
  ld(R22, offset, src);   offset += 8;
  ld(R23, offset, src);   offset += 8;
  ld(R24, offset, src);   offset += 8;
  ld(R25, offset, src);   offset += 8;
  ld(R26, offset, src);   offset += 8;
  ld(R27, offset, src);   offset += 8;
  ld(R28, offset, src);   offset += 8;
  ld(R29, offset, src);   offset += 8;
  ld(R30, offset, src);   offset += 8;
  ld(R31, offset, src);   offset += 8;

  // FP registers
  lfd(F14, offset, src);   offset += 8;
  lfd(F15, offset, src);   offset += 8;
  lfd(F16, offset, src);   offset += 8;
  lfd(F17, offset, src);   offset += 8;
  lfd(F18, offset, src);   offset += 8;
  lfd(F19, offset, src);   offset += 8;
  lfd(F20, offset, src);   offset += 8;
  lfd(F21, offset, src);   offset += 8;
  lfd(F22, offset, src);   offset += 8;
  lfd(F23, offset, src);   offset += 8;
  lfd(F24, offset, src);   offset += 8;
  lfd(F25, offset, src);   offset += 8;
  lfd(F26, offset, src);   offset += 8;
  lfd(F27, offset, src);   offset += 8;
  lfd(F28, offset, src);   offset += 8;
  lfd(F29, offset, src);   offset += 8;
  lfd(F30, offset, src);   offset += 8;
  lfd(F31, offset, src);
}

// For verify_oops.
void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
  std(R2,  offset, dst);   offset += 8;
  std(R3,  offset, dst);   offset += 8;
  std(R4,  offset, dst);   offset += 8;
  std(R5,  offset, dst);   offset += 8;
  std(R6,  offset, dst);   offset += 8;
  std(R7,  offset, dst);   offset += 8;
  std(R8,  offset, dst);   offset += 8;
  std(R9,  offset, dst);   offset += 8;
  std(R10, offset, dst);   offset += 8;
  std(R11, offset, dst);   offset += 8;
  std(R12, offset, dst);
}

// For verify_oops.
void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
  ld(R2,  offset, src);   offset += 8;
  ld(R3,  offset, src);   offset += 8;
  ld(R4,  offset, src);   offset += 8;
  ld(R5,  offset, src);   offset += 8;
  ld(R6,  offset, src);   offset += 8;
  ld(R7,  offset, src);   offset += 8;
  ld(R8,  offset, src);   offset += 8;
  ld(R9,  offset, src);   offset += 8;
  ld(R10, offset, src);   offset += 8;
  ld(R11, offset, src);   offset += 8;
  ld(R12, offset, src);
}

void MacroAssembler::save_LR_CR(Register tmp) {
  mfcr(tmp);
  std(tmp, _abi(cr), R1_SP);
  mflr(tmp);
  std(tmp, _abi(lr), R1_SP);
  // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
}

void MacroAssembler::restore_LR_CR(Register tmp) {
  assert(tmp != R1_SP, "must be distinct");
  ld(tmp, _abi(lr), R1_SP);
  mtlr(tmp);
  ld(tmp, _abi(cr), R1_SP);
  mtcr(tmp);
}

address MacroAssembler::get_PC_trash_LR(Register result) {
  Label L;
  bl(L);
  bind(L);
  address lr_pc = pc();
  mflr(result);
  return lr_pc;
}

void MacroAssembler::resize_frame(Register offset, Register tmp) {
#ifdef ASSERT
  assert_different_registers(offset, tmp, R1_SP);
  andi_(tmp, offset, frame::alignment_in_bytes-1);
  asm_assert_eq("resize_frame: unaligned", 0x204);
#endif

  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdux(tmp, R1_SP, offset);
}

void MacroAssembler::resize_frame(int offset, Register tmp) {
  assert(is_simm(offset, 16), "too big an offset");
  assert_different_registers(tmp, R1_SP);
  assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdu(tmp, offset, R1_SP);
}
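// Worked example for resize_frame (illustrative values): with offset == -32,
// SP == 0x7f00 and *(SP) holding the caller's SP, the stdu above writes the
// back link to 0x7ee0 and sets SP = 0x7ee0 in a single instruction, growing
// the frame by 32 bytes while keeping the chain of caller frames intact; a
// positive offset shrinks the frame accordingly.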
void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
  // (addr == tmp1) || (addr == tmp2) is allowed here!
  assert(tmp1 != tmp2, "must be distinct");

  // compute offset w.r.t. current stack pointer
  // tmp_1 <- addr - SP (!)
  subf(tmp1, R1_SP, addr);

  // atomically update SP keeping back link.
  resize_frame(tmp1/* offset */, tmp2/* tmp */);
}

void MacroAssembler::push_frame(Register bytes, Register tmp) {
#ifdef ASSERT
  assert(bytes != R0, "r0 not allowed here");
  andi_(R0, bytes, frame::alignment_in_bytes-1);
  asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
#endif
  neg(tmp, bytes);
  stdux(R1_SP, R1_SP, tmp);
}

// Push a frame of size `bytes'.
void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
  long offset = align_addr(bytes, frame::alignment_in_bytes);
  if (is_simm(-offset, 16)) {
    stdu(R1_SP, -offset, R1_SP);
  } else {
    load_const(tmp, -offset);
    stdux(R1_SP, R1_SP, tmp);
  }
}

// Push a frame of size `bytes' plus abi_reg_args on top.
void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size, tmp);
}

// Set up a new C frame with a spill area for non-volatile GPRs and
// additional space for local variables.
void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
                                                      Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
}

// Pop current C frame.
void MacroAssembler::pop_frame() {
  ld(R1_SP, _abi(callers_sp), R1_SP);
}

#if defined(ABI_ELFv2)
address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the times.
  if (R12 != r_function_entry) {
    mr(R12, r_function_entry);
  }
  mtctr(R12);
  // Do a call or a branch.
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C
// calling conventions. Updates and returns _last_calls_return_pc.
address MacroAssembler::call_c(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/true);
}

// For tail calls: only branch, don't link, so callee returns to caller of this function.
address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/false);
}

address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
  load_const(R12, function_entry, R0);
  return branch_to(R12, /*and_link=*/true);
}
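// Illustrative ELFv2 call site (a sketch; `some_c_function' is hypothetical):
// on ELFv2 the entry point itself plays the role of the function descriptor,
// and R12 should hold it at the call so the callee can derive its TOC:
//
//   __ load_const(R12, CAST_FROM_FN_PTR(address, some_c_function), R0);
//   __ call_c(R12);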
#else
// Generic version of a call to C function via a function descriptor
// with variable support for C calling conventions (TOC, ENV, etc.).
// Updates and returns _last_calls_return_pc.
address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
                                  bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
  // we emit standard ptrgl glue code here
  assert((function_descriptor != R0), "function_descriptor cannot be R0");

  // retrieve necessary entries from the function descriptor
  ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
  mtctr(R0);

  if (load_toc_of_callee) {
    ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
  }
  if (load_env_of_callee) {
    ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
  } else if (load_toc_of_callee) {
    li(R11, 0);
  }

  // do a call or a branch
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}
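// For reference (an ELFv1 ABI assumption, not spelled out in this file): a
// function descriptor is three consecutive pointer-sized slots
// { entry, toc, env }, so the three ld instructions above read at offsets
// 0, 8 and 16 from the descriptor base.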
// Call a C function via a function descriptor and use full C calling
// conventions.
// We don't use the TOC in generated code, so there is no need to save
// and restore its value.
address MacroAssembler::call_c(Register fd) {
  return branch_to(fd, /*and_link=*/true,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c_and_return_to_caller(Register fd) {
  return branch_to(fd, /*and_link=*/false,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
  if (rt != relocInfo::none) {
    // this call needs to be relocatable
    if (!ReoptimizeCallSequences
        || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
        || fd == NULL   // support code-size estimation
        || !fd->is_friend_function()
        || fd->entry() == NULL) {
      // it's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);

      bool has_env = (fd != NULL && fd->env() != NULL);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/has_env);
    } else {
      // It's a friend function. Load the entry point and don't care about
      // toc and env. Use an optimizable call instruction, but ensure the
      // same code-size as in the case of a non-friend function.
      nop();
      nop();
      nop();
      bl64_patchable(fd->entry(), rt);
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  } else {
    // This call does not need to be relocatable, do more aggressive
    // optimizations.
    if (!ReoptimizeCallSequences
        || !fd->is_friend_function()) {
      // It's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/true);
    } else {
      // it's a friend function, load the entry point and don't care about
      // toc and env.
      address dest = fd->entry();
      if (is_within_range_of_b(dest, pc())) {
        bl(dest);
      } else {
        bl64_patchable(dest, rt);
      }
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  }
}

// Call a C function. All constants needed reside in TOC.
//
// Read the address to call from the TOC.
// Read env from TOC, if fd specifies an env.
// Read new TOC from TOC.
address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
                                         relocInfo::relocType rt, Register toc) {
  if (!ReoptimizeCallSequences
      || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
      || !fd->is_friend_function()) {
    // It's not a friend function as defined by class FunctionDescriptor,
    // so do a full call-c here.
    assert(fd->entry() != NULL, "function must be linked");

    AddressLiteral fd_entry(fd->entry());
    load_const_from_method_toc(R11, fd_entry, toc);
    mtctr(R11);
    if (fd->env() == NULL) {
      li(R11, 0);
      nop();
    } else {
      AddressLiteral fd_env(fd->env());
      load_const_from_method_toc(R11, fd_env, toc);
    }
    AddressLiteral fd_toc(fd->toc());
    load_toc_from_toc(R2_TOC, fd_toc, toc);
    // R2_TOC is killed.
    bctrl();
    _last_calls_return_pc = pc();
  } else {
    // It's a friend function, load the entry point and don't care about
    // toc and env. Use an optimizable call instruction, but ensure the
    // same code-size as in the case of a non-friend function.
    nop();
    bl64_patchable(fd->entry(), rt);
    _last_calls_return_pc = pc();
  }
  return _last_calls_return_pc;
}
#endif // ABI_ELFv2

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register last_java_sp,
                                  address  entry_point,
                                  bool     check_exceptions) {
  BLOCK_COMMENT("call_VM {");
  // Determine last_java_sp register.
  if (!last_java_sp->is_valid()) {
    last_java_sp = R1_SP;
  }
  set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);

  // ARG1 must hold thread address.
  mr(R3_ARG1, R16_thread);
#if defined(ABI_ELFv2)
  address return_pc = call_c(entry_point, relocInfo::none);
#else
  address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
#endif

  reset_last_Java_frame();

  // Check for pending exceptions.
  if (check_exceptions) {
    // We don't check for exceptions here.
    ShouldNotReachHere();
  }

  // Get oop result if there is one and reset the value in the thread.
  if (oop_result->is_valid()) {
    get_vm_result(oop_result);
  }

  _last_calls_return_pc = return_pc;
  BLOCK_COMMENT("} call_VM");
}

void MacroAssembler::call_VM_leaf_base(address entry_point) {
  BLOCK_COMMENT("call_VM_leaf {");
#if defined(ABI_ELFv2)
  call_c(entry_point, relocInfo::none);
#else
  call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
#endif
  BLOCK_COMMENT("} call_VM_leaf");
}
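// Illustrative leaf call (a sketch; the runtime entry chosen here is just an
// example of a plausible target): the call_VM_leaf overloads below do no
// last_Java_frame bookkeeping, so they only suit C functions that neither
// block nor walk the Java stack:
//
//   __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
//                   R16_thread, R19_method);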
void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
  call_VM_base(oop_result, noreg, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread.
  mr_if_needed(R4_ARG2, arg_1);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  mr_if_needed(R6_ARG4, arg_3);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM_leaf(address entry_point) {
  call_VM_leaf_base(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
  mr_if_needed(R3_ARG1, arg_1);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_3);
  call_VM_leaf(entry_point);
}
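// A note on the "smashed argument" asserts above: the moves fill the argument
// registers in ascending order, so a later source register must not alias an
// earlier destination. For example (a sketch), call_VM_leaf(ep, R5, R3_ARG1)
// would copy R5 into R3_ARG1 first and thereby destroy arg_2 before it is
// read; the assert catches exactly this.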
// Check whether instruction is a read access to the polling page
// which was emitted by load_from_polling_page(..).
bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
                                               address* polling_address_ptr) {
  if (!is_ld(instruction))
    return false; // It's not a ld. Fail.

  int rt = inv_rt_field(instruction);
  int ra = inv_ra_field(instruction);
  int ds = inv_ds_field(instruction);
  if (!(ds == 0 && ra != 0 && rt == 0)) {
    return false; // It's not a ld(r0, X, ra). Fail.
  }

  if (!ucontext) {
    // Set polling address.
    if (polling_address_ptr != NULL) {
      *polling_address_ptr = NULL;
    }
    return true; // No ucontext given. Can't check value of ra. Assume true.
  }

#ifdef LINUX
  // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
  ucontext_t* uc = (ucontext_t*) ucontext;
  // Set polling address.
  address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
  if (polling_address_ptr != NULL) {
    *polling_address_ptr = addr;
  }
  return os::is_poll_address(addr);
#else
  // Not on Linux, ucontext must be NULL.
  ShouldNotReachHere();
  return false;
#endif
}

bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;

  if (is_stwx(instruction) || is_stwux(instruction)) {
    int ra = inv_ra_field(instruction);
    int rb = inv_rb_field(instruction);

    // look up content of ra and rb in ucontext
    address ra_val = (address)uc->uc_mcontext.regs->gpr[ra];
    long    rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    return os::is_memory_serialize_page(thread, ra_val + rb_val);
  } else if (is_stw(instruction) || is_stwu(instruction)) {
    int ra = inv_ra_field(instruction);
    int d1 = inv_d1_field(instruction);

    // look up content of ra in ucontext
    address ra_val = (address)uc->uc_mcontext.regs->gpr[ra];
    return os::is_memory_serialize_page(thread, ra_val + d1);
  } else {
    return false;
  }
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return false;
#endif
}

void MacroAssembler::bang_stack_with_offset(int offset) {
  // When increasing the stack, the old stack pointer will be written
  // to the new top of stack according to the PPC64 abi.
  // Therefore, stack banging is not necessary when increasing
  // the stack by <= os::vm_page_size() bytes.
  // When increasing the stack by a larger amount, this method is
  // called repeatedly to bang the intermediate pages.

  // Stack grows down, caller passes positive offset.
  assert(offset > 0, "must bang with positive offset");

  long stdoffset = -offset;

  if (is_simm(stdoffset, 16)) {
    // Signed 16 bit offset, a simple std is ok.
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, (int)(signed short)stdoffset, R1_SP);
    } else {
      std(R0, (int)(signed short)stdoffset, R1_SP);
    }
  } else if (is_simm(stdoffset, 31)) {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);

    Register tmp = R11;
    addis(tmp, R1_SP, hi);
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, lo, tmp);
    } else {
      std(R0, lo, tmp);
    }
  } else {
    ShouldNotReachHere();
  }
}
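// Illustrative use when setting up a large frame (a sketch; `frame_size' and
// the page-sized stride are assumptions of the example): every page between
// the old and the new SP is touched once so the OS can grow the stack
// mapping before the frame is actually pushed.
//
//   for (int off = (int)os::vm_page_size(); off <= frame_size; off += (int)os::vm_page_size()) {
//     __ bang_stack_with_offset(off);
//   }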
// If instruction is a stack bang of the form
//    std    R0,    x(Ry),       (see bang_stack_with_offset())
//    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
// or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
// return the banged address. Otherwise, return 0.
address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;
  int rs = inv_rs_field(instruction);
  int ra = inv_ra_field(instruction);
  if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
      || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
      || (is_stdu(instruction) && rs == 1)) {
    int ds = inv_ds_field(instruction);
    // return banged address
    return ds+(address)uc->uc_mcontext.regs->gpr[ra];
  } else if (is_stdux(instruction) && rs == 1) {
    int rb = inv_rb_field(instruction);
    address sp = (address)uc->uc_mcontext.regs->gpr[1];
    long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
                                  : sp + rb_val; // banged address
  }
  return NULL; // not a stack bang
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return NULL;
#endif
}

// CmpxchgX sets condition register to cmpX(current, compare).
void MacroAssembler::cmpxchgw(ConditionRegister flag, Register dest_current_value,
                              Register compare_value, Register exchange_value,
                              Register addr_base, int semantics, bool cmpxchgx_hint,
                              Register int_flag_success, bool contention_hint) {
  Label retry;
  Label failed;
  Label done;

  // Save one branch if result is returned via register and
  // result register is different from the other ones.
  bool use_result_reg    = (int_flag_success != noreg);
  bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
                            int_flag_success != exchange_value && int_flag_success != addr_base);

  // release/fence semantics
  if (semantics & MemBarRel) {
    release();
  }

  if (use_result_reg && preset_result_reg) {
    li(int_flag_success, 0); // preset (assume cas failed)
  }

  // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
  if (contention_hint) { // Don't try to reserve if cmp fails.
    lwz(dest_current_value, 0, addr_base);
    cmpw(flag, dest_current_value, compare_value);
    bne(flag, failed);
  }

  // atomic emulation loop
  bind(retry);

  lwarx(dest_current_value, addr_base, cmpxchgx_hint);
  cmpw(flag, dest_current_value, compare_value);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(flag, failed);
  } else {
    bne(                  flag, failed);
  }
  // branch to done  => (flag == ne), (dest_current_value != compare_value)
  // fall through    => (flag == eq), (dest_current_value == compare_value)

  stwcx_(exchange_value, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
  // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)

  // Result in register (must do this at the end because int_flag_success can be the
  // same register as one above).
  if (use_result_reg) {
    li(int_flag_success, 1);
  }

  if (semantics & MemBarFenceAfter) {
    fence();
  } else if (semantics & MemBarAcq) {
    isync();
  }

  if (use_result_reg && !preset_result_reg) {
    b(done);
  }

  bind(failed);
  if (use_result_reg && !preset_result_reg) {
    li(int_flag_success, 0);
  }

  bind(done);
  // (flag == ne) => (dest_current_value != compare_value), (!swapped)
  // (flag == eq) => (dest_current_value == compare_value), ( swapped)
}
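// Illustrative call (a sketch; the register assignment and the acquire-lock
// hint are just one plausible configuration): a 32-bit CAS with acquire
// semantics whose outcome is consumed via the condition register only:
//
//   __ cmpxchgw(CCR0, R7 /*current*/, R8 /*compare*/, R9 /*exchange*/, R10 /*addr*/,
//               MacroAssembler::MemBarAcq, MacroAssembler::cmpxchgx_hint_acquire_lock());
//   __ bne(CCR0, slow_path); // CAS failed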
// Performs atomic compare exchange:
//   if (compare_value == *addr_base)
//     *addr_base = exchange_value
//     int_flag_success = 1;
//   else
//     int_flag_success = 0;
//
// ConditionRegister flag       = cmp(compare_value, *addr_base)
// Register dest_current_value  = *addr_base
// Register compare_value       Used to compare with value in memory
// Register exchange_value      Written to memory if compare_value == *addr_base
// Register addr_base           The memory location to compareXChange
// Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
//
// To avoid the costly compare exchange the value is tested beforehand.
// Several special cases exist to avoid emitting unnecessary code, e.g.
// the result register is only written if the caller passed one.
//
void MacroAssembler::cmpxchgd(ConditionRegister flag,
                              Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
                              Register addr_base, int semantics, bool cmpxchgx_hint,
                              Register int_flag_success, Label* failed_ext, bool contention_hint) {
  Label retry;
  Label failed_int;
  Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
  Label done;

  // Save one branch if result is returned via register and result register is different from the other ones.
  bool use_result_reg    = (int_flag_success != noreg);
  bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value.register_or_noreg() &&
                            int_flag_success != exchange_value && int_flag_success != addr_base);
  assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");

  // release/fence semantics
  if (semantics & MemBarRel) {
    release();
  }

  if (use_result_reg && preset_result_reg) {
    li(int_flag_success, 0); // preset (assume cas failed)
  }

  // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
  if (contention_hint) { // Don't try to reserve if cmp fails.
    ld(dest_current_value, 0, addr_base);
    cmpd(flag, compare_value, dest_current_value);
    bne(flag, failed);
  }

  // atomic emulation loop
  bind(retry);

  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  cmpd(flag, compare_value, dest_current_value);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(flag, failed);
  } else {
    bne(                  flag, failed);
  }

  stdcx_(exchange_value, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
  } else {
    bne(                  CCR0, retry); // stXcx_ sets CCR0
  }

  // result in register (must do this at the end because int_flag_success can be the same register as one above)
  if (use_result_reg) {
    li(int_flag_success, 1);
  }

  // POWER6 doesn't need an isync in CAS, but we always emit it to be on
  // the safe side.
  if (semantics & MemBarFenceAfter) {
    fence();
  } else if (semantics & MemBarAcq) {
    isync();
  }

  if (use_result_reg && !preset_result_reg) {
    b(done);
  }

  bind(failed_int);
  if (use_result_reg && !preset_result_reg) {
    li(int_flag_success, 0);
  }

  bind(done);
  // (flag == ne) => (dest_current_value != compare_value), (!swapped)
  // (flag == eq) => (dest_current_value == compare_value), ( swapped)
}
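// Illustrative call (a sketch; registers and the failure label are
// hypothetical): a 64-bit CAS with release/acquire semantics that branches
// straight to a caller-provided label on failure instead of materializing a
// success value:
//
//   __ cmpxchgd(CCR0, R7 /*current*/, R8 /*compare*/, R9 /*exchange*/, R10 /*addr*/,
//               MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
//               MacroAssembler::cmpxchgx_hint_atomic_update(),
//               noreg, &cas_failed);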
// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Register sethi_temp,
                                             Label& L_no_such_interface) {
  assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable).
  int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int log_vte_size= exact_log2(vtableEntry::size() * wordSize);

  lwz(scan_temp, InstanceKlass::vtable_length_offset() * wordSize, recv_klass);
  // %%% We should store the aligned, prescaled offset in the klassoop.
  // Then the next several instructions would fold away.

  sldi(scan_temp, scan_temp, log_vte_size);
  addi(scan_temp, scan_temp, vtable_base);
  add(scan_temp, recv_klass, scan_temp);

  // Adjust recv_klass by scaled itable_index, so we can free itable_index.
  if (itable_index.is_register()) {
    Register itable_offset = itable_index.as_register();
    sldi(itable_offset, itable_offset, logMEsize);
    if (itentry_off) addi(itable_offset, itable_offset, itentry_off);
    add(recv_klass, itable_offset, recv_klass);
  } else {
    long itable_offset = (long)itable_index.as_constant();
    load_const_optimized(sethi_temp, (itable_offset<<logMEsize)+itentry_off); // static address, no relocation
    add(recv_klass, sethi_temp, recv_klass);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  for (int peel = 1; peel >= 0; peel--) {
    // %%%% Could load both offset and interface in one ldx, if they were
    // in the opposite order. This would save a load.
    ld(method_result, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);

    // Check that this entry is non-null. A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cmpd(CCR0, method_result, intf_klass);

    if (peel) {
      beq(CCR0, found_method);
    } else {
      bne(CCR0, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel) break;

    bind(search);

    cmpdi(CCR0, method_result, 0);
    beq(CCR0, L_no_such_interface);
    addi(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
  lwz(scan_temp, ito_offset, scan_temp);
  ldx(method_result, scan_temp, recv_klass);
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {

  assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());

  const int base = InstanceKlass::vtable_start_offset() * wordSize;
  assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");

  if (vtable_index.is_register()) {
    sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
    add(recv_klass, vtable_index.as_register(), recv_klass);
  } else {
    addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
  }
  ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);
}
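// For a constant vtable_index i, the addi+ld pair above computes the same
// address as a single displacement load would (an illustrative equivalence,
// valid while the combined displacement fits 16 signed bits):
//
//   ld(R19_method, base + i*wordSize + vtableEntry::method_offset_in_bytes(), recv_klass);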
1584 cmpd(CCR0, method_result, intf_klass); 1585 1586 if (peel) { 1587 beq(CCR0, found_method); 1588 } else { 1589 bne(CCR0, search); 1590 // (invert the test to fall through to found_method...) 1591 } 1592 1593 if (!peel) break; 1594 1595 bind(search); 1596 1597 cmpdi(CCR0, method_result, 0); 1598 beq(CCR0, L_no_such_interface); 1599 addi(scan_temp, scan_temp, scan_step); 1600 } 1601 1602 bind(found_method); 1603 1604 // Got a hit. 1605 int ito_offset = itableOffsetEntry::offset_offset_in_bytes(); 1606 lwz(scan_temp, ito_offset, scan_temp); 1607 ldx(method_result, scan_temp, recv_klass); 1608 } 1609 1610 // virtual method calling 1611 void MacroAssembler::lookup_virtual_method(Register recv_klass, 1612 RegisterOrConstant vtable_index, 1613 Register method_result) { 1614 1615 assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg()); 1616 1617 const int base = InstanceKlass::vtable_start_offset() * wordSize; 1618 assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 1619 1620 if (vtable_index.is_register()) { 1621 sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord); 1622 add(recv_klass, vtable_index.as_register(), recv_klass); 1623 } else { 1624 addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord); 1625 } 1626 ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass); 1627 } 1628 1629 /////////////////////////////////////////// subtype checking //////////////////////////////////////////// 1630 1631 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 1632 Register super_klass, 1633 Register temp1_reg, 1634 Register temp2_reg, 1635 Label& L_success, 1636 Label& L_failure) { 1637 1638 const Register check_cache_offset = temp1_reg; 1639 const Register cached_super = temp2_reg; 1640 1641 assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super); 1642 1643 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1644 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1645 1646 // If the pointers are equal, we are done (e.g., String[] elements). 1647 // This self-check enables sharing of secondary supertype arrays among 1648 // non-primary types such as array-of-interface. Otherwise, each such 1649 // type would need its own customized SSA. 1650 // We move this check to the front of the fast path because many 1651 // type checks are in fact trivially successful in this manner, 1652 // so we get a nicely predicted branch right at the start of the check. 1653 cmpd(CCR0, sub_klass, super_klass); 1654 beq(CCR0, L_success); 1655 1656 // Check the supertype display: 1657 lwz(check_cache_offset, sco_offset, super_klass); 1658 // The loaded value is the offset from KlassOopDesc. 1659 1660 ldx(cached_super, check_cache_offset, sub_klass); 1661 cmpd(CCR0, cached_super, super_klass); 1662 beq(CCR0, L_success); 1663 1664 // This check has worked decisively for primary supers. 1665 // Secondary supers are sought in the super_cache ('super_cache_addr'). 1666 // (Secondary supers are interfaces and very deeply nested subtypes.) 1667 // This works in the same check above because of a tricky aliasing 1668 // between the super_cache and the primary super display elements. 1669 // (The 'super_check_addr' can address either, as the case requires.) 1670 // Note that the cache is updated below if it does not help us find 1671 // what we need immediately. 1672 // So if it was a primary super, we can just fail immediately. 
1673   // Otherwise, it's the slow path for us (no success at this point).
1674
1675   cmpwi(CCR0, check_cache_offset, sc_offset);
1676   bne(CCR0, L_failure);
1677   // bind(slow_path); // fallthru
1678 }
1679
1680 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1681                                                    Register super_klass,
1682                                                    Register temp1_reg,
1683                                                    Register temp2_reg,
1684                                                    Label* L_success,
1685                                                    Register result_reg) {
1686   const Register array_ptr = temp1_reg; // current value from cache array
1687   const Register temp      = temp2_reg;
1688
1689   assert_different_registers(sub_klass, super_klass, array_ptr, temp);
1690
1691   int source_offset = in_bytes(Klass::secondary_supers_offset());
1692   int target_offset = in_bytes(Klass::secondary_super_cache_offset());
1693
1694   int length_offset = Array<Klass*>::length_offset_in_bytes();
1695   int base_offset   = Array<Klass*>::base_offset_in_bytes();
1696
1697   Label hit, loop, failure, fallthru;
1698
1699   ld(array_ptr, source_offset, sub_klass);
1700
1701   //assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
1702   lwz(temp, length_offset, array_ptr);
1703   cmpwi(CCR0, temp, 0);
1704   beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0
1705
1706   mtctr(temp); // load ctr
1707
1708   bind(loop);
1709   // Oops in table are no longer compressed.
1710   ld(temp, base_offset, array_ptr);
1711   cmpd(CCR0, temp, super_klass);
1712   beq(CCR0, hit);
1713   addi(array_ptr, array_ptr, BytesPerWord);
1714   bdnz(loop);
1715
1716   bind(failure);
1717   if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss)
1718   b(fallthru);
1719
1720   bind(hit);
1721   std(super_klass, target_offset, sub_klass); // save result to cache
1722   if (result_reg != noreg) li(result_reg, 0); // load zero result (indicates a hit)
1723   if (L_success != NULL) b(*L_success);
1724
1725   bind(fallthru);
1726 }
1727
1728 // Try fast path, then go to slow one if not successful.
1729 void MacroAssembler::check_klass_subtype(Register sub_klass,
1730                                          Register super_klass,
1731                                          Register temp1_reg,
1732                                          Register temp2_reg,
1733                                          Label& L_success) {
1734   Label L_failure;
1735   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, L_success, L_failure);
1736   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
1737   bind(L_failure); // Fallthru if not successful.
1738 }
1739
1740 void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg,
1741                                               Register temp_reg,
1742                                               Label& wrong_method_type) {
1743   assert_different_registers(mtype_reg, mh_reg, temp_reg);
1744   // Compare method type against that of the receiver.
1745   load_heap_oop_not_null(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg);
1746   cmpd(CCR0, temp_reg, mtype_reg);
1747   bne(CCR0, wrong_method_type);
1748 }
1749
1750 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
1751                                                    Register temp_reg,
1752                                                    int extra_slot_offset) {
1753   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
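  // Computes the byte offset of the given argument slot, roughly
  //   (arg_slot + extra_slot_offset) * Interpreter::stackElementSize,
  // returned as a constant if arg_slot is a constant, in temp_reg otherwise.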
1754 int stackElementSize = Interpreter::stackElementSize; 1755 int offset = extra_slot_offset * stackElementSize; 1756 if (arg_slot.is_constant()) { 1757 offset += arg_slot.as_constant() * stackElementSize; 1758 return offset; 1759 } else { 1760 assert(temp_reg != noreg, "must specify"); 1761 sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize)); 1762 if (offset != 0) 1763 addi(temp_reg, temp_reg, offset); 1764 return temp_reg; 1765 } 1766 } 1767 1768 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg, 1769 Register mark_reg, Register temp_reg, 1770 Register temp2_reg, Label& done, Label* slow_case) { 1771 assert(UseBiasedLocking, "why call this otherwise?"); 1772 1773 #ifdef ASSERT 1774 assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg); 1775 #endif 1776 1777 Label cas_label; 1778 1779 // Branch to done if fast path fails and no slow_case provided. 1780 Label *slow_case_int = (slow_case != NULL) ? slow_case : &done; 1781 1782 // Biased locking 1783 // See whether the lock is currently biased toward our thread and 1784 // whether the epoch is still valid 1785 // Note that the runtime guarantees sufficient alignment of JavaThread 1786 // pointers to allow age to be placed into low bits 1787 assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, 1788 "biased locking makes assumptions about bit layout"); 1789 1790 if (PrintBiasedLockingStatistics) { 1791 load_const(temp_reg, (address) BiasedLocking::total_entry_count_addr(), temp2_reg); 1792 lwz(temp2_reg, 0, temp_reg); 1793 addi(temp2_reg, temp2_reg, 1); 1794 stw(temp2_reg, 0, temp_reg); 1795 } 1796 1797 andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place); 1798 cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern); 1799 bne(cr_reg, cas_label); 1800 1801 load_klass(temp_reg, obj_reg); 1802 1803 load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place)); 1804 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 1805 orr(temp_reg, R16_thread, temp_reg); 1806 xorr(temp_reg, mark_reg, temp_reg); 1807 andr(temp_reg, temp_reg, temp2_reg); 1808 cmpdi(cr_reg, temp_reg, 0); 1809 if (PrintBiasedLockingStatistics) { 1810 Label l; 1811 bne(cr_reg, l); 1812 load_const(mark_reg, (address) BiasedLocking::biased_lock_entry_count_addr()); 1813 lwz(temp2_reg, 0, mark_reg); 1814 addi(temp2_reg, temp2_reg, 1); 1815 stw(temp2_reg, 0, mark_reg); 1816 // restore mark_reg 1817 ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg); 1818 bind(l); 1819 } 1820 beq(cr_reg, done); 1821 1822 Label try_revoke_bias; 1823 Label try_rebias; 1824 1825 // At this point we know that the header has the bias pattern and 1826 // that we are not the bias owner in the current epoch. We need to 1827 // figure out more details about the state of the header in order to 1828 // know what operations can be legally performed on the object's 1829 // header. 1830 1831 // If the low three bits in the xor result aren't clear, that means 1832 // the prototype header is no longer biased and we have to revoke 1833 // the bias on this object. 1834 andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 1835 cmpwi(cr_reg, temp2_reg, 0); 1836 bne(cr_reg, try_revoke_bias); 1837 1838 // Biasing is still enabled for this data type. See whether the 1839 // epoch of the current bias is still valid, meaning that the epoch 1840 // bits of the mark word are equal to the epoch bits of the 1841 // prototype header. 
(Note that the prototype header's epoch bits 1842 // only change at a safepoint.) If not, attempt to rebias the object 1843 // toward the current thread. Note that we must be absolutely sure 1844 // that the current epoch is invalid in order to do this because 1845 // otherwise the manipulations it performs on the mark word are 1846 // illegal. 1847 1848 int shift_amount = 64 - markOopDesc::epoch_shift; 1849 // rotate epoch bits to right (little) end and set other bits to 0 1850 // [ big part | epoch | little part ] -> [ 0..0 | epoch ] 1851 rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits); 1852 // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented 1853 bne(CCR0, try_rebias); 1854 1855 // The epoch of the current bias is still valid but we know nothing 1856 // about the owner; it might be set or it might be clear. Try to 1857 // acquire the bias of the object using an atomic operation. If this 1858 // fails we will go in to the runtime to revoke the object's bias. 1859 // Note that we first construct the presumed unbiased header so we 1860 // don't accidentally blow away another thread's valid bias. 1861 andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place | 1862 markOopDesc::age_mask_in_place | 1863 markOopDesc::epoch_mask_in_place)); 1864 orr(temp_reg, R16_thread, mark_reg); 1865 1866 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 1867 1868 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg). 1869 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 1870 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 1871 /*where=*/obj_reg, 1872 MacroAssembler::MemBarAcq, 1873 MacroAssembler::cmpxchgx_hint_acquire_lock(), 1874 noreg, slow_case_int); // bail out if failed 1875 1876 // If the biasing toward our thread failed, this means that 1877 // another thread succeeded in biasing it toward itself and we 1878 // need to revoke that bias. The revocation will occur in the 1879 // interpreter runtime in the slow case. 1880 if (PrintBiasedLockingStatistics) { 1881 load_const(temp_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp2_reg); 1882 lwz(temp2_reg, 0, temp_reg); 1883 addi(temp2_reg, temp2_reg, 1); 1884 stw(temp2_reg, 0, temp_reg); 1885 } 1886 b(done); 1887 1888 bind(try_rebias); 1889 // At this point we know the epoch has expired, meaning that the 1890 // current "bias owner", if any, is actually invalid. Under these 1891 // circumstances _only_, we are allowed to use the current header's 1892 // value as the comparison value when doing the cas to acquire the 1893 // bias in the current epoch. In other words, we allow transfer of 1894 // the bias from one thread to another directly in this situation. 1895 andi(temp_reg, mark_reg, markOopDesc::age_mask_in_place); 1896 orr(temp_reg, R16_thread, temp_reg); 1897 load_klass(temp2_reg, obj_reg); 1898 ld(temp2_reg, in_bytes(Klass::prototype_header_offset()), temp2_reg); 1899 orr(temp_reg, temp_reg, temp2_reg); 1900 1901 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 1902 1903 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg). 
1904 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 1905 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 1906 /*where=*/obj_reg, 1907 MacroAssembler::MemBarAcq, 1908 MacroAssembler::cmpxchgx_hint_acquire_lock(), 1909 noreg, slow_case_int); // bail out if failed 1910 1911 // If the biasing toward our thread failed, this means that 1912 // another thread succeeded in biasing it toward itself and we 1913 // need to revoke that bias. The revocation will occur in the 1914 // interpreter runtime in the slow case. 1915 if (PrintBiasedLockingStatistics) { 1916 load_const(temp_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp2_reg); 1917 lwz(temp2_reg, 0, temp_reg); 1918 addi(temp2_reg, temp2_reg, 1); 1919 stw(temp2_reg, 0, temp_reg); 1920 } 1921 b(done); 1922 1923 bind(try_revoke_bias); 1924 // The prototype mark in the klass doesn't have the bias bit set any 1925 // more, indicating that objects of this data type are not supposed 1926 // to be biased any more. We are going to try to reset the mark of 1927 // this object to the prototype value and fall through to the 1928 // CAS-based locking scheme. Note that if our CAS fails, it means 1929 // that another thread raced us for the privilege of revoking the 1930 // bias of this particular object, so it's okay to continue in the 1931 // normal locking code. 1932 load_klass(temp_reg, obj_reg); 1933 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 1934 andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place); 1935 orr(temp_reg, temp_reg, temp2_reg); 1936 1937 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 1938 1939 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg). 1940 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 1941 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 1942 /*where=*/obj_reg, 1943 MacroAssembler::MemBarAcq, 1944 MacroAssembler::cmpxchgx_hint_acquire_lock()); 1945 1946 // reload markOop in mark_reg before continuing with lightweight locking 1947 ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg); 1948 1949 // Fall through to the normal CAS-based lock, because no matter what 1950 // the result of the above CAS, some thread must have succeeded in 1951 // removing the bias bit from the object's header. 1952 if (PrintBiasedLockingStatistics) { 1953 Label l; 1954 bne(cr_reg, l); 1955 load_const(temp_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp2_reg); 1956 lwz(temp2_reg, 0, temp_reg); 1957 addi(temp2_reg, temp2_reg, 1); 1958 stw(temp2_reg, 0, temp_reg); 1959 bind(l); 1960 } 1961 1962 bind(cas_label); 1963 } 1964 1965 void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) { 1966 // Check for biased locking unlock case, which is a no-op 1967 // Note: we do not have to check the thread ID for two reasons. 1968 // First, the interpreter checks for IllegalMonitorStateException at 1969 // a higher level. Second, if the bias was revoked while we held the 1970 // lock, the object could not be rebiased toward another thread, so 1971 // the bias bit would be clear. 1972 1973 ld(temp_reg, 0, mark_addr); 1974 andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 1975 1976 cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern); 1977 beq(cr_reg, done); 1978 } 1979 1980 // TM on PPC64. 
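// atomic_inc_ptr below atomically increments the doubleword at addr by simm16
// using an ldarx/stdcx_ retry loop; result ends up holding the new value.
// Sketch of the semantics (pseudocode, not emitted instructions):
//   do { result = *addr + simm16; } while (!store_conditional(addr, result));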
1981 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) { 1982 Label retry; 1983 bind(retry); 1984 ldarx(result, addr, /*hint*/ false); 1985 addi(result, result, simm16); 1986 stdcx_(result, addr); 1987 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1988 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 1989 } else { 1990 bne( CCR0, retry); // stXcx_ sets CCR0 1991 } 1992 } 1993 1994 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) { 1995 Label retry; 1996 bind(retry); 1997 lwarx(result, addr, /*hint*/ false); 1998 ori(result, result, uimm16); 1999 stwcx_(result, addr); 2000 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2001 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2002 } else { 2003 bne( CCR0, retry); // stXcx_ sets CCR0 2004 } 2005 } 2006 2007 #if INCLUDE_RTM_OPT 2008 2009 // Update rtm_counters based on abort status 2010 // input: abort_status 2011 // rtm_counters (RTMLockingCounters*) 2012 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) { 2013 // Mapping to keep PreciseRTMLockingStatistics similar to x86. 2014 // x86 ppc (! means inverted, ? means not the same) 2015 // 0 31 Set if abort caused by XABORT instruction. 2016 // 1 ! 7 If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set. 2017 // 2 13 Set if another logical processor conflicted with a memory address that was part of the transaction that aborted. 2018 // 3 10 Set if an internal buffer overflowed. 2019 // 4 ?12 Set if a debug breakpoint was hit. 2020 // 5 ?32 Set if an abort occurred during execution of a nested transaction. 2021 const int tm_failure_bit[] = {Assembler::tm_tabort, // Note: Seems like signal handler sets this, too. 2022 Assembler::tm_failure_persistent, // inverted: transient 2023 Assembler::tm_trans_cf, 2024 Assembler::tm_footprint_of, 2025 Assembler::tm_non_trans_cf, 2026 Assembler::tm_suspended}; 2027 const bool tm_failure_inv[] = {false, true, false, false, false, false}; 2028 assert(sizeof(tm_failure_bit)/sizeof(int) == RTMLockingCounters::ABORT_STATUS_LIMIT, "adapt mapping!"); 2029 2030 const Register addr_Reg = R0; 2031 // Keep track of offset to where rtm_counters_Reg had pointed to. 
2032 int counters_offs = RTMLockingCounters::abort_count_offset(); 2033 addi(addr_Reg, rtm_counters_Reg, counters_offs); 2034 const Register temp_Reg = rtm_counters_Reg; 2035 2036 //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically 2037 ldx(temp_Reg, addr_Reg); 2038 addi(temp_Reg, temp_Reg, 1); 2039 stdx(temp_Reg, addr_Reg); 2040 2041 if (PrintPreciseRTMLockingStatistics) { 2042 int counters_offs_delta = RTMLockingCounters::abortX_count_offset() - counters_offs; 2043 2044 //mftexasr(abort_status); done by caller 2045 for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) { 2046 counters_offs += counters_offs_delta; 2047 li(temp_Reg, counters_offs_delta); // can't use addi with R0 2048 add(addr_Reg, addr_Reg, temp_Reg); // point to next counter 2049 counters_offs_delta = sizeof(uintx); 2050 2051 Label check_abort; 2052 rldicr_(temp_Reg, abort_status, tm_failure_bit[i], 0); 2053 if (tm_failure_inv[i]) { 2054 bne(CCR0, check_abort); 2055 } else { 2056 beq(CCR0, check_abort); 2057 } 2058 //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically 2059 ldx(temp_Reg, addr_Reg); 2060 addi(temp_Reg, temp_Reg, 1); 2061 stdx(temp_Reg, addr_Reg); 2062 bind(check_abort); 2063 } 2064 } 2065 li(temp_Reg, -counters_offs); // can't use addi with R0 2066 add(rtm_counters_Reg, addr_Reg, temp_Reg); // restore 2067 } 2068 2069 // Branch if (random & (count-1) != 0), count is 2^n 2070 // tmp and CR0 are killed 2071 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) { 2072 mftb(tmp); 2073 andi_(tmp, tmp, count-1); 2074 bne(CCR0, brLabel); 2075 } 2076 2077 // Perform abort ratio calculation, set no_rtm bit if high ratio. 2078 // input: rtm_counters_Reg (RTMLockingCounters* address) - KILLED 2079 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg, 2080 RTMLockingCounters* rtm_counters, 2081 Metadata* method_data) { 2082 Label L_done, L_check_always_rtm1, L_check_always_rtm2; 2083 2084 if (RTMLockingCalculationDelay > 0) { 2085 // Delay calculation. 2086 ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr()); 2087 cmpdi(CCR0, rtm_counters_Reg, 0); 2088 beq(CCR0, L_done); 2089 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2090 } 2091 // Abort ratio calculation only if abort_count > RTMAbortThreshold. 2092 // Aborted transactions = abort_count * 100 2093 // All transactions = total_count * RTMTotalCountIncrRate 2094 // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio) 2095 ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg); 2096 cmpdi(CCR0, R0, RTMAbortThreshold); 2097 blt(CCR0, L_check_always_rtm2); 2098 mulli(R0, R0, 100); 2099 2100 const Register tmpReg = rtm_counters_Reg; 2101 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2102 mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); 2103 mulli(tmpReg, tmpReg, RTMAbortRatio); 2104 cmpd(CCR0, R0, tmpReg); 2105 blt(CCR0, L_check_always_rtm1); // jump to reload 2106 if (method_data != NULL) { 2107 // Set rtm_state to "no rtm" in MDO. 2108 // Not using a metadata relocation. Method and Class Loader are kept alive anyway. 2109 // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.) 
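    // In effect (sketch): method_data->_rtm_state |= NoRTM. The update goes
    // through atomic_ori_int's lwarx/stwcx_ loop since other threads may
    // update the MDO concurrently.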
2110 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2111 atomic_ori_int(R0, tmpReg, NoRTM); 2112 } 2113 b(L_done); 2114 2115 bind(L_check_always_rtm1); 2116 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2117 bind(L_check_always_rtm2); 2118 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2119 cmpdi(CCR0, tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate); 2120 blt(CCR0, L_done); 2121 if (method_data != NULL) { 2122 // Set rtm_state to "always rtm" in MDO. 2123 // Not using a metadata relocation. See above. 2124 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2125 atomic_ori_int(R0, tmpReg, UseRTM); 2126 } 2127 bind(L_done); 2128 } 2129 2130 // Update counters and perform abort ratio calculation. 2131 // input: abort_status_Reg 2132 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg, 2133 RTMLockingCounters* rtm_counters, 2134 Metadata* method_data, 2135 bool profile_rtm) { 2136 2137 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2138 // Update rtm counters based on state at abort. 2139 // Reads abort_status_Reg, updates flags. 2140 assert_different_registers(abort_status_Reg, temp_Reg); 2141 load_const_optimized(temp_Reg, (address)rtm_counters, R0); 2142 rtm_counters_update(abort_status_Reg, temp_Reg); 2143 if (profile_rtm) { 2144 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2145 rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data); 2146 } 2147 } 2148 2149 // Retry on abort if abort's status indicates non-persistent failure. 2150 // inputs: retry_count_Reg 2151 // : abort_status_Reg 2152 // output: retry_count_Reg decremented by 1 2153 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, 2154 Label& retryLabel, Label* checkRetry) { 2155 Label doneRetry; 2156 rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0); 2157 bne(CCR0, doneRetry); 2158 if (checkRetry) { bind(*checkRetry); } 2159 addic_(retry_count_Reg, retry_count_Reg, -1); 2160 blt(CCR0, doneRetry); 2161 smt_yield(); // Can't use wait(). No permission (SIGILL). 2162 b(retryLabel); 2163 bind(doneRetry); 2164 } 2165 2166 // Spin and retry if lock is busy. 2167 // inputs: box_Reg (monitor address) 2168 // : retry_count_Reg 2169 // output: retry_count_Reg decremented by 1 2170 // CTR is killed 2171 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) { 2172 Label SpinLoop, doneRetry; 2173 addic_(retry_count_Reg, retry_count_Reg, -1); 2174 blt(CCR0, doneRetry); 2175 li(R0, RTMSpinLoopCount); 2176 mtctr(R0); 2177 2178 bind(SpinLoop); 2179 smt_yield(); // Can't use waitrsv(). No permission (SIGILL). 2180 bdz(retryLabel); 2181 ld(R0, 0, owner_addr_Reg); 2182 cmpdi(CCR0, R0, 0); 2183 bne(CCR0, SpinLoop); 2184 b(retryLabel); 2185 2186 bind(doneRetry); 2187 } 2188 2189 // Use RTM for normal stack locks. 
2190 // Input: objReg (object to lock) 2191 void MacroAssembler::rtm_stack_locking(ConditionRegister flag, 2192 Register obj, Register mark_word, Register tmp, 2193 Register retry_on_abort_count_Reg, 2194 RTMLockingCounters* stack_rtm_counters, 2195 Metadata* method_data, bool profile_rtm, 2196 Label& DONE_LABEL, Label& IsInflated) { 2197 assert(UseRTMForStackLocks, "why call this otherwise?"); 2198 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); 2199 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2200 2201 if (RTMRetryCount > 0) { 2202 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 2203 bind(L_rtm_retry); 2204 } 2205 andi_(R0, mark_word, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased 2206 bne(CCR0, IsInflated); 2207 2208 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2209 Label L_noincrement; 2210 if (RTMTotalCountIncrRate > 1) { 2211 branch_on_random_using_tb(tmp, (int)RTMTotalCountIncrRate, L_noincrement); 2212 } 2213 assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM"); 2214 load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0); 2215 //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically 2216 ldx(mark_word, tmp); 2217 addi(mark_word, mark_word, 1); 2218 stdx(mark_word, tmp); 2219 bind(L_noincrement); 2220 } 2221 tbegin_(); 2222 beq(CCR0, L_on_abort); 2223 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // Reload in transaction, conflicts need to be tracked. 2224 andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits 2225 cmpwi(flag, R0, markOopDesc::unlocked_value); // bits = 001 unlocked 2226 beq(flag, DONE_LABEL); // all done if unlocked 2227 2228 if (UseRTMXendForLockBusy) { 2229 tend_(); 2230 b(L_decrement_retry); 2231 } else { 2232 tabort_(); 2233 } 2234 bind(L_on_abort); 2235 const Register abort_status_Reg = tmp; 2236 mftexasr(abort_status_Reg); 2237 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2238 rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm); 2239 } 2240 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload 2241 if (RTMRetryCount > 0) { 2242 // Retry on lock abort if abort status is not permanent. 2243 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry); 2244 } else { 2245 bind(L_decrement_retry); 2246 } 2247 } 2248 2249 // Use RTM for inflating locks 2250 // inputs: obj (object to lock) 2251 // mark_word (current header - KILLED) 2252 // boxReg (on-stack box address (displaced header location) - KILLED) 2253 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag, 2254 Register obj, Register mark_word, Register boxReg, 2255 Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg, 2256 RTMLockingCounters* rtm_counters, 2257 Metadata* method_data, bool profile_rtm, 2258 Label& DONE_LABEL) { 2259 assert(UseRTMLocking, "why call this otherwise?"); 2260 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2261 // Clean monitor_value bit to get valid pointer. 2262 int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value; 2263 2264 // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark(). 
2265 std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg); 2266 const Register tmpReg = boxReg; 2267 const Register owner_addr_Reg = mark_word; 2268 addi(owner_addr_Reg, mark_word, owner_offset); 2269 2270 if (RTMRetryCount > 0) { 2271 load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy. 2272 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort. 2273 bind(L_rtm_retry); 2274 } 2275 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2276 Label L_noincrement; 2277 if (RTMTotalCountIncrRate > 1) { 2278 branch_on_random_using_tb(R0, (int)RTMTotalCountIncrRate, L_noincrement); 2279 } 2280 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2281 load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg); 2282 //atomic_inc_ptr(R0, tmpReg); We don't increment atomically 2283 ldx(tmpReg, R0); 2284 addi(tmpReg, tmpReg, 1); 2285 stdx(tmpReg, R0); 2286 bind(L_noincrement); 2287 } 2288 tbegin_(); 2289 beq(CCR0, L_on_abort); 2290 // We don't reload mark word. Will only be reset at safepoint. 2291 ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked. 2292 cmpdi(flag, R0, 0); 2293 beq(flag, DONE_LABEL); 2294 2295 if (UseRTMXendForLockBusy) { 2296 tend_(); 2297 b(L_decrement_retry); 2298 } else { 2299 tabort_(); 2300 } 2301 bind(L_on_abort); 2302 const Register abort_status_Reg = tmpReg; 2303 mftexasr(abort_status_Reg); 2304 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2305 rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm); 2306 // Restore owner_addr_Reg 2307 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); 2308 #ifdef ASSERT 2309 andi_(R0, mark_word, markOopDesc::monitor_value); 2310 asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint. 2311 #endif 2312 addi(owner_addr_Reg, mark_word, owner_offset); 2313 } 2314 if (RTMRetryCount > 0) { 2315 // Retry on lock abort if abort status is not permanent. 2316 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); 2317 } 2318 2319 // Appears unlocked - try to swing _owner from null to non-null. 2320 cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg, 2321 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2322 MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true); 2323 2324 if (RTMRetryCount > 0) { 2325 // success done else retry 2326 b(DONE_LABEL); 2327 bind(L_decrement_retry); 2328 // Spin and retry if lock is busy. 2329 rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry); 2330 } else { 2331 bind(L_decrement_retry); 2332 } 2333 } 2334 2335 #endif // INCLUDE_RTM_OPT 2336 2337 // "The box" is the space on the stack where we copy the object mark. 2338 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box, 2339 Register temp, Register displaced_header, Register current_header, 2340 bool try_bias, 2341 RTMLockingCounters* rtm_counters, 2342 RTMLockingCounters* stack_rtm_counters, 2343 Metadata* method_data, 2344 bool use_rtm, bool profile_rtm) { 2345 assert_different_registers(oop, box, temp, displaced_header, current_header); 2346 assert(flag != CCR0, "bad condition register"); 2347 Label cont; 2348 Label object_has_monitor; 2349 Label cas_failed; 2350 2351 // Load markOop from object into displaced_header. 
2352   ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2353
2354
2355   // Always do locking in runtime.
2356   if (EmitSync & 0x01) {
2357     cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2358     return;
2359   }
2360
2361   if (try_bias) {
2362     biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);
2363   }
2364
2365 #if INCLUDE_RTM_OPT
2366   if (UseRTMForStackLocks && use_rtm) {
2367     rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
2368                       stack_rtm_counters, method_data, profile_rtm,
2369                       cont, object_has_monitor);
2370   }
2371 #endif // INCLUDE_RTM_OPT
2372
2373   // Handle existing monitor.
2374   if ((EmitSync & 0x02) == 0) {
2375     // The object has an existing monitor iff (mark & monitor_value) != 0.
2376     andi_(temp, displaced_header, markOopDesc::monitor_value);
2377     bne(CCR0, object_has_monitor);
2378   }
2379
2380   // Set displaced_header to be (markOop of object | UNLOCK_VALUE).
2381   ori(displaced_header, displaced_header, markOopDesc::unlocked_value);
2382
2383   // Load Compare Value application register.
2384
2385   // Initialize the box. (Must happen before we update the object mark!)
2386   std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2387
2388   // Must fence, otherwise, preceding store(s) may float below cmpxchg.
2389   // Compare object markOop with mark and if equal exchange scratch1 with object markOop.
2390   // CmpxchgX sets cr_reg to cmpX(current, displaced).
2391   membar(Assembler::StoreStore);
2392   cmpxchgd(/*flag=*/flag,
2393            /*current_value=*/current_header,
2394            /*compare_value=*/displaced_header,
2395            /*exchange_value=*/box,
2396            /*where=*/oop,
2397            MacroAssembler::MemBarAcq,
2398            MacroAssembler::cmpxchgx_hint_acquire_lock(),
2399            noreg,
2400            &cas_failed);
2401   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2402
2403   // If the compare-and-exchange succeeded, then we found an unlocked
2404   // object and we have now locked it.
2405   b(cont);
2406
2407   bind(cas_failed);
2408   // We did not see an unlocked object so try the fast recursive case.
2409
2410   // Check if the owner is self by comparing the value in the markOop of object
2411   // (current_header) with the stack pointer.
2412   sub(current_header, current_header, R1_SP);
2413   load_const_optimized(temp, (address) (~(os::vm_page_size()-1) |
2414                                         markOopDesc::lock_mask_in_place));
2415
2416   and_(R0/*==0?*/, current_header, temp);
2417   // If the condition is true we are done (flag is EQ, the cont/success case) and hence
2418   // we can store 0 as the displaced header in the box, which indicates that it is a recursive lock.
2419   mcrf(flag,CCR0);
2420   std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2421
2422   // Handle existing monitor.
2423   if ((EmitSync & 0x02) == 0) {
2424     b(cont);
2425
2426     bind(object_has_monitor);
2427     // The object's monitor m is unlocked iff m->owner == NULL,
2428     // otherwise m->owner may contain a thread or a stack address.
2429
2430 #if INCLUDE_RTM_OPT
2431     // Use the same RTM locking code in 32- and 64-bit VM.
2432     if (use_rtm) {
2433       rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
2434                            rtm_counters, method_data, profile_rtm, cont);
2435     } else {
2436 #endif // INCLUDE_RTM_OPT
2437
2438     // Try to CAS m->owner from NULL to current thread.
2439     addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value);
2440     li(displaced_header, 0);
2441     // CmpxchgX sets flag to cmpX(current, displaced).
2442     cmpxchgd(/*flag=*/flag,
2443              /*current_value=*/current_header,
2444              /*compare_value=*/(intptr_t)0,
2445              /*exchange_value=*/R16_thread,
2446              /*where=*/temp,
2447              MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2448              MacroAssembler::cmpxchgx_hint_acquire_lock());
2449
2450     // Store a non-null value into the box.
2451     std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2452
2453 #   ifdef ASSERT
2454     bne(flag, cont);
2455     // We have acquired the monitor, check some invariants.
2456     addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
2457     // Invariant 1: _recursions should be 0.
2458     //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
2459     asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
2460                             "monitor->_recursions should be 0", -1);
2461     // Invariant 2: OwnerIsThread shouldn't be 0.
2462     //assert(ObjectMonitor::OwnerIsThread_size_in_bytes() == 4, "unexpected size");
2463     //asm_assert_mem4_isnot_zero(ObjectMonitor::OwnerIsThread_offset_in_bytes(), temp,
2464     //                           "monitor->OwnerIsThread shouldn't be 0", -1);
2465 #   endif
2466
2467 #if INCLUDE_RTM_OPT
2468     } // use_rtm()
2469 #endif
2470   }
2471
2472   bind(cont);
2473   // flag == EQ indicates success
2474   // flag == NE indicates failure
2475 }
2476
2477 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2478                                                  Register temp, Register displaced_header, Register current_header,
2479                                                  bool try_bias, bool use_rtm) {
2480   assert_different_registers(oop, box, temp, displaced_header, current_header);
2481   assert(flag != CCR0, "bad condition register");
2482   Label cont;
2483   Label object_has_monitor;
2484
2485   // Always do locking in runtime.
2486   if (EmitSync & 0x01) {
2487     cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2488     return;
2489   }
2490
2491   if (try_bias) {
2492     biased_locking_exit(flag, oop, current_header, cont);
2493   }
2494
2495 #if INCLUDE_RTM_OPT
2496   if (UseRTMForStackLocks && use_rtm) {
2497     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2498     Label L_regular_unlock;
2499     ld(current_header, oopDesc::mark_offset_in_bytes(), oop); // fetch markword
2500     andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2501     cmpwi(flag, R0, markOopDesc::unlocked_value); // bits = 001 unlocked
2502     bne(flag, L_regular_unlock); // else RegularLock
2503     tend_(); // otherwise end...
2504     b(cont); // ... and we're done
2505     bind(L_regular_unlock);
2506   }
2507 #endif
2508
2509   // Find the lock address and load the displaced header from the stack.
2510   ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2511
2512   // If the displaced header is 0, we have a recursive unlock.
2513   cmpdi(flag, displaced_header, 0);
2514   beq(flag, cont);
2515
2516   // Handle existing monitor.
2517   if ((EmitSync & 0x02) == 0) {
2518     // The object has an existing monitor iff (mark & monitor_value) != 0.
2519     RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
2520     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2521     andi_(R0, current_header, markOopDesc::monitor_value);
2522     bne(CCR0, object_has_monitor);
2523   }
2524
2525   // Check if it is still a lightweight lock; this is true if we see
2526   // the stack address of the basicLock in the markOop of the object.
2527   // Cmpxchg sets flag to cmpd(current_header, box).
2528 cmpxchgd(/*flag=*/flag, 2529 /*current_value=*/current_header, 2530 /*compare_value=*/box, 2531 /*exchange_value=*/displaced_header, 2532 /*where=*/oop, 2533 MacroAssembler::MemBarRel, 2534 MacroAssembler::cmpxchgx_hint_release_lock(), 2535 noreg, 2536 &cont); 2537 2538 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2539 2540 // Handle existing monitor. 2541 if ((EmitSync & 0x02) == 0) { 2542 b(cont); 2543 2544 bind(object_has_monitor); 2545 addi(current_header, current_header, -markOopDesc::monitor_value); // monitor 2546 ld(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); 2547 2548 // It's inflated. 2549 #if INCLUDE_RTM_OPT 2550 if (use_rtm) { 2551 Label L_regular_inflated_unlock; 2552 // Clean monitor_value bit to get valid pointer 2553 cmpdi(flag, temp, 0); 2554 bne(flag, L_regular_inflated_unlock); 2555 tend_(); 2556 b(cont); 2557 bind(L_regular_inflated_unlock); 2558 } 2559 #endif 2560 2561 ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header); 2562 xorr(temp, R16_thread, temp); // Will be 0 if we are the owner. 2563 orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions. 2564 cmpdi(flag, temp, 0); 2565 bne(flag, cont); 2566 2567 ld(temp, ObjectMonitor::EntryList_offset_in_bytes(), current_header); 2568 ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header); 2569 orr(temp, temp, displaced_header); // Will be 0 if both are 0. 2570 cmpdi(flag, temp, 0); 2571 bne(flag, cont); 2572 release(); 2573 std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); 2574 } 2575 2576 bind(cont); 2577 // flag == EQ indicates success 2578 // flag == NE indicates failure 2579 } 2580 2581 // Write serialization page so VM thread can do a pseudo remote membar. 2582 // We use the current thread pointer to calculate a thread specific 2583 // offset to write to within the page. This minimizes bus traffic 2584 // due to cache line collision. 2585 void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) { 2586 srdi(tmp2, thread, os::get_serialize_page_shift_count()); 2587 2588 int mask = os::vm_page_size() - sizeof(int); 2589 if (Assembler::is_simm(mask, 16)) { 2590 andi(tmp2, tmp2, mask); 2591 } else { 2592 lis(tmp1, (int)((signed short) (mask >> 16))); 2593 ori(tmp1, tmp1, mask & 0x0000ffff); 2594 andr(tmp2, tmp2, tmp1); 2595 } 2596 2597 load_const(tmp1, (long) os::get_memory_serialize_page()); 2598 release(); 2599 stwx(R0, tmp1, tmp2); 2600 } 2601 2602 2603 // GC barrier helper macros 2604 2605 // Write the card table byte if needed. 2606 void MacroAssembler::card_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp) { 2607 CardTableModRefBS* bs = 2608 barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set()); 2609 assert(bs->kind() == BarrierSet::CardTableForRS || 2610 bs->kind() == BarrierSet::CardTableExtension, "wrong barrier"); 2611 #ifdef ASSERT 2612 cmpdi(CCR0, Rnew_val, 0); 2613 asm_assert_ne("null oop not allowed", 0x321); 2614 #endif 2615 card_table_write(bs->byte_map_base, Rtmp, Rstore_addr); 2616 } 2617 2618 // Write the card table byte. 
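// In effect (sketch): byte_map_base[Robj >> card_shift] = 0 /* dirty */,
// preceded by a StoreStore barrier when running with CMS.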
2619 void MacroAssembler::card_table_write(jbyte* byte_map_base, Register Rtmp, Register Robj) {
2620   assert_different_registers(Robj, Rtmp, R0);
2621   load_const_optimized(Rtmp, (address)byte_map_base, R0);
2622   srdi(Robj, Robj, CardTableModRefBS::card_shift);
2623   li(R0, 0); // dirty
2624   if (UseConcMarkSweepGC) membar(Assembler::StoreStore);
2625   stbx(R0, Rtmp, Robj);
2626 }
2627
2628 #if INCLUDE_ALL_GCS
2629 // General G1 pre-barrier generator.
2630 // Goal: record the previous value if it is not null.
2631 void MacroAssembler::g1_write_barrier_pre(Register Robj, RegisterOrConstant offset, Register Rpre_val,
2632                                           Register Rtmp1, Register Rtmp2, bool needs_frame) {
2633   Label runtime, filtered;
2634
2635   // Is marking active?
2636   if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
2637     lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_active()), R16_thread);
2638   } else {
2639     guarantee(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
2640     lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_active()), R16_thread);
2641   }
2642   cmpdi(CCR0, Rtmp1, 0);
2643   beq(CCR0, filtered);
2644
2645   // Do we need to load the previous value?
2646   if (Robj != noreg) {
2647     // Load the previous value...
2648     if (UseCompressedOops) {
2649       lwz(Rpre_val, offset, Robj);
2650     } else {
2651       ld(Rpre_val, offset, Robj);
2652     }
2653     // Previous value has been loaded into Rpre_val.
2654   }
2655   assert(Rpre_val != noreg, "must have a real register");
2656
2657   // Is the previous value null?
2658   cmpdi(CCR0, Rpre_val, 0);
2659   beq(CCR0, filtered);
2660
2661   if (Robj != noreg && UseCompressedOops) {
2662     decode_heap_oop_not_null(Rpre_val);
2663   }
2664
2665   // OK, it's not filtered, so we'll need to call enqueue. Try to record
2666   // the previous value in the thread-local SATB buffer first; only call
2667   // into the runtime (which may require pushing a frame, see needs_frame)
2668   // if there is no room left in the buffer.
2669
2670   // Can we store original value in the thread's buffer?
2671   // Is index == 0?
2672   // (The index field is typed as size_t.)
2673   const Register Rbuffer = Rtmp1, Rindex = Rtmp2;
2674
2675   ld(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_index()), R16_thread);
2676   cmpdi(CCR0, Rindex, 0);
2677   beq(CCR0, runtime); // If index == 0, goto runtime.
2678   ld(Rbuffer, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_buf()), R16_thread);
2679
2680   addi(Rindex, Rindex, -wordSize); // Decrement index.
2681   std(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_index()), R16_thread);
2682
2683   // Record the previous value.
2684   stdx(Rpre_val, Rbuffer, Rindex);
2685   b(filtered);
2686
2687   bind(runtime);
2688
2689   // The VM call needs a frame to access (write) registers.
2690   if (needs_frame) {
2691     save_LR_CR(Rtmp1);
2692     push_frame_reg_args(0, Rtmp2);
2693   }
2694
2695   if (Rpre_val->is_volatile() && Robj == noreg) mr(R31, Rpre_val); // Save pre_val across C call if it was preloaded.
2696   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), Rpre_val, R16_thread);
2697   if (Rpre_val->is_volatile() && Robj == noreg) mr(Rpre_val, R31); // restore
2698
2699   if (needs_frame) {
2700     pop_frame();
2701     restore_LR_CR(Rtmp1);
2702   }
2703
2704   bind(filtered);
2705 }
2706
2707 // General G1 post-barrier generator.
2708 // Store cross-region card.
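// Rough shape of the emitted barrier (pseudocode sketch):
//   if (region(Rstore_addr) == region(Rnew_val)) goto filtered;  // G1RSBarrierRegionFilter
//   card = &byte_map_base[Rstore_addr >> card_shift];
//   if (*card == g1_young_card_val()) goto filtered;
//   StoreLoad barrier;
//   if (*card == dirty_card_val()) goto filtered;
//   *card = dirty_card_val();
//   enqueue card in the thread's dirty card queue, calling the runtime
//   (g1_wb_post) if the queue is full.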
2709 void MacroAssembler::g1_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp1, Register Rtmp2, Register Rtmp3, Label *filtered_ext) { 2710 Label runtime, filtered_int; 2711 Label& filtered = (filtered_ext != NULL) ? *filtered_ext : filtered_int; 2712 assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2); 2713 2714 G1SATBCardTableLoggingModRefBS* bs = 2715 barrier_set_cast<G1SATBCardTableLoggingModRefBS>(Universe::heap()->barrier_set()); 2716 2717 // Does store cross heap regions? 2718 if (G1RSBarrierRegionFilter) { 2719 xorr(Rtmp1, Rstore_addr, Rnew_val); 2720 srdi_(Rtmp1, Rtmp1, HeapRegion::LogOfHRGrainBytes); 2721 beq(CCR0, filtered); 2722 } 2723 2724 // Crosses regions, storing NULL? 2725 #ifdef ASSERT 2726 cmpdi(CCR0, Rnew_val, 0); 2727 asm_assert_ne("null oop not allowed (G1)", 0x322); // Checked by caller on PPC64, so following branch is obsolete: 2728 //beq(CCR0, filtered); 2729 #endif 2730 2731 // Storing region crossing non-NULL, is card already dirty? 2732 assert(sizeof(*bs->byte_map_base) == sizeof(jbyte), "adjust this code"); 2733 const Register Rcard_addr = Rtmp1; 2734 Register Rbase = Rtmp2; 2735 load_const_optimized(Rbase, (address)bs->byte_map_base, /*temp*/ Rtmp3); 2736 2737 srdi(Rcard_addr, Rstore_addr, CardTableModRefBS::card_shift); 2738 2739 // Get the address of the card. 2740 lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr); 2741 cmpwi(CCR0, Rtmp3, (int)G1SATBCardTableModRefBS::g1_young_card_val()); 2742 beq(CCR0, filtered); 2743 2744 membar(Assembler::StoreLoad); 2745 lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr); // Reload after membar. 2746 cmpwi(CCR0, Rtmp3 /* card value */, CardTableModRefBS::dirty_card_val()); 2747 beq(CCR0, filtered); 2748 2749 // Storing a region crossing, non-NULL oop, card is clean. 2750 // Dirty card and log. 2751 li(Rtmp3, CardTableModRefBS::dirty_card_val()); 2752 //release(); // G1: oops are allowed to get visible after dirty marking. 2753 stbx(Rtmp3, Rbase, Rcard_addr); 2754 2755 add(Rcard_addr, Rbase, Rcard_addr); // This is the address which needs to get enqueued. 2756 Rbase = noreg; // end of lifetime 2757 2758 const Register Rqueue_index = Rtmp2, 2759 Rqueue_buf = Rtmp3; 2760 ld(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + PtrQueue::byte_offset_of_index()), R16_thread); 2761 cmpdi(CCR0, Rqueue_index, 0); 2762 beq(CCR0, runtime); // index == 0 then jump to runtime 2763 ld(Rqueue_buf, in_bytes(JavaThread::dirty_card_queue_offset() + PtrQueue::byte_offset_of_buf()), R16_thread); 2764 2765 addi(Rqueue_index, Rqueue_index, -wordSize); // decrement index 2766 std(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + PtrQueue::byte_offset_of_index()), R16_thread); 2767 2768 stdx(Rcard_addr, Rqueue_buf, Rqueue_index); // store card 2769 b(filtered); 2770 2771 bind(runtime); 2772 2773 // Save the live input values. 2774 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), Rcard_addr, R16_thread); 2775 2776 bind(filtered_int); 2777 } 2778 #endif // INCLUDE_ALL_GCS 2779 2780 // Values for last_Java_pc, and last_Java_sp must comply to the rules 2781 // in frame_ppc.hpp. 2782 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) { 2783 // Always set last_Java_pc and flags first because once last_Java_sp 2784 // is visible has_last_Java_frame is true and users will look at the 2785 // rest of the fields. (Note: flags should always be zero before we 2786 // get here so doesn't need to be set.) 
2787 2788 // Verify that last_Java_pc was zeroed on return to Java 2789 asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread, 2790 "last_Java_pc not zeroed before leaving Java", 0x200); 2791 2792 // When returning from calling out from Java mode the frame anchor's 2793 // last_Java_pc will always be set to NULL. It is set here so that 2794 // if we are doing a call to native (not VM) that we capture the 2795 // known pc and don't have to rely on the native call having a 2796 // standard frame linkage where we can find the pc. 2797 if (last_Java_pc != noreg) 2798 std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 2799 2800 // Set last_Java_sp last. 2801 std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 2802 } 2803 2804 void MacroAssembler::reset_last_Java_frame(void) { 2805 asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()), 2806 R16_thread, "SP was not set, still zero", 0x202); 2807 2808 BLOCK_COMMENT("reset_last_Java_frame {"); 2809 li(R0, 0); 2810 2811 // _last_Java_sp = 0 2812 std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 2813 2814 // _last_Java_pc = 0 2815 std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 2816 BLOCK_COMMENT("} reset_last_Java_frame"); 2817 } 2818 2819 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) { 2820 assert_different_registers(sp, tmp1); 2821 2822 // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via 2823 // TOP_IJAVA_FRAME_ABI. 2824 // FIXME: assert that we really have a TOP_IJAVA_FRAME here! 2825 #ifdef CC_INTERP 2826 ld(tmp1/*pc*/, _top_ijava_frame_abi(frame_manager_lr), sp); 2827 #else 2828 address entry = pc(); 2829 load_const_optimized(tmp1, entry); 2830 #endif 2831 2832 set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1); 2833 } 2834 2835 void MacroAssembler::get_vm_result(Register oop_result) { 2836 // Read: 2837 // R16_thread 2838 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 2839 // 2840 // Updated: 2841 // oop_result 2842 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 2843 2844 verify_thread(); 2845 2846 ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread); 2847 li(R0, 0); 2848 std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread); 2849 2850 verify_oop(oop_result); 2851 } 2852 2853 void MacroAssembler::get_vm_result_2(Register metadata_result) { 2854 // Read: 2855 // R16_thread 2856 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 2857 // 2858 // Updated: 2859 // metadata_result 2860 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 2861 2862 ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 2863 li(R0, 0); 2864 std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 2865 } 2866 2867 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) { 2868 Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided. 2869 if (Universe::narrow_klass_base() != 0) { 2870 // Use dst as temp if it is free. 
2871 sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0); 2872 current = dst; 2873 } 2874 if (Universe::narrow_klass_shift() != 0) { 2875 srdi(dst, current, Universe::narrow_klass_shift()); 2876 current = dst; 2877 } 2878 return current; 2879 } 2880 2881 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) { 2882 if (UseCompressedClassPointers) { 2883 Register compressedKlass = encode_klass_not_null(ck, klass); 2884 stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop); 2885 } else { 2886 std(klass, oopDesc::klass_offset_in_bytes(), dst_oop); 2887 } 2888 } 2889 2890 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) { 2891 if (UseCompressedClassPointers) { 2892 if (val == noreg) { 2893 val = R0; 2894 li(val, 0); 2895 } 2896 stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed 2897 } 2898 } 2899 2900 int MacroAssembler::instr_size_for_decode_klass_not_null() { 2901 if (!UseCompressedClassPointers) return 0; 2902 int num_instrs = 1; // shift or move 2903 if (Universe::narrow_klass_base() != 0) num_instrs = 7; // shift + load const + add 2904 return num_instrs * BytesPerInstWord; 2905 } 2906 2907 void MacroAssembler::decode_klass_not_null(Register dst, Register src) { 2908 assert(dst != R0, "Dst reg may not be R0, as R0 is used here."); 2909 if (src == noreg) src = dst; 2910 Register shifted_src = src; 2911 if (Universe::narrow_klass_shift() != 0 || 2912 Universe::narrow_klass_base() == 0 && src != dst) { // Move required. 2913 shifted_src = dst; 2914 sldi(shifted_src, src, Universe::narrow_klass_shift()); 2915 } 2916 if (Universe::narrow_klass_base() != 0) { 2917 add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0); 2918 } 2919 } 2920 2921 void MacroAssembler::load_klass(Register dst, Register src) { 2922 if (UseCompressedClassPointers) { 2923 lwz(dst, oopDesc::klass_offset_in_bytes(), src); 2924 // Attention: no null check here! 2925 decode_klass_not_null(dst, dst); 2926 } else { 2927 ld(dst, oopDesc::klass_offset_in_bytes(), src); 2928 } 2929 } 2930 2931 void MacroAssembler::load_klass_with_trap_null_check(Register dst, Register src) { 2932 if (!os::zero_page_read_protected()) { 2933 if (TrapBasedNullChecks) { 2934 trap_null_check(src); 2935 } 2936 } 2937 load_klass(dst, src); 2938 } 2939 2940 void MacroAssembler::reinit_heapbase(Register d, Register tmp) { 2941 if (Universe::heap() != NULL) { 2942 load_const_optimized(R30, Universe::narrow_ptrs_base(), tmp); 2943 } else { 2944 // Heap not yet allocated. Load indirectly. 2945 int simm16_offset = load_const_optimized(R30, Universe::narrow_ptrs_base_addr(), tmp, true); 2946 ld(R30, simm16_offset, R30); 2947 } 2948 } 2949 2950 // Clear Array 2951 // Kills both input registers. tmp == R0 is allowed. 2952 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp) { 2953 // Procedure for large arrays (uses data cache block zero instruction). 2954 Label startloop, fast, fastloop, small_rest, restloop, done; 2955 const int cl_size = VM_Version::get_cache_line_size(), 2956 cl_dwords = cl_size>>3, 2957 cl_dw_addr_bits = exact_log2(cl_dwords), 2958 dcbz_min = 1; // Min count of dcbz executions, needs to be >0. 2959 2960 //2: 2961 cmpdi(CCR1, cnt_dwords, ((dcbz_min+1)<<cl_dw_addr_bits)-1); // Big enough? (ensure >=dcbz_min lines included). 2962 blt(CCR1, small_rest); // Too small. 2963 rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line. 
2964   beq(CCR0, fast);                                  // Already 128byte aligned.
2965
2966   subfic(tmp, tmp, cl_dwords);
2967   mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
2968   subf(cnt_dwords, tmp, cnt_dwords); // rest.
2969   li(tmp, 0);
2970   //10:
2971   bind(startloop);                   // Clear at the beginning to reach 128byte boundary.
2972   std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
2973   addi(base_ptr, base_ptr, 8);
2974   bdnz(startloop);
2975   //13:
2976   bind(fast);                                  // Clear 128byte blocks.
2977   srdi(tmp, cnt_dwords, cl_dw_addr_bits);      // Loop count for 128byte loop (>0).
2978   andi(cnt_dwords, cnt_dwords, cl_dwords-1);   // Rest in dwords.
2979   mtctr(tmp);                                  // Load counter.
2980   //16:
2981   bind(fastloop);
2982   dcbz(base_ptr);                    // Clear 128byte aligned block.
2983   addi(base_ptr, base_ptr, cl_size);
2984   bdnz(fastloop);
2985   if (InsertEndGroupPPC64) { endgroup(); } else { nop(); }
2986   //20:
2987   bind(small_rest);
2988   cmpdi(CCR0, cnt_dwords, 0);        // size 0?
2989   beq(CCR0, done);                   // rest == 0
2990   li(tmp, 0);
2991   mtctr(cnt_dwords);                 // Load counter.
2992   //24:
2993   bind(restloop);                    // Clear rest.
2994   std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
2995   addi(base_ptr, base_ptr, 8);
2996   bdnz(restloop);
2997   //27:
2998   bind(done);
2999 }
3000
3001 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3002
3003 // Search for a single jchar in a jchar[].
3004 //
3005 // Assumes that result differs from all other registers.
3006 //
3007 // Haystack, needle are the addresses of jchar-arrays.
3008 // NeedleChar is needle[0] if it is known at compile time.
3009 // Haycnt is the length of the haystack. We assume haycnt >=1.
3010 //
3011 // Preserves haystack, haycnt, kills all other registers.
3012 //
3013 // If needle == R0, we search for the constant needleChar.
3014 void MacroAssembler::string_indexof_1(Register result, Register haystack, Register haycnt,
3015                                       Register needle, jchar needleChar,
3016                                       Register tmp1, Register tmp2) {
3017
3018   assert_different_registers(result, haystack, haycnt, needle, tmp1, tmp2);
3019
3020   Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_Found3, L_NotFound, L_End;
3021   Register needle0 = needle, // Contains needle[0].
3022            addr = tmp1,
3023            ch1 = tmp2,
3024            ch2 = R0;
3025
3026   //2 (variable) or 3 (const):
3027   if (needle != R0) lhz(needle0, 0, needle); // Preload needle character, needle has len==1.
3028   dcbtct(haystack, 0x00);                    // Indicate R/O access to haystack.
3029
3030   srwi_(tmp2, haycnt, 1); // Shift right by exact_log2(UNROLL_FACTOR).
3031   mr(addr, haystack);
3032   beq(CCR0, L_FinalCheck);
3033   mtctr(tmp2);            // Move to count register.
3034   //8:
3035   bind(L_InnerLoop);      // Main work horse (2x unrolled search loop).
3036   lhz(ch1, 0, addr);      // Load characters from haystack.
3037   lhz(ch2, 2, addr);
3038   (needle != R0) ? cmpw(CCR0, ch1, needle0) : cmplwi(CCR0, ch1, needleChar);
3039   (needle != R0) ? cmpw(CCR1, ch2, needle0) : cmplwi(CCR1, ch2, needleChar);
3040   beq(CCR0, L_Found1);    // Did we find the needle?
3041   beq(CCR1, L_Found2);
3042   addi(addr, addr, 4);
3043   bdnz(L_InnerLoop);
3044   //16:
3045   bind(L_FinalCheck);
3046   andi_(R0, haycnt, 1);
3047   beq(CCR0, L_NotFound);
3048   lhz(ch1, 0, addr);      // One position left at which we have to compare.
3049   (needle != R0) ? cmpw(CCR1, ch1, needle0) : cmplwi(CCR1, ch1, needleChar);
3050   beq(CCR1, L_Found3);
3051   //21:
3052   bind(L_NotFound);
3053   li(result, -1);         // Not found.
3054 b(L_End);
3055
3056 bind(L_Found2);
3057 addi(addr, addr, 2);
3058 //24:
3059 bind(L_Found1);
3060 bind(L_Found3); // Return index ...
3061 subf(addr, haystack, addr); // relative to haystack,
3062 srdi(result, addr, 1); // in characters.
3063 bind(L_End);
3064 }
3065
3066
3067 // Implementation of IndexOf for jchar arrays.
3068 //
3069 // The lengths of haystack and needle are not constant, i.e. they are passed in registers.
3070 //
3071 // Preserves registers haystack, needle.
3072 // Kills registers haycnt, needlecnt.
3073 // Assumes that result differs from all other registers.
3074 // Haystack, needle are the addresses of jchar-arrays.
3075 // Haycnt, needlecnt are the lengths of them, respectively.
3076 //
3077 // Needlecntval must be zero or a 15-bit unsigned immediate > 1.
3078 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
3079 Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
3080 Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
3081
3082 // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!
3083 Label L_TooShort, L_Found, L_NotFound, L_End;
3084 Register last_addr = haycnt, // Kill haycnt at the beginning.
3085 addr = tmp1,
3086 n_start = tmp2,
3087 ch1 = tmp3,
3088 ch2 = R0;
3089
3090 // **************************************************************************************************
3091 // Prepare for main loop: optimized for needle count >=2, bail out otherwise.
3092 // **************************************************************************************************
3093
3094 //1 (variable) or 3 (const):
3095 dcbtct(needle, 0x00); // Indicate R/O access to str1.
3096 dcbtct(haystack, 0x00); // Indicate R/O access to str2.
3097
3098 // Compute last haystack addr to use if no match gets found.
3099 if (needlecntval == 0) { // variable needlecnt
3100 //3:
3101 subf(ch1, needlecnt, haycnt); // Last character index to compare is haycnt-needlecnt.
3102 addi(addr, haystack, -2); // Accesses use pre-increment.
3103 cmpwi(CCR6, needlecnt, 2);
3104 blt(CCR6, L_TooShort); // Variable needlecnt: handle short needle separately.
3105 slwi(ch1, ch1, 1); // Scale to number of bytes.
3106 lwz(n_start, 0, needle); // Load first 2 characters of needle.
3107 add(last_addr, haystack, ch1); // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3108 addi(needlecnt, needlecnt, -2); // Rest of needle.
3109 } else { // constant needlecnt
3110 guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
3111 assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
3112 //5:
3113 addi(ch1, haycnt, -needlecntval); // Last character index to compare is haycnt-needlecnt.
3114 lwz(n_start, 0, needle); // Load first 2 characters of needle.
3115 addi(addr, haystack, -2); // Accesses use pre-increment.
3116 slwi(ch1, ch1, 1); // Scale to number of bytes.
3117 add(last_addr, haystack, ch1); // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3118 li(needlecnt, needlecntval-2); // Rest of needle.
3119 }
3120
3121 // Main Loop (now we have at least 3 characters).
3122 //11:
3123 Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2, L_Comp3;
3124 bind(L_OuterLoop); // Search for 1st 2 characters.
3125 Register addr_diff = tmp4;
3126 subf(addr_diff, addr, last_addr); // Difference between already checked address and last address to check.
3127 addi(addr, addr, 2); // This is the new address we want to use for comparing.
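// Below, ch2 = addr_diff>>2 is the iteration count for the 2x unrolled inner
// loop (each iteration consumes 2 jchars = 4 bytes); srdi_ sets CR0, so the
// beq can skip straight to the final single-position check.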
3128 srdi_(ch2, addr_diff, 2); 3129 beq(CCR0, L_FinalCheck); // 2 characters left? 3130 mtctr(ch2); // addr_diff/4 3131 //16: 3132 bind(L_InnerLoop); // Main work horse (2x unrolled search loop) 3133 lwz(ch1, 0, addr); // Load 2 characters of haystack (ignore alignment). 3134 lwz(ch2, 2, addr); 3135 cmpw(CCR0, ch1, n_start); // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop). 3136 cmpw(CCR1, ch2, n_start); 3137 beq(CCR0, L_Comp1); // Did we find the needle start? 3138 beq(CCR1, L_Comp2); 3139 addi(addr, addr, 4); 3140 bdnz(L_InnerLoop); 3141 //24: 3142 bind(L_FinalCheck); 3143 rldicl_(addr_diff, addr_diff, 64-1, 63); // Remaining characters not covered by InnerLoop: (addr_diff>>1)&1. 3144 beq(CCR0, L_NotFound); 3145 lwz(ch1, 0, addr); // One position left at which we have to compare. 3146 cmpw(CCR1, ch1, n_start); 3147 beq(CCR1, L_Comp3); 3148 //29: 3149 bind(L_NotFound); 3150 li(result, -1); // not found 3151 b(L_End); 3152 3153 3154 // ************************************************************************************************** 3155 // Special Case: unfortunately, the variable needle case can be called with needlecnt<2 3156 // ************************************************************************************************** 3157 //31: 3158 if ((needlecntval>>1) !=1 ) { // Const needlecnt is 2 or 3? Reduce code size. 3159 int nopcnt = 5; 3160 if (needlecntval !=0 ) ++nopcnt; // Balance alignment (other case: see below). 3161 if (needlecntval == 0) { // We have to handle these cases separately. 3162 Label L_OneCharLoop; 3163 bind(L_TooShort); 3164 mtctr(haycnt); 3165 lhz(n_start, 0, needle); // First character of needle 3166 bind(L_OneCharLoop); 3167 lhzu(ch1, 2, addr); 3168 cmpw(CCR1, ch1, n_start); 3169 beq(CCR1, L_Found); // Did we find the one character needle? 3170 bdnz(L_OneCharLoop); 3171 li(result, -1); // Not found. 3172 b(L_End); 3173 } // 8 instructions, so no impact on alignment. 3174 for (int x = 0; x < nopcnt; ++x) nop(); 3175 } 3176 3177 // ************************************************************************************************** 3178 // Regular Case Part II: compare rest of needle (first 2 characters have been compared already) 3179 // ************************************************************************************************** 3180 3181 // Compare the rest 3182 //36 if needlecntval==0, else 37: 3183 bind(L_Comp2); 3184 addi(addr, addr, 2); // First comparison has failed, 2nd one hit. 3185 bind(L_Comp1); // Addr points to possible needle start. 3186 bind(L_Comp3); // Could have created a copy and use a different return address but saving code size here. 3187 if (needlecntval != 2) { // Const needlecnt==2? 3188 if (needlecntval != 3) { 3189 if (needlecntval == 0) beq(CCR6, L_Found); // Variable needlecnt==2? 3190 Register ind_reg = tmp4; 3191 li(ind_reg, 2*2); // First 2 characters are already compared, use index 2. 3192 mtctr(needlecnt); // Decremented by 2, still > 0. 3193 //40: 3194 Label L_CompLoop; 3195 bind(L_CompLoop); 3196 lhzx(ch2, needle, ind_reg); 3197 lhzx(ch1, addr, ind_reg); 3198 cmpw(CCR1, ch1, ch2); 3199 bne(CCR1, L_OuterLoop); 3200 addi(ind_reg, ind_reg, 2); 3201 bdnz(L_CompLoop); 3202 } else { // No loop required if there's only one needle character left. 3203 lhz(ch2, 2*2, needle); 3204 lhz(ch1, 2*2, addr); 3205 cmpw(CCR1, ch1, ch2); 3206 bne(CCR1, L_OuterLoop); 3207 } 3208 } 3209 // Return index ... 3210 //46: 3211 bind(L_Found); 3212 subf(addr, haystack, addr); // relative to haystack, ... 
3213 srdi(result, addr, 1); // in characters.
3214 //48:
3215 bind(L_End);
3216 }
3217
3218 // Implementation of Compare for jchar arrays.
3219 //
3220 // Kills the registers str1, str2, cnt1, cnt2.
3221 // Kills cr0, ctr.
3222 // Assumes that result differs from the input registers.
3223 void MacroAssembler::string_compare(Register str1_reg, Register str2_reg, Register cnt1_reg, Register cnt2_reg,
3224 Register result_reg, Register tmp_reg) {
3225 assert_different_registers(result_reg, str1_reg, str2_reg, cnt1_reg, cnt2_reg, tmp_reg);
3226
3227 Label Ldone, Lslow_case, Lslow_loop, Lfast_loop;
3228 Register cnt_diff = R0,
3229 limit_reg = cnt1_reg,
3230 chr1_reg = result_reg,
3231 chr2_reg = cnt2_reg,
3232 addr_diff = str2_reg;
3233
3234 // Offset 0 should be 32 byte aligned.
3235 //-4:
3236 dcbtct(str1_reg, 0x00); // Indicate R/O access to str1.
3237 dcbtct(str2_reg, 0x00); // Indicate R/O access to str2.
3238 //-2:
3239 // Compute min(cnt1, cnt2) and check if 0 (bail out if we don't need to compare characters).
3240 subf(result_reg, cnt2_reg, cnt1_reg); // difference between cnt1/2
3241 subf_(addr_diff, str1_reg, str2_reg); // alias?
3242 beq(CCR0, Ldone); // return cnt difference if both point to the same string
3243 srawi(limit_reg, result_reg, 31); // generate signmask (cnt1/2 must be non-negative so cnt_diff can't overflow)
3244 mr(cnt_diff, result_reg);
3245 andr(limit_reg, result_reg, limit_reg); // difference or zero (negative): cnt1<cnt2 ? cnt1-cnt2 : 0
3246 add_(limit_reg, cnt2_reg, limit_reg); // min(cnt1, cnt2)==0?
3247 beq(CCR0, Ldone); // return cnt difference if one has 0 length
3248
3249 lhz(chr1_reg, 0, str1_reg); // optional: early out if first characters mismatch
3250 lhzx(chr2_reg, str1_reg, addr_diff); // optional: early out if first characters mismatch
3251 addi(tmp_reg, limit_reg, -1); // min(cnt1, cnt2)-1
3252 subf_(result_reg, chr2_reg, chr1_reg); // optional: early out if first characters mismatch
3253 bne(CCR0, Ldone); // optional: early out if first characters mismatch
3254
3255 // Set loop counter by scaling down tmp_reg
3256 srawi_(chr2_reg, tmp_reg, exact_log2(4)); // (min(cnt1, cnt2)-1)/4
3257 ble(CCR0, Lslow_case); // need >4 characters for fast loop
3258 andi(limit_reg, tmp_reg, 4-1); // remaining characters
3259
3260 // Adapt str1_reg str2_reg for the first loop iteration
3261 mtctr(chr2_reg); // (min(cnt1, cnt2)-1)/4
3262 addi(limit_reg, limit_reg, 4+1); // compare last 5-8 characters in slow_case if mismatch found in fast_loop
3263 //16:
3264 // Compare the rest of the characters
3265 bind(Lfast_loop);
3266 ld(chr1_reg, 0, str1_reg);
3267 ldx(chr2_reg, str1_reg, addr_diff);
3268 cmpd(CCR0, chr2_reg, chr1_reg);
3269 bne(CCR0, Lslow_case); // return chr1_reg
3270 addi(str1_reg, str1_reg, 4*2);
3271 bdnz(Lfast_loop);
3272 addi(limit_reg, limit_reg, -4); // no mismatch found in fast_loop, only 1-4 characters missing
3273 //23:
3274 bind(Lslow_case);
3275 mtctr(limit_reg);
3276 //24:
3277 bind(Lslow_loop);
3278 lhz(chr1_reg, 0, str1_reg);
3279 lhzx(chr2_reg, str1_reg, addr_diff);
3280 subf_(result_reg, chr2_reg, chr1_reg);
3281 bne(CCR0, Ldone); // return chr1_reg
3282 addi(str1_reg, str1_reg, 1*2);
3283 bdnz(Lslow_loop);
3284 //30:
3285 // If strings are equal up to min length, return the length difference.
3286 mr(result_reg, cnt_diff);
3287 nop(); // alignment
3288 //32:
3289 // Otherwise, return the difference between the first mismatched chars.
3290 bind(Ldone);
3291 }
3292
3293
3294 // Compare char[] arrays.
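// Roughly what the emitted code computes, in Java-like terms (a sketch;
// cnt is the common length in jchars):
//   for (int i = 0; i < cnt; i++) if (str1[i] != str2[i]) return false;
//   return true;
// The main loop (Lloop) compares 4 jchars (one dword) per iteration; the
// remaining 0..3 jchars are handled in a halfword loop (Lcbc).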
3295 // 3296 // str1_reg USE only 3297 // str2_reg USE only 3298 // cnt_reg USE_DEF, due to tmp reg shortage 3299 // result_reg DEF only, might compromise USE only registers 3300 void MacroAssembler::char_arrays_equals(Register str1_reg, Register str2_reg, Register cnt_reg, Register result_reg, 3301 Register tmp1_reg, Register tmp2_reg, Register tmp3_reg, Register tmp4_reg, 3302 Register tmp5_reg) { 3303 3304 // Str1 may be the same register as str2 which can occur e.g. after scalar replacement. 3305 assert_different_registers(result_reg, str1_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg); 3306 assert_different_registers(result_reg, str2_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg); 3307 3308 // Offset 0 should be 32 byte aligned. 3309 Label Linit_cbc, Lcbc, Lloop, Ldone_true, Ldone_false; 3310 Register index_reg = tmp5_reg; 3311 Register cbc_iter = tmp4_reg; 3312 3313 //-1: 3314 dcbtct(str1_reg, 0x00); // Indicate R/O access to str1. 3315 dcbtct(str2_reg, 0x00); // Indicate R/O access to str2. 3316 //1: 3317 andi(cbc_iter, cnt_reg, 4-1); // Remaining iterations after 4 java characters per iteration loop. 3318 li(index_reg, 0); // init 3319 li(result_reg, 0); // assume false 3320 srwi_(tmp2_reg, cnt_reg, exact_log2(4)); // Div: 4 java characters per iteration (main loop). 3321 3322 cmpwi(CCR1, cbc_iter, 0); // CCR1 = (cbc_iter==0) 3323 beq(CCR0, Linit_cbc); // too short 3324 mtctr(tmp2_reg); 3325 //8: 3326 bind(Lloop); 3327 ldx(tmp1_reg, str1_reg, index_reg); 3328 ldx(tmp2_reg, str2_reg, index_reg); 3329 cmpd(CCR0, tmp1_reg, tmp2_reg); 3330 bne(CCR0, Ldone_false); // Unequal char pair found -> done. 3331 addi(index_reg, index_reg, 4*sizeof(jchar)); 3332 bdnz(Lloop); 3333 //14: 3334 bind(Linit_cbc); 3335 beq(CCR1, Ldone_true); 3336 mtctr(cbc_iter); 3337 //16: 3338 bind(Lcbc); 3339 lhzx(tmp1_reg, str1_reg, index_reg); 3340 lhzx(tmp2_reg, str2_reg, index_reg); 3341 cmpw(CCR0, tmp1_reg, tmp2_reg); 3342 bne(CCR0, Ldone_false); // Unequal char pair found -> done. 3343 addi(index_reg, index_reg, 1*sizeof(jchar)); 3344 bdnz(Lcbc); 3345 nop(); 3346 bind(Ldone_true); 3347 li(result_reg, 1); 3348 //24: 3349 bind(Ldone_false); 3350 } 3351 3352 3353 void MacroAssembler::char_arrays_equalsImm(Register str1_reg, Register str2_reg, int cntval, Register result_reg, 3354 Register tmp1_reg, Register tmp2_reg) { 3355 // Str1 may be the same register as str2 which can occur e.g. after scalar replacement. 
3356 assert_different_registers(result_reg, str1_reg, tmp1_reg, tmp2_reg); 3357 assert_different_registers(result_reg, str2_reg, tmp1_reg, tmp2_reg); 3358 assert(sizeof(jchar) == 2, "must be"); 3359 assert(cntval >= 0 && ((cntval & 0x7fff) == cntval), "wrong immediate"); 3360 3361 Label Ldone_false; 3362 3363 if (cntval < 16) { // short case 3364 if (cntval != 0) li(result_reg, 0); // assume false 3365 3366 const int num_bytes = cntval*sizeof(jchar); 3367 int index = 0; 3368 for (int next_index; (next_index = index + 8) <= num_bytes; index = next_index) { 3369 ld(tmp1_reg, index, str1_reg); 3370 ld(tmp2_reg, index, str2_reg); 3371 cmpd(CCR0, tmp1_reg, tmp2_reg); 3372 bne(CCR0, Ldone_false); 3373 } 3374 if (cntval & 2) { 3375 lwz(tmp1_reg, index, str1_reg); 3376 lwz(tmp2_reg, index, str2_reg); 3377 cmpw(CCR0, tmp1_reg, tmp2_reg); 3378 bne(CCR0, Ldone_false); 3379 index += 4; 3380 } 3381 if (cntval & 1) { 3382 lhz(tmp1_reg, index, str1_reg); 3383 lhz(tmp2_reg, index, str2_reg); 3384 cmpw(CCR0, tmp1_reg, tmp2_reg); 3385 bne(CCR0, Ldone_false); 3386 } 3387 // fallthrough: true 3388 } else { 3389 Label Lloop; 3390 Register index_reg = tmp1_reg; 3391 const int loopcnt = cntval/4; 3392 assert(loopcnt > 0, "must be"); 3393 // Offset 0 should be 32 byte aligned. 3394 //2: 3395 dcbtct(str1_reg, 0x00); // Indicate R/O access to str1. 3396 dcbtct(str2_reg, 0x00); // Indicate R/O access to str2. 3397 li(tmp2_reg, loopcnt); 3398 li(index_reg, 0); // init 3399 li(result_reg, 0); // assume false 3400 mtctr(tmp2_reg); 3401 //8: 3402 bind(Lloop); 3403 ldx(R0, str1_reg, index_reg); 3404 ldx(tmp2_reg, str2_reg, index_reg); 3405 cmpd(CCR0, R0, tmp2_reg); 3406 bne(CCR0, Ldone_false); // Unequal char pair found -> done. 3407 addi(index_reg, index_reg, 4*sizeof(jchar)); 3408 bdnz(Lloop); 3409 //14: 3410 if (cntval & 2) { 3411 lwzx(R0, str1_reg, index_reg); 3412 lwzx(tmp2_reg, str2_reg, index_reg); 3413 cmpw(CCR0, R0, tmp2_reg); 3414 bne(CCR0, Ldone_false); 3415 if (cntval & 1) addi(index_reg, index_reg, 2*sizeof(jchar)); 3416 } 3417 if (cntval & 1) { 3418 lhzx(R0, str1_reg, index_reg); 3419 lhzx(tmp2_reg, str2_reg, index_reg); 3420 cmpw(CCR0, R0, tmp2_reg); 3421 bne(CCR0, Ldone_false); 3422 } 3423 // fallthru: true 3424 } 3425 li(result_reg, 1); 3426 bind(Ldone_false); 3427 } 3428 3429 // Helpers for Intrinsic Emitters 3430 // 3431 // Revert the byte order of a 32bit value in a register 3432 // src: 0x44556677 3433 // dst: 0x77665544 3434 // Three steps to obtain the result: 3435 // 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word 3436 // into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared. 3437 // This value initializes dst. 3438 // 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost 3439 // byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go. 3440 // This value is mask inserted into dst with a [0..23] mask of 1s. 3441 // 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position. 3442 // This value is mask inserted into dst with a [8..15] mask of 1s. 3443 void MacroAssembler::load_reverse_32(Register dst, Register src) { 3444 assert_different_registers(dst, src); 3445 3446 rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left. 3447 rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone. 
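// Worked trace for the example above (src = 0x44556677): after step 1
// dst = 0x00000044, after step 2 dst = 0x77445544; step 3 below inserts
// byte 6 (0x66) to complete dst = 0x77665544.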
3448 rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone.
3449 }
3450
3451 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3452 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3453 // body size from 20 to 16 instructions.
3454 // Returns the offset that was used to calculate the address of column tc3.
3455 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3456 // at hand, the original table address can be easily reconstructed.
3457 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3458
3459 #ifdef VM_LITTLE_ENDIAN
3460 // This is what we implement (the DOLIT4 part):
3461 // =========================================================================
3462 // #define DOLIT4 c ^= *buf4++; \
3463 // c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
3464 // crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
3465 // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
3466 // =========================================================================
3467 const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
3468 const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
3469 const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
3470 const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
3471 #else
3472 // This is what we implement (the DOBIG4 part):
3473 // =========================================================================
3474 // #define DOBIG4 c ^= *++buf4; \
3475 // c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
3476 // crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
3477 // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
3478 // =========================================================================
3479 const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
3480 const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
3481 const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
3482 const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
3483 #endif
3484 assert_different_registers(table, tc0, tc1, tc2);
3485 assert(table == tc3, "must be!");
3486
3487 if (ix0 != 0) addi(tc0, table, ix0);
3488 if (ix1 != 0) addi(tc1, table, ix1);
3489 if (ix2 != 0) addi(tc2, table, ix2);
3490 if (ix3 != 0) addi(tc3, table, ix3);
3491
3492 return ix3;
3493 }
3494
3495 /**
3496 * uint32_t crc;
3497 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
3498 */
3499 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3500 assert_different_registers(crc, table, tmp);
3501 assert_different_registers(val, table);
3502
3503 if (crc == val) { // Must rotate first to use the unmodified value.
3504 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3505 // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3506 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits.
3507 } else {
3508 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits.
3509 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
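// Either way, tmp now holds (val & 0xff) << 2 (the low byte of val scaled to
// a 4-byte table index) and crc holds crc >> 8, ready for the lwzx/xorr below.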
3510 }
3511 lwzx(tmp, table, tmp);
3512 xorr(crc, crc, tmp);
3513 }
3514
3515 /**
3516 * uint32_t crc;
3517 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
3518 */
3519 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
3520 fold_byte_crc32(crc, crc, table, tmp);
3521 }
3522
3523 /**
3524 * Emits code to update CRC-32 with a byte value according to constants in table.
3525 *
3526 * @param [in,out]crc Register containing the crc.
3527 * @param [in]val Register containing the byte to fold into the CRC.
3528 * @param [in]table Register containing the table of crc constants.
3529 *
3530 * uint32_t crc;
3531 * val = crc_table[(val ^ crc) & 0xFF];
3532 * crc = val ^ (crc >> 8);
3533 */
3534 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3535 BLOCK_COMMENT("update_byte_crc32:");
3536 xorr(val, val, crc);
3537 fold_byte_crc32(crc, val, table, val);
3538 }
3539
3540 /**
3541 * @param crc register containing existing CRC (32-bit)
3542 * @param buf register pointing to input byte buffer (byte*)
3543 * @param len register containing number of bytes
3544 * @param table register pointing to CRC table
3545 */
3546 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3547 Register data, bool loopAlignment, bool invertCRC) {
3548 assert_different_registers(crc, buf, len, table, data);
3549
3550 Label L_mainLoop, L_done;
3551 const int mainLoop_stepping = 1;
3552 const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3553
3554 // Process all bytes in a single-byte loop.
3555 cmpdi(CCR0, len, 0); // Anything to do?
3556 mtctr(len);
3557 beq(CCR0, L_done);
3558
3559 if (invertCRC) {
3560 nand(crc, crc, crc); // ~c
3561 }
3562
3563 align(mainLoop_alignment);
3564 BIND(L_mainLoop);
3565 lbz(data, 0, buf); // Byte from buffer, zero-extended.
3566 addi(buf, buf, mainLoop_stepping); // Advance buffer position.
3567 update_byte_crc32(crc, data, table);
3568 bdnz(L_mainLoop); // Iterate.
3569
3570 if (invertCRC) {
3571 nand(crc, crc, crc); // ~c
3572 }
3573
3574 bind(L_done);
3575 }
3576
3577 /**
3578 * Emits code to update CRC-32 with a 4-byte value according to constants in table.
3579 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
3580 */
3581 // A note on the lookup table address(es):
3582 // The lookup table consists of two sets of four columns each.
3583 // The columns {0..3} are used for little-endian machines.
3584 // The columns {4..7} are used for big-endian machines.
3585 // To save the effort of adding the column offset to the table address each time
3586 // a table element is looked up, it is possible to pass the pre-calculated
3587 // column addresses.
3588 // Uses R9..R12 as work registers. Must be saved/restored by caller, if necessary.
3589 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
3590 Register t0, Register t1, Register t2, Register t3,
3591 Register tc0, Register tc1, Register tc2, Register tc3) {
3592 assert_different_registers(crc, t3);
3593
3594 // XOR crc with next four bytes of buffer.
3595 lwz(t3, bufDisp, buf);
3596 if (bufInc != 0) {
3597 addi(buf, buf, bufInc);
3598 }
3599 xorr(t3, t3, crc);
3600
3601 // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
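// Each table column entry is 4 bytes wide, so the four byte values are
// pre-scaled by 4 (the "shifted left 2 bits") and can be used directly as
// byte offsets in the indexed loads (lwzx) that follow.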
3602 rlwinm(t0, t3, 2, 24-2, 31-2); // ((t3 >> 0) & 0xff) << 2
3603 rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t3 >> 8) & 0xff) << 2
3604 rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t3 >> 16) & 0xff) << 2
3605 rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t3 >> 24) & 0xff) << 2
3606
3607 // Use the pre-calculated column addresses.
3608 // Load pre-calculated table values.
3609 lwzx(t0, tc0, t0);
3610 lwzx(t1, tc1, t1);
3611 lwzx(t2, tc2, t2);
3612 lwzx(t3, tc3, t3);
3613
3614 // Calculate new crc from table values.
3615 xorr(t0, t0, t1);
3616 xorr(t2, t2, t3);
3617 xorr(crc, t0, t2); // Now crc contains the final checksum value.
3618 }
3619
3620 /**
3621 * @param crc register containing existing CRC (32-bit)
3622 * @param buf register pointing to input byte buffer (byte*)
3623 * @param len register containing number of bytes
3624 * @param table register pointing to CRC table
3625 *
3626 * Uses R9..R12 as work registers. Must be saved/restored by caller!
3627 */
3628 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
3629 Register t0, Register t1, Register t2, Register t3,
3630 Register tc0, Register tc1, Register tc2, Register tc3) {
3631 assert_different_registers(crc, buf, len, table);
3632
3633 Label L_mainLoop, L_tail;
3634 Register tmp = t0;
3635 Register data = t0;
3636 Register tmp2 = t1;
3637 const int mainLoop_stepping = 8;
3638 const int tailLoop_stepping = 1;
3639 const int log_stepping = exact_log2(mainLoop_stepping);
3640 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3641 const int complexThreshold = 2*mainLoop_stepping;
3642
3643 // Don't test for len <= 0 here. This pathological case should not occur anyway.
3644 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
3645 // The situation itself is detected and handled correctly by the conditional branches
3646 // following the adjustments of len by -/+ stepping.
3647 assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3648
3649 BLOCK_COMMENT("kernel_crc32_2word {");
3650
3651 nand(crc, crc, crc); // ~c
3652
3653 // Check for short (<mainLoop_stepping) buffer.
3654 cmpdi(CCR0, len, complexThreshold);
3655 blt(CCR0, L_tail);
3656
3657 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3658 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3659 {
3660 // Align buf addr to mainLoop_stepping boundary.
3661 neg(tmp2, buf); // Calculate # preLoop iterations for alignment.
3662 rldicl(tmp2, tmp2, 0, 64-log_stepping); // Keep only the low log_stepping bits: # bytes up to the next mainLoop_stepping boundary.
3663
3664 if (complexThreshold > mainLoop_stepping) {
3665 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3666 } else {
3667 sub(tmp, len, tmp2); // Remaining bytes for main loop.
3668 cmpdi(CCR0, tmp, mainLoop_stepping);
3669 blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing
3670 mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3671 }
3672 update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
3673 }
3674
3675 srdi(tmp2, len, log_stepping); // #iterations for mainLoop
3676 andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop
3677 mtctr(tmp2);
3678
3679 #ifdef VM_LITTLE_ENDIAN
3680 Register crc_rv = crc;
3681 #else
3682 Register crc_rv = tmp; // Load_reverse needs separate registers to work on.
3683 // Occupies tmp, but frees up crc.
3684 load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data.
3685 tmp = crc;
3686 #endif
3687
3688 int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3689
3690 align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement.
3691 BIND(L_mainLoop);
3692 update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3693 update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3694 bdnz(L_mainLoop);
3695
3696 #ifndef VM_LITTLE_ENDIAN
3697 load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data.
3698 tmp = crc_rv; // Tmp uses its original register again.
3699 #endif
3700
3701 // Restore original table address for tailLoop.
3702 if (reconstructTableOffset != 0) {
3703 addi(table, table, -reconstructTableOffset);
3704 }
3705
3706 // Process last few (<complexThreshold) bytes of buffer.
3707 BIND(L_tail);
3708 update_byteLoop_crc32(crc, buf, len, table, data, false, false);
3709
3710 nand(crc, crc, crc); // ~c
3711 BLOCK_COMMENT("} kernel_crc32_2word");
3712 }
3713
3714 /**
3715 * @param crc register containing existing CRC (32-bit)
3716 * @param buf register pointing to input byte buffer (byte*)
3717 * @param len register containing number of bytes
3718 * @param table register pointing to CRC table
3719 *
3720 * Uses R9..R12 as work registers. Must be saved/restored by caller!
3721 */
3722 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
3723 Register t0, Register t1, Register t2, Register t3,
3724 Register tc0, Register tc1, Register tc2, Register tc3) {
3725 assert_different_registers(crc, buf, len, table);
3726
3727 Label L_mainLoop, L_tail;
3728 Register tmp = t0;
3729 Register data = t0;
3730 Register tmp2 = t1;
3731 const int mainLoop_stepping = 4;
3732 const int tailLoop_stepping = 1;
3733 const int log_stepping = exact_log2(mainLoop_stepping);
3734 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3735 const int complexThreshold = 2*mainLoop_stepping;
3736
3737 // Don't test for len <= 0 here. This pathological case should not occur anyway.
3738 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
3739 // The situation itself is detected and handled correctly by the conditional branches
3740 // following the adjustments of len by -/+ stepping.
3741 assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3742
3743 BLOCK_COMMENT("kernel_crc32_1word {");
3744
3745 nand(crc, crc, crc); // ~c
3746
3747 // Check for short (<mainLoop_stepping) buffer.
3748 cmpdi(CCR0, len, complexThreshold);
3749 blt(CCR0, L_tail);
3750
3751 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3752 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3753 {
3754 // Align buf addr to mainLoop_stepping boundary.
3755 neg(tmp2, buf); // Calculate # preLoop iterations for alignment.
3756 rldicl(tmp2, tmp2, 0, 64-log_stepping); // Keep only the low log_stepping bits: # bytes up to the next mainLoop_stepping boundary.
3757
3758 if (complexThreshold > mainLoop_stepping) {
3759 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3760 } else {
3761 sub(tmp, len, tmp2); // Remaining bytes for main loop.
3762 cmpdi(CCR0, tmp, mainLoop_stepping);
3763 blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing
3764 mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3765 }
3766 update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
3767 }
3768
3769 srdi(tmp2, len, log_stepping); // #iterations for mainLoop
3770 andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop
3771 mtctr(tmp2);
3772
3773 #ifdef VM_LITTLE_ENDIAN
3774 Register crc_rv = crc;
3775 #else
3776 Register crc_rv = tmp; // Load_reverse needs separate registers to work on.
3777 // Occupies tmp, but frees up crc.
3778 load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data.
3779 tmp = crc;
3780 #endif
3781
3782 int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3783
3784 align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement.
3785 BIND(L_mainLoop);
3786 update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3787 bdnz(L_mainLoop);
3788
3789 #ifndef VM_LITTLE_ENDIAN
3790 load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data.
3791 tmp = crc_rv; // Tmp uses its original register again.
3792 #endif
3793
3794 // Restore original table address for tailLoop.
3795 if (reconstructTableOffset != 0) {
3796 addi(table, table, -reconstructTableOffset);
3797 }
3798
3799 // Process last few (<complexThreshold) bytes of buffer.
3800 BIND(L_tail);
3801 update_byteLoop_crc32(crc, buf, len, table, data, false, false);
3802
3803 nand(crc, crc, crc); // ~c
3804 BLOCK_COMMENT("} kernel_crc32_1word");
3805 }
3806
3807 /**
3808 * @param crc register containing existing CRC (32-bit)
3809 * @param buf register pointing to input byte buffer (byte*)
3810 * @param len register containing number of bytes
3811 * @param table register pointing to CRC table
3812 *
3813 * Uses R7_ARG5, R8_ARG6 as work registers.
3814 */
3815 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
3816 Register t0, Register t1, Register t2, Register t3) {
3817 assert_different_registers(crc, buf, len, table);
3818
3819 Register data = t0; // Holds the current byte to be folded into crc.
3820
3821 BLOCK_COMMENT("kernel_crc32_1byte {");
3822
3823 // Process all bytes in a single-byte loop.
3824 update_byteLoop_crc32(crc, buf, len, table, data, true, true);
3825
3826 BLOCK_COMMENT("} kernel_crc32_1byte");
3827 }
3828
3829 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp) {
3830 assert_different_registers(crc, buf, /* len, not used!! */ table, tmp);
3831
3832 BLOCK_COMMENT("kernel_crc32_singleByte:");
3833 nand(crc, crc, crc); // ~c
3834
3835 lbz(tmp, 0, buf); // Byte from buffer, zero-extended.
3836 update_byte_crc32(crc, tmp, table);
3837
3838 nand(crc, crc, crc); // ~c
3839 }
3840
3841 // dest_lo += src1 + src2
3842 // dest_hi += carry1 + carry2
3843 void MacroAssembler::add2_with_carry(Register dest_hi,
3844 Register dest_lo,
3845 Register src1, Register src2) {
3846 li(R0, 0);
3847 addc(dest_lo, dest_lo, src1);
3848 adde(dest_hi, dest_hi, R0);
3849 addc(dest_lo, dest_lo, src2);
3850 adde(dest_hi, dest_hi, R0);
3851 }
3852
3853 // Multiply 64 bit by 64 bit first loop.
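// Note on the rldicl(reg, reg, 32, 0) below: x, y and z hold 32-bit digits in
// big-endian digit order (most significant digit at the lowest index, as in
// BigInteger), so after a little-endian 64-bit load the two digits end up in
// the wrong register halves; the rotate by 32 swaps them back.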
3854 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, 3855 Register x_xstart, 3856 Register y, Register y_idx, 3857 Register z, 3858 Register carry, 3859 Register product_high, Register product, 3860 Register idx, Register kdx, 3861 Register tmp) { 3862 // jlong carry, x[], y[], z[]; 3863 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) { 3864 // huge_128 product = y[idx] * x[xstart] + carry; 3865 // z[kdx] = (jlong)product; 3866 // carry = (jlong)(product >>> 64); 3867 // } 3868 // z[xstart] = carry; 3869 3870 Label L_first_loop, L_first_loop_exit; 3871 Label L_one_x, L_one_y, L_multiply; 3872 3873 addic_(xstart, xstart, -1); 3874 blt(CCR0, L_one_x); // Special case: length of x is 1. 3875 3876 // Load next two integers of x. 3877 sldi(tmp, xstart, LogBytesPerInt); 3878 ldx(x_xstart, x, tmp); 3879 #ifdef VM_LITTLE_ENDIAN 3880 rldicl(x_xstart, x_xstart, 32, 0); 3881 #endif 3882 3883 align(32, 16); 3884 bind(L_first_loop); 3885 3886 cmpdi(CCR0, idx, 1); 3887 blt(CCR0, L_first_loop_exit); 3888 addi(idx, idx, -2); 3889 beq(CCR0, L_one_y); 3890 3891 // Load next two integers of y. 3892 sldi(tmp, idx, LogBytesPerInt); 3893 ldx(y_idx, y, tmp); 3894 #ifdef VM_LITTLE_ENDIAN 3895 rldicl(y_idx, y_idx, 32, 0); 3896 #endif 3897 3898 3899 bind(L_multiply); 3900 multiply64(product_high, product, x_xstart, y_idx); 3901 3902 li(tmp, 0); 3903 addc(product, product, carry); // Add carry to result. 3904 adde(product_high, product_high, tmp); // Add carry of the last addition. 3905 addi(kdx, kdx, -2); 3906 3907 // Store result. 3908 #ifdef VM_LITTLE_ENDIAN 3909 rldicl(product, product, 32, 0); 3910 #endif 3911 sldi(tmp, kdx, LogBytesPerInt); 3912 stdx(product, z, tmp); 3913 mr_if_needed(carry, product_high); 3914 b(L_first_loop); 3915 3916 3917 bind(L_one_y); // Load one 32 bit portion of y as (0,value). 3918 3919 lwz(y_idx, 0, y); 3920 b(L_multiply); 3921 3922 3923 bind(L_one_x); // Load one 32 bit portion of x as (0,value). 3924 3925 lwz(x_xstart, 0, x); 3926 b(L_first_loop); 3927 3928 bind(L_first_loop_exit); 3929 } 3930 3931 // Multiply 64 bit by 64 bit and add 128 bit. 3932 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, 3933 Register z, Register yz_idx, 3934 Register idx, Register carry, 3935 Register product_high, Register product, 3936 Register tmp, int offset) { 3937 3938 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry; 3939 // z[kdx] = (jlong)product; 3940 3941 sldi(tmp, idx, LogBytesPerInt); 3942 if (offset) { 3943 addi(tmp, tmp, offset); 3944 } 3945 ldx(yz_idx, y, tmp); 3946 #ifdef VM_LITTLE_ENDIAN 3947 rldicl(yz_idx, yz_idx, 32, 0); 3948 #endif 3949 3950 multiply64(product_high, product, x_xstart, yz_idx); 3951 ldx(yz_idx, z, tmp); 3952 #ifdef VM_LITTLE_ENDIAN 3953 rldicl(yz_idx, yz_idx, 32, 0); 3954 #endif 3955 3956 add2_with_carry(product_high, product, carry, yz_idx); 3957 3958 sldi(tmp, idx, LogBytesPerInt); 3959 if (offset) { 3960 addi(tmp, tmp, offset); 3961 } 3962 #ifdef VM_LITTLE_ENDIAN 3963 rldicl(product, product, 32, 0); 3964 #endif 3965 stdx(product, z, tmp); 3966 } 3967 3968 // Multiply 128 bit by 128 bit. Unrolled inner loop. 
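// Each multiply_add_128_x_128 call below handles one 64-bit chunk (two
// 32-bit digits): product = y_chunk * x_xstart + z_chunk + carry; the low
// half is stored back to z and the high half becomes the next carry.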
3969 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, 3970 Register y, Register z, 3971 Register yz_idx, Register idx, Register carry, 3972 Register product_high, Register product, 3973 Register carry2, Register tmp) { 3974 3975 // jlong carry, x[], y[], z[]; 3976 // int kdx = ystart+1; 3977 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 3978 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry; 3979 // z[kdx+idx+1] = (jlong)product; 3980 // jlong carry2 = (jlong)(product >>> 64); 3981 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2; 3982 // z[kdx+idx] = (jlong)product; 3983 // carry = (jlong)(product >>> 64); 3984 // } 3985 // idx += 2; 3986 // if (idx > 0) { 3987 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry; 3988 // z[kdx+idx] = (jlong)product; 3989 // carry = (jlong)(product >>> 64); 3990 // } 3991 3992 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 3993 const Register jdx = R0; 3994 3995 // Scale the index. 3996 srdi_(jdx, idx, 2); 3997 beq(CCR0, L_third_loop_exit); 3998 mtctr(jdx); 3999 4000 align(32, 16); 4001 bind(L_third_loop); 4002 4003 addi(idx, idx, -4); 4004 4005 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8); 4006 mr_if_needed(carry2, product_high); 4007 4008 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0); 4009 mr_if_needed(carry, product_high); 4010 bdnz(L_third_loop); 4011 4012 bind(L_third_loop_exit); // Handle any left-over operand parts. 4013 4014 andi_(idx, idx, 0x3); 4015 beq(CCR0, L_post_third_loop_done); 4016 4017 Label L_check_1; 4018 4019 addic_(idx, idx, -2); 4020 blt(CCR0, L_check_1); 4021 4022 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0); 4023 mr_if_needed(carry, product_high); 4024 4025 bind(L_check_1); 4026 4027 addi(idx, idx, 0x2); 4028 andi_(idx, idx, 0x1) ; 4029 addic_(idx, idx, -1); 4030 blt(CCR0, L_post_third_loop_done); 4031 4032 sldi(tmp, idx, LogBytesPerInt); 4033 lwzx(yz_idx, y, tmp); 4034 multiply64(product_high, product, x_xstart, yz_idx); 4035 lwzx(yz_idx, z, tmp); 4036 4037 add2_with_carry(product_high, product, yz_idx, carry); 4038 4039 sldi(tmp, idx, LogBytesPerInt); 4040 stwx(product, z, tmp); 4041 srdi(product, product, 32); 4042 4043 sldi(product_high, product_high, 32); 4044 orr(product, product, product_high); 4045 mr_if_needed(carry, product); 4046 4047 bind(L_post_third_loop_done); 4048 } // multiply_128_x_128_loop 4049 4050 void MacroAssembler::multiply_to_len(Register x, Register xlen, 4051 Register y, Register ylen, 4052 Register z, Register zlen, 4053 Register tmp1, Register tmp2, 4054 Register tmp3, Register tmp4, 4055 Register tmp5, Register tmp6, 4056 Register tmp7, Register tmp8, 4057 Register tmp9, Register tmp10, 4058 Register tmp11, Register tmp12, 4059 Register tmp13) { 4060 4061 ShortBranchVerifier sbv(this); 4062 4063 assert_different_registers(x, xlen, y, ylen, z, zlen, 4064 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6); 4065 assert_different_registers(x, xlen, y, ylen, z, zlen, 4066 tmp1, tmp2, tmp3, tmp4, tmp5, tmp7); 4067 assert_different_registers(x, xlen, y, ylen, z, zlen, 4068 tmp1, tmp2, tmp3, tmp4, tmp5, tmp8); 4069 4070 const Register idx = tmp1; 4071 const Register kdx = tmp2; 4072 const Register xstart = tmp3; 4073 4074 const Register y_idx = tmp4; 4075 const Register carry = tmp5; 4076 const Register product = tmp6; 4077 const Register product_high = tmp7; 4078 const Register x_xstart = tmp8; 4079 const Register tmp = 
tmp9;
4080
4081 // First Loop.
4082 //
4083 // final static long LONG_MASK = 0xffffffffL;
4084 // int xstart = xlen - 1;
4085 // int ystart = ylen - 1;
4086 // long carry = 0;
4087 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4088 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
4089 // z[kdx] = (int)product;
4090 // carry = product >>> 32;
4091 // }
4092 // z[xstart] = (int)carry;
4093
4094 mr_if_needed(idx, ylen); // idx = ylen
4095 mr_if_needed(kdx, zlen); // kdx = xlen + ylen
4096 li(carry, 0); // carry = 0
4097
4098 Label L_done;
4099
4100 addic_(xstart, xlen, -1);
4101 blt(CCR0, L_done);
4102
4103 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
4104 carry, product_high, product, idx, kdx, tmp);
4105
4106 Label L_second_loop;
4107
4108 cmpdi(CCR0, kdx, 0);
4109 beq(CCR0, L_second_loop);
4110
4111 Label L_carry;
4112
4113 addic_(kdx, kdx, -1);
4114 beq(CCR0, L_carry);
4115
4116 // Store lower 32 bits of carry.
4117 sldi(tmp, kdx, LogBytesPerInt);
4118 stwx(carry, z, tmp);
4119 srdi(carry, carry, 32);
4120 addi(kdx, kdx, -1);
4121
4122
4123 bind(L_carry);
4124
4125 // Store upper 32 bits of carry.
4126 sldi(tmp, kdx, LogBytesPerInt);
4127 stwx(carry, z, tmp);
4128
4129 // Second and third (nested) loops.
4130 //
4131 // for (int i = xstart-1; i >= 0; i--) { // Second loop
4132 // carry = 0;
4133 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
4134 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
4135 // (z[k] & LONG_MASK) + carry;
4136 // z[k] = (int)product;
4137 // carry = product >>> 32;
4138 // }
4139 // z[i] = (int)carry;
4140 // }
4141 //
4142 // idx = tmp1, kdx = tmp2, carry = tmp5, x[i] = x_xstart (tmp8)
4143
4144 bind(L_second_loop);
4145
4146 li(carry, 0); // carry = 0;
4147
4148 addic_(xstart, xstart, -1); // i = xstart-1;
4149 blt(CCR0, L_done);
4150
4151 Register zsave = tmp10;
4152
4153 mr(zsave, z);
4154
4155
4156 Label L_last_x;
4157
4158 sldi(tmp, xstart, LogBytesPerInt);
4159 add(z, z, tmp); // z = z + k - j
4160 addi(z, z, 4);
4161 addic_(xstart, xstart, -1); // i = xstart-1;
4162 blt(CCR0, L_last_x);
4163
4164 sldi(tmp, xstart, LogBytesPerInt);
4165 ldx(x_xstart, x, tmp);
4166 #ifdef VM_LITTLE_ENDIAN
4167 rldicl(x_xstart, x_xstart, 32, 0);
4168 #endif
4169
4170
4171 Label L_third_loop_prologue;
4172
4173 bind(L_third_loop_prologue);
4174
4175 Register xsave = tmp11;
4176 Register xlensave = tmp12;
4177 Register ylensave = tmp13;
4178
4179 mr(xsave, x);
4180 mr(xlensave, xstart);
4181 mr(ylensave, ylen);
4182
4183
4184 multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
4185 carry, product_high, product, x, tmp);
4186
4187 mr(z, zsave);
4188 mr(x, xsave);
4189 mr(xlen, xlensave); // This is the decrement of the loop counter!
4190 mr(ylen, ylensave);
4191
4192 addi(tmp3, xlen, 1);
4193 sldi(tmp, tmp3, LogBytesPerInt);
4194 stwx(carry, z, tmp);
4195 addic_(tmp3, tmp3, -1);
4196 blt(CCR0, L_done);
4197
4198 srdi(carry, carry, 32);
4199 sldi(tmp, tmp3, LogBytesPerInt);
4200 stwx(carry, z, tmp);
4201 b(L_second_loop);
4202
4203 // Next infrequent code is moved outside loops.
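// L_last_x: taken when only a single 32-bit digit of x remains; it is loaded
// zero-extended as (0, value) before re-entering the third-loop prologue.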
4204 bind(L_last_x); 4205 4206 lwz(x_xstart, 0, x); 4207 b(L_third_loop_prologue); 4208 4209 bind(L_done); 4210 } // multiply_to_len 4211 4212 void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) { 4213 #ifdef ASSERT 4214 Label ok; 4215 if (check_equal) { 4216 beq(CCR0, ok); 4217 } else { 4218 bne(CCR0, ok); 4219 } 4220 stop(msg, id); 4221 bind(ok); 4222 #endif 4223 } 4224 4225 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset, 4226 Register mem_base, const char* msg, int id) { 4227 #ifdef ASSERT 4228 switch (size) { 4229 case 4: 4230 lwz(R0, mem_offset, mem_base); 4231 cmpwi(CCR0, R0, 0); 4232 break; 4233 case 8: 4234 ld(R0, mem_offset, mem_base); 4235 cmpdi(CCR0, R0, 0); 4236 break; 4237 default: 4238 ShouldNotReachHere(); 4239 } 4240 asm_assert(check_equal, msg, id); 4241 #endif // ASSERT 4242 } 4243 4244 void MacroAssembler::verify_thread() { 4245 if (VerifyThread) { 4246 unimplemented("'VerifyThread' currently not implemented on PPC"); 4247 } 4248 } 4249 4250 // READ: oop. KILL: R0. Volatile floats perhaps. 4251 void MacroAssembler::verify_oop(Register oop, const char* msg) { 4252 if (!VerifyOops) { 4253 return; 4254 } 4255 4256 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); 4257 const Register tmp = R11; // Will be preserved. 4258 const int nbytes_save = 11*8; // Volatile gprs except R0. 4259 save_volatile_gprs(R1_SP, -nbytes_save); // except R0 4260 4261 if (oop == tmp) mr(R4_ARG2, oop); 4262 save_LR_CR(tmp); // save in old frame 4263 push_frame_reg_args(nbytes_save, tmp); 4264 // load FunctionDescriptor** / entry_address * 4265 load_const_optimized(tmp, fd, R0); 4266 // load FunctionDescriptor* / entry_address 4267 ld(tmp, 0, tmp); 4268 if (oop != tmp) mr_if_needed(R4_ARG2, oop); 4269 load_const_optimized(R3_ARG1, (address)msg, R0); 4270 // Call destination for its side effect. 4271 call_c(tmp); 4272 4273 pop_frame(); 4274 restore_LR_CR(tmp); 4275 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 4276 } 4277 4278 const char* stop_types[] = { 4279 "stop", 4280 "untested", 4281 "unimplemented", 4282 "shouldnotreachhere" 4283 }; 4284 4285 static void stop_on_request(int tp, const char* msg) { 4286 tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg); 4287 guarantee(false, err_msg("PPC assembly code requires stop: %s", msg)); 4288 } 4289 4290 // Call a C-function that prints output. 4291 void MacroAssembler::stop(int type, const char* msg, int id) { 4292 #ifndef PRODUCT 4293 block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg)); 4294 #else 4295 block_comment("stop {"); 4296 #endif 4297 4298 // setup arguments 4299 load_const_optimized(R3_ARG1, type); 4300 load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0); 4301 call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2); 4302 illtrap(); 4303 emit_int32(id); 4304 block_comment("} stop;"); 4305 } 4306 4307 #ifndef PRODUCT 4308 // Write pattern 0x0101010101010101 in memory region [low-before, high+after]. 4309 // Val, addr are temp registers. 4310 // If low == addr, addr is killed. 4311 // High is preserved. 
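// Roughly, in C (a sketch of the zap semantics along the general loop path;
// the short unrolled path handles low == high with at most 4 words):
//   for (intptr_t* p = low - before; p <= high + after; p++)
//     *p = 0x0101010101010101;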
4312 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) { 4313 if (!ZapMemory) return; 4314 4315 assert_different_registers(low, val); 4316 4317 BLOCK_COMMENT("zap memory region {"); 4318 load_const_optimized(val, 0x0101010101010101); 4319 int size = before + after; 4320 if (low == high && size < 5 && size > 0) { 4321 int offset = -before*BytesPerWord; 4322 for (int i = 0; i < size; ++i) { 4323 std(val, offset, low); 4324 offset += (1*BytesPerWord); 4325 } 4326 } else { 4327 addi(addr, low, -before*BytesPerWord); 4328 assert_different_registers(high, val); 4329 if (after) addi(high, high, after * BytesPerWord); 4330 Label loop; 4331 bind(loop); 4332 std(val, 0, addr); 4333 addi(addr, addr, 8); 4334 cmpd(CCR6, addr, high); 4335 ble(CCR6, loop); 4336 if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value. 4337 } 4338 BLOCK_COMMENT("} zap memory region"); 4339 } 4340 4341 #endif // !PRODUCT 4342 4343 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() { 4344 int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true); 4345 assert(sizeof(bool) == 1, "PowerPC ABI"); 4346 masm->lbz(temp, simm16_offset, temp); 4347 masm->cmpwi(CCR0, temp, 0); 4348 masm->beq(CCR0, _label); 4349 } 4350 4351 SkipIfEqualZero::~SkipIfEqualZero() { 4352 _masm->bind(_label); 4353 }