/*
 * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2012, 2015 SAP AG. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "compiler/disassembler.hpp"
#include "gc/shared/cardTableModRefBS.hpp"
#include "gc/shared/collectedHeap.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/resourceArea.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/os.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/macros.hpp"
#if INCLUDE_ALL_GCS
#include "gc/g1/g1CollectedHeap.inline.hpp"
#include "gc/g1/g1SATBCardTableModRefBS.hpp"
#include "gc/g1/heapRegion.hpp"
#endif // INCLUDE_ALL_GCS

#ifdef PRODUCT
#define BLOCK_COMMENT(str) // nothing
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef ASSERT
// On RISC, there's no benefit to verifying instruction boundaries.
bool AbstractAssembler::pd_check_instruction_mark() { return false; }
#endif

// Load with a 31-bit offset: a single ld for simm16 offsets (optionally
// followed by a filler nop to keep the sequence length constant), else
// addis + ld. Does not check that d and a are distinct.
void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  if (Assembler::is_simm(si31, 16)) {
    ld(d, si31, a);
    if (emit_filler_nop) nop();
  } else {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
    addis(d, a, hi);
    ld(d, lo, d);
  }
}

void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  assert_different_registers(d, a);
  ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
}

// Load a value of the given size from base+offs, sign- or zero-extending
// it to register width.
void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                      size_t size_in_bytes, bool is_signed) {
  switch (size_in_bytes) {
  case 8: ld(dst, offs, base); break;
  case 4: is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  case 2: is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  case 1: lbz(dst, offs, base); if (is_signed) extsb(dst, dst); break; // lba doesn't exist :(
  default: ShouldNotReachHere();
  }
}

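// Store a value of the given size to base+offs; the counterpart of
// load_sized_value above.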
void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                       size_t size_in_bytes) {
  switch (size_in_bytes) {
  case 8: std(dst, offs, base); break;
  case 4: stw(dst, offs, base); break;
  case 2: sth(dst, offs, base); break;
  case 1: stb(dst, offs, base); break;
  default: ShouldNotReachHere();
  }
}

void MacroAssembler::align(int modulus, int max, int rem) {
  int padding = (rem + modulus - (offset() % modulus)) % modulus;
  if (padding > max) return;
  for (int c = (padding >> 2); c > 0; --c) { nop(); }
}

// Issue instructions that calculate given TOC from global TOC.
void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
                                                       bool add_relocation, bool emit_dummy_addr) {
  int offset = -1;
  if (emit_dummy_addr) {
    offset = -128; // dummy address
  } else if (addr != (address)(intptr_t)-1) {
    offset = MacroAssembler::offset_to_global_toc(addr);
  }

  if (hi16) {
    addis(dst, R29, MacroAssembler::largeoffset_si16_si16_hi(offset));
  }
  if (lo16) {
    if (add_relocation) {
      // Relocate at the addi to avoid confusion with a load from the method's TOC.
      relocate(internal_word_Relocation::spec(addr));
    }
    addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
  }
}

// Patch the addis/addi sequence emitted by calculate_address_from_global_toc
// so that it computes addr. Returns addr minus the address of the addis.
int MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
  const int offset = MacroAssembler::offset_to_global_toc(addr);

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
  set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
  set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
  return (int)((intptr_t)addr - (intptr_t)inst1_addr);
}

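// Read back the address computed by a calculate_address_from_global_toc
// sequence. An encoded offset of -1 is a special case and yields
// (address)-1.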
address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");

  int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
  // -1 is a special case
  if (offset == -1) {
    return (address)(intptr_t)-1;
  } else {
    return global_toc() + offset;
  }
}

#ifdef _LP64
// Patch compressed oops or klass constants.
// Assembler sequence is
// 1) compressed oops:
//    lis    rx = const.hi
//    ori    rx = rx | const.lo
// 2) compressed klass:
//    lis    rx = const.hi
//    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
//    ori    rx = rx | const.lo
// Clrldi will be passed by.
int MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;
  while (inst1_addr >= bound) {
    inst1 = *(int *)inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  int xc = (data >> 16) & 0xffff;
  int xd = (data >>  0) & 0xffff;

  set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
  set_imm((int *)inst2_addr, (xd));        // unsigned int
  return (int)((intptr_t)inst2_addr - (intptr_t)inst1_addr);
}

// Get compressed oop or klass constant.
narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;

  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
  uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);

  return (int) (xl | xh);
}
#endif // _LP64

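// Load an AddressLiteral from the method's TOC: adds a constant-pool
// entry and emits a TOC-relative load from it, relocated at the load.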
void MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a, Register toc) {
  int toc_offset = 0;
  // Use RelocationHolder::none for the constant pool entry, otherwise
  // we will end up with a failing NativeCall::verify(x) where x is
  // the address of the constant pool entry.
  // FIXME: We should insert relocation information for oops at the constant
  // pool entries instead of inserting it at the loads; patching of a constant
  // pool entry should be less expensive.
  address oop_address = address_constant((address)a.value(), RelocationHolder::none);
  // Relocate at the pc of the load.
  relocate(a.rspec());
  toc_offset = (int)(oop_address - code()->consts()->start());
  ld_largeoffset_unchecked(dst, toc_offset, toc, true);
}

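// Identify the first instruction of a load_const_from_method_toc
// sequence (either the ld itself or the addis of an addis/ld pair).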
bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  // The relocation points to the ld or the addis.
  return (is_ld(inst1)) ||
         (is_addis(inst1) && inv_ra_field(inst1) != 0);
}

int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
  assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");

  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  if (is_ld(inst1)) {
    return inv_d1_field(inst1);
  } else if (is_addis(inst1)) {
    const int dst = inv_rt_field(inst1);

    // Now, find the succeeding ld which reads and writes to dst.
    address inst2_addr = inst1_addr + BytesPerInstWord;
    int inst2 = 0;
    while (true) {
      inst2 = *(int *) inst2_addr;
      if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
        // Stop, found the ld which reads and writes dst.
        break;
      }
      inst2_addr += BytesPerInstWord;
    }
    return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
  }
  ShouldNotReachHere();
  return 0;
}

// Get the constant from a `load_const' sequence.
long MacroAssembler::get_const(address a) {
  assert(is_load_const_at(a), "not a load of a constant");
  const int *p = (const int*) a;
  unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
  if (is_ori(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
  } else if (is_lis(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
  } else {
    ShouldNotReachHere();
    return (long) 0;
  }
  return (long) x;
}

// Patch the 64 bit constant of a `load_const' sequence. This is a low
// level procedure. It neither flushes the instruction cache nor is
// MT-safe.
void MacroAssembler::patch_const(address a, long x) {
  assert(is_load_const_at(a), "not a load of a constant");
  int *p = (int*) a;
  if (is_ori(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(1 + p, (x >> 32) & 0xffff);
    set_imm(3 + p, (x >> 16) & 0xffff);
    set_imm(4 + p, x & 0xffff);
  } else if (is_lis(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(2 + p, (x >> 32) & 0xffff);
    set_imm(1 + p, (x >> 16) & 0xffff);
    set_imm(3 + p, x & 0xffff);
  } else {
    ShouldNotReachHere();
  }
}

AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->allocate_metadata_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->allocate_oop_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp, int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0) {
    return RegisterOrConstant(value + offset);
  }

  // Load indirectly to solve generation ordering problem.
  // static address, no relocation
  int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
  ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)

  if (offset != 0) {
    addi(tmp, tmp, offset);
  }

  return RegisterOrConstant(tmp);
}

#ifndef PRODUCT
void MacroAssembler::pd_print_patched_instruction(address branch) {
  Unimplemented(); // TODO: PPC port
}
#endif // ndef PRODUCT

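// A bc_far site takes one of three shapes (see the is_bc_far_variant*_at
// predicates and set_dest_of_bc_far_at below):
//   variant 1: bcxx DEST; endgroup             (dest reachable by bcxx)
//   variant 2: b!cxx SKIP; bxx DEST; SKIP:     (general far branch)
//   variant 3: nop; endgroup                   (branch to next instruction)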
// Conditional far branch for destinations encodable in 24+2 bits.
void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {

  // If requested by flag optimize, relocate the bc_far as a
  // runtime_call and prepare for optimizing it when the code gets
  // relocated.
  if (optimize == bc_far_optimize_on_relocate) {
    relocate(relocInfo::runtime_call_type);
  }

  // variant 2:
  //
  //    b!cxx SKIP
  //    bxx   DEST
  //  SKIP:
  //

  const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                opposite_bcond(inv_boint_bcond(boint)));

  // We emit two branches.
  // First, a conditional branch which jumps around the far branch.
  const address not_taken_pc = pc() + 2 * BytesPerInstWord;
  const address bc_pc = pc();
  bc(opposite_boint, biint, not_taken_pc);

  const int bc_instr = *(int*)bc_pc;
  assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
  assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
  assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
                                     opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
         "postcondition");
  assert(biint == inv_bi_field(bc_instr), "postcondition");

  // Second, an unconditional far branch which jumps to dest.
  // Note: target(dest) remembers the current pc (see CodeSection::target)
  // and returns the current pc if the label is not bound yet; when
  // the label gets bound, the unconditional far branch will be patched.
  const address target_pc = target(dest);
  const address b_pc = pc();
  b(target_pc);

  assert(not_taken_pc == pc(),                 "postcondition");
  assert(dest.is_bound() || target_pc == b_pc, "postcondition");
}

bool MacroAssembler::is_bc_far_at(address instruction_addr) {
  return is_bc_far_variant1_at(instruction_addr) ||
         is_bc_far_variant2_at(instruction_addr) ||
         is_bc_far_variant3_at(instruction_addr);
}

address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
  if (is_bc_far_variant1_at(instruction_addr)) {
    const address instruction_1_addr = instruction_addr;
    const int instruction_1 = *(int*)instruction_1_addr;
    return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
  } else if (is_bc_far_variant2_at(instruction_addr)) {
    const address instruction_2_addr = instruction_addr + 4;
    return bxx_destination(instruction_2_addr);
  } else if (is_bc_far_variant3_at(instruction_addr)) {
    return instruction_addr + 8;
  }
  // variant 4 ???
  ShouldNotReachHere();
  return NULL;
}

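// Patch the destination of a bc_far site, re-emitting it as the cheapest
// variant that still reaches dest, and flush the icache.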
void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {

  if (is_bc_far_variant3_at(instruction_addr)) {
    // variant 3, far cond branch to the next instruction, already patched to nops:
    //
    //    nop
    //    endgroup
    //  SKIP/DEST:
    //
    return;
  }

  // first, extract boint and biint from the current branch
  int boint = 0;
  int biint = 0;

  ResourceMark rm;
  const int code_size = 2 * BytesPerInstWord;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
    // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
    masm.nop();
    masm.endgroup();
  } else {
    if (is_bc_far_variant1_at(instruction_addr)) {
      // variant 1, the 1st instruction contains the destination address:
      //
      //    bcxx  DEST
      //    endgroup
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = inv_bo_field(instruction_1);
      biint = inv_bi_field(instruction_1);
    } else if (is_bc_far_variant2_at(instruction_addr)) {
      // variant 2, the 2nd instruction contains the destination address:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
                                 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
      biint = inv_bi_field(instruction_1);
    } else {
      // variant 4???
      ShouldNotReachHere();
    }

    // second, set the new branch destination and optimize the code
    if (dest != instruction_addr + 4 && // the bc_far is still unbound!
        masm.is_within_range_of_bcxx(dest, instruction_addr)) {
      // variant 1:
      //
      //    bcxx  DEST
      //    endgroup
      //
      masm.bc(boint, biint, dest);
      masm.endgroup();
    } else {
      // variant 2:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                    opposite_bcond(inv_boint_bcond(boint)));
      const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
      masm.bc(opposite_boint, biint, not_taken_pc);
      masm.b(dest);
    }
  }
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Emit a NOT mt-safe patchable 64 bit absolute call/jump.
void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
  // get current pc
  uint64_t start_pc = (uint64_t) pc();

  const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
  const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first

  // relocate here
  if (rt != relocInfo::none) {
    relocate(rt);
  }

  if ( ReoptimizeCallSequences &&
       (( link && is_within_range_of_b(dest, pc_of_bl)) ||
        (!link && is_within_range_of_b(dest, pc_of_b)))) {
    // variant 2:
    // Emit an optimized, pc-relative call/jump.

    if (link) {
      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();

      // do the call
      assert(pc() == pc_of_bl, "just checking");
      bl(dest, relocInfo::none);
    } else {
      // do the jump
      assert(pc() == pc_of_b, "just checking");
      b(dest, relocInfo::none);

      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();
    }

    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
           "can't identify emitted call");
  } else {
    // variant 1:
#if defined(ABI_ELFv2)
    nop();
    calculate_address_from_global_toc(R12, dest, true, true, false);
    mtctr(R12);
    nop();
    nop();
#else
    mr(R0, R11);  // spill R11 -> R0.

    // Load the destination address into CTR,
    // calculate destination relative to global toc.
    calculate_address_from_global_toc(R11, dest, true, true, false);

    mtctr(R11);
    mr(R11, R0);  // spill R11 <- R0.
    nop();
#endif

    // do the call/jump
    if (link) {
      bctrl();
    } else {
      bctr();
    }
    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
           "can't identify emitted call");
  }

  // Assert that we can identify the emitted call/jump.
  assert(is_bxx64_patchable_at((address)start_pc, link),
         "can't identify emitted call");
  assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
         "wrong encoding of dest address");
}

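// A bxx64_patchable site always occupies seven instruction words
// (bxx64_patchable_size bytes); the predicates below distinguish the
// emitted shapes by instruction pattern.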
// Identify a bxx64_patchable instruction.
bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
  return is_bxx64_patchable_variant1b_at(instruction_addr, link)
    //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
      || is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Does the call64_patchable instruction use a pc-relative encoding of
// the call destination?
bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
  // variant 2 is pc-relative
  return is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Identify variant 1.
bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[5]) // mtctr
      && is_load_const_at(instruction_addr);
}

// Identify variant 1b: load destination relative to global toc.
bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[3]) // mtctr
      && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
}

// Identify variant 2.
bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  if (link) {
    return is_bl (instr[6])  // bl dest is last
        && is_nop(instr[0])  // nop
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5]); // nop
  } else {
    return is_b  (instr[0])  // b dest is first
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5])  // nop
        && is_nop(instr[6]); // nop
  }
}

// Set dest address of a bxx64_patchable instruction.
void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
  ResourceMark rm;
  int code_size = MacroAssembler::bxx64_patchable_size;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  masm.bxx64_patchable(dest, relocInfo::none, link);
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Get dest address of a bxx64_patchable instruction.
address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
  if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
    return (address) (unsigned long) get_const(instruction_addr);
  } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
    unsigned int* instr = (unsigned int*) instruction_addr;
    if (link) {
      const int instr_idx = 6; // bl is last
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    } else {
      const int instr_idx = 0; // b is first
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    }
  // Load dest relative to global toc.
  } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
    return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
                                                               instruction_addr);
  } else {
    ShouldNotReachHere();
    return NULL;
  }
}

// Uses ordering which corresponds to ABI:
//    _savegpr0_14:  std  r14,-144(r1)
//    _savegpr0_15:  std  r15,-136(r1)
//    _savegpr0_16:  std  r16,-128(r1)
void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
  std(R14, offset, dst);   offset += 8;
  std(R15, offset, dst);   offset += 8;
  std(R16, offset, dst);   offset += 8;
  std(R17, offset, dst);   offset += 8;
  std(R18, offset, dst);   offset += 8;
  std(R19, offset, dst);   offset += 8;
  std(R20, offset, dst);   offset += 8;
  std(R21, offset, dst);   offset += 8;
  std(R22, offset, dst);   offset += 8;
  std(R23, offset, dst);   offset += 8;
  std(R24, offset, dst);   offset += 8;
  std(R25, offset, dst);   offset += 8;
  std(R26, offset, dst);   offset += 8;
  std(R27, offset, dst);   offset += 8;
  std(R28, offset, dst);   offset += 8;
  std(R29, offset, dst);   offset += 8;
  std(R30, offset, dst);   offset += 8;
  std(R31, offset, dst);   offset += 8;

  stfd(F14, offset, dst);  offset += 8;
  stfd(F15, offset, dst);  offset += 8;
  stfd(F16, offset, dst);  offset += 8;
  stfd(F17, offset, dst);  offset += 8;
  stfd(F18, offset, dst);  offset += 8;
  stfd(F19, offset, dst);  offset += 8;
  stfd(F20, offset, dst);  offset += 8;
  stfd(F21, offset, dst);  offset += 8;
  stfd(F22, offset, dst);  offset += 8;
  stfd(F23, offset, dst);  offset += 8;
  stfd(F24, offset, dst);  offset += 8;
  stfd(F25, offset, dst);  offset += 8;
  stfd(F26, offset, dst);  offset += 8;
  stfd(F27, offset, dst);  offset += 8;
  stfd(F28, offset, dst);  offset += 8;
  stfd(F29, offset, dst);  offset += 8;
  stfd(F30, offset, dst);  offset += 8;
  stfd(F31, offset, dst);
}

// Uses ordering which corresponds to ABI:
//    _restgpr0_14:  ld   r14,-144(r1)
//    _restgpr0_15:  ld   r15,-136(r1)
//    _restgpr0_16:  ld   r16,-128(r1)
void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
  ld(R14, offset, src);   offset += 8;
  ld(R15, offset, src);   offset += 8;
  ld(R16, offset, src);   offset += 8;
  ld(R17, offset, src);   offset += 8;
  ld(R18, offset, src);   offset += 8;
  ld(R19, offset, src);   offset += 8;
  ld(R20, offset, src);   offset += 8;
  ld(R21, offset, src);   offset += 8;
  ld(R22, offset, src);   offset += 8;
  ld(R23, offset, src);   offset += 8;
  ld(R24, offset, src);   offset += 8;
  ld(R25, offset, src);   offset += 8;
  ld(R26, offset, src);   offset += 8;
  ld(R27, offset, src);   offset += 8;
  ld(R28, offset, src);   offset += 8;
  ld(R29, offset, src);   offset += 8;
  ld(R30, offset, src);   offset += 8;
  ld(R31, offset, src);   offset += 8;

  // FP registers
  lfd(F14, offset, src);  offset += 8;
  lfd(F15, offset, src);  offset += 8;
  lfd(F16, offset, src);  offset += 8;
  lfd(F17, offset, src);  offset += 8;
  lfd(F18, offset, src);  offset += 8;
  lfd(F19, offset, src);  offset += 8;
  lfd(F20, offset, src);  offset += 8;
  lfd(F21, offset, src);  offset += 8;
  lfd(F22, offset, src);  offset += 8;
  lfd(F23, offset, src);  offset += 8;
  lfd(F24, offset, src);  offset += 8;
  lfd(F25, offset, src);  offset += 8;
  lfd(F26, offset, src);  offset += 8;
  lfd(F27, offset, src);  offset += 8;
  lfd(F28, offset, src);  offset += 8;
  lfd(F29, offset, src);  offset += 8;
  lfd(F30, offset, src);  offset += 8;
  lfd(F31, offset, src);
}

// For verify_oops.
void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
  std(R2,  offset, dst);  offset += 8;
  std(R3,  offset, dst);  offset += 8;
  std(R4,  offset, dst);  offset += 8;
  std(R5,  offset, dst);  offset += 8;
  std(R6,  offset, dst);  offset += 8;
  std(R7,  offset, dst);  offset += 8;
  std(R8,  offset, dst);  offset += 8;
  std(R9,  offset, dst);  offset += 8;
  std(R10, offset, dst);  offset += 8;
  std(R11, offset, dst);  offset += 8;
  std(R12, offset, dst);
}

// For verify_oops.
void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
  ld(R2,  offset, src);   offset += 8;
  ld(R3,  offset, src);   offset += 8;
  ld(R4,  offset, src);   offset += 8;
  ld(R5,  offset, src);   offset += 8;
  ld(R6,  offset, src);   offset += 8;
  ld(R7,  offset, src);   offset += 8;
  ld(R8,  offset, src);   offset += 8;
  ld(R9,  offset, src);   offset += 8;
  ld(R10, offset, src);   offset += 8;
  ld(R11, offset, src);   offset += 8;
  ld(R12, offset, src);
}

void MacroAssembler::save_LR_CR(Register tmp) {
  mfcr(tmp);
  std(tmp, _abi(cr), R1_SP);
  mflr(tmp);
  std(tmp, _abi(lr), R1_SP);
  // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
}

void MacroAssembler::restore_LR_CR(Register tmp) {
  assert(tmp != R1_SP, "must be distinct");
  ld(tmp, _abi(lr), R1_SP);
  mtlr(tmp);
  ld(tmp, _abi(cr), R1_SP);
  mtcr(tmp);
}

// Get the current PC: the bl to the immediately following instruction
// deposits its address in LR, which mflr copies into result. Trashes LR.
address MacroAssembler::get_PC_trash_LR(Register result) {
  Label L;
  bl(L);
  bind(L);
  address lr_pc = pc();
  mflr(result);
  return lr_pc;
}

void MacroAssembler::resize_frame(Register offset, Register tmp) {
#ifdef ASSERT
  assert_different_registers(offset, tmp, R1_SP);
  andi_(tmp, offset, frame::alignment_in_bytes-1);
  asm_assert_eq("resize_frame: unaligned", 0x204);
#endif

  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdux(tmp, R1_SP, offset);
}

void MacroAssembler::resize_frame(int offset, Register tmp) {
  assert(is_simm(offset, 16), "too big an offset");
  assert_different_registers(tmp, R1_SP);
  assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdu(tmp, offset, R1_SP);
}

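// Resize the current frame so that the stack pointer becomes addr,
// keeping the back link intact.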
void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
  // (addr == tmp1) || (addr == tmp2) is allowed here!
  assert(tmp1 != tmp2, "must be distinct");

  // compute offset w.r.t. current stack pointer
  // tmp_1 <- addr - SP (!)
  subf(tmp1, R1_SP, addr);

  // atomically update SP keeping back link.
  resize_frame(tmp1/* offset */, tmp2/* tmp */);
}

void MacroAssembler::push_frame(Register bytes, Register tmp) {
#ifdef ASSERT
  assert(bytes != R0, "r0 not allowed here");
  andi_(R0, bytes, frame::alignment_in_bytes-1);
  asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
#endif
  neg(tmp, bytes);
  stdux(R1_SP, R1_SP, tmp);
}

// Push a frame of size `bytes'.
void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
  long offset = align_addr(bytes, frame::alignment_in_bytes);
  if (is_simm(-offset, 16)) {
    stdu(R1_SP, -offset, R1_SP);
  } else {
    load_const(tmp, -offset);
    stdux(R1_SP, R1_SP, tmp);
  }
}

// Push a frame of size `bytes' plus abi_reg_args on top.
void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size, tmp);
}

// Set up a new C frame with a spill area for non-volatile GPRs and
// additional space for local variables.
void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
                                                      Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
}

// Pop current C frame.
void MacroAssembler::pop_frame() {
  ld(R1_SP, _abi(callers_sp), R1_SP);
}

#if defined(ABI_ELFv2)
address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the times.
  if (R12 != r_function_entry) {
    mr(R12, r_function_entry);
  }
  mtctr(R12);
  // Do a call or a branch.
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C
// calling conventions. Updates and returns _last_calls_return_pc.
address MacroAssembler::call_c(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/true);
}

// For tail calls: only branch, don't link, so callee returns to caller of this function.
address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/false);
}

address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
  load_const(R12, function_entry, R0);
  return branch_to(R12, /*and_link=*/true);
}

#else
// Generic version of a call to C function via a function descriptor
// with variable support for C calling conventions (TOC, ENV, etc.).
// Updates and returns _last_calls_return_pc.
address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
                                  bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
  // we emit standard ptrgl glue code here
  assert((function_descriptor != R0), "function_descriptor cannot be R0");

  // retrieve necessary entries from the function descriptor
  ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
  mtctr(R0);

  if (load_toc_of_callee) {
    ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
  }
  if (load_env_of_callee) {
    ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
  } else if (load_toc_of_callee) {
    li(R11, 0);
  }

  // do a call or a branch
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C calling
// conventions.
// We don't use the TOC in generated code, so there is no need to save
// and restore its value.
address MacroAssembler::call_c(Register fd) {
  return branch_to(fd, /*and_link=*/true,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c_and_return_to_caller(Register fd) {
  return branch_to(fd, /*and_link=*/false,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
  if (rt != relocInfo::none) {
    // this call needs to be relocatable
    if (!ReoptimizeCallSequences
        || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
        || fd == NULL   // support code-size estimation
        || !fd->is_friend_function()
        || fd->entry() == NULL) {
      // it's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);

      bool has_env = (fd != NULL && fd->env() != NULL);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/has_env);
    } else {
      // It's a friend function. Load the entry point and don't care about
      // toc and env. Use an optimizable call instruction, but ensure the
      // same code-size as in the case of a non-friend function.
      nop();
      nop();
      nop();
      bl64_patchable(fd->entry(), rt);
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  } else {
    // This call does not need to be relocatable, do more aggressive
    // optimizations.
    if (!ReoptimizeCallSequences
        || !fd->is_friend_function()) {
      // It's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/true);
    } else {
      // it's a friend function, load the entry point and don't care about
      // toc and env.
      address dest = fd->entry();
      if (is_within_range_of_b(dest, pc())) {
        bl(dest);
      } else {
        bl64_patchable(dest, rt);
      }
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  }
}

// Call a C function. All constants needed reside in TOC.
//
// Read the address to call from the TOC.
// Read env from TOC, if fd specifies an env.
// Read new TOC from TOC.
address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
                                         relocInfo::relocType rt, Register toc) {
  if (!ReoptimizeCallSequences
      || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
      || !fd->is_friend_function()) {
    // It's not a friend function as defined by class FunctionDescriptor,
    // so do a full call-c here.
    assert(fd->entry() != NULL, "function must be linked");

    AddressLiteral fd_entry(fd->entry());
    load_const_from_method_toc(R11, fd_entry, toc);
    mtctr(R11);
    if (fd->env() == NULL) {
      li(R11, 0);
      nop();
    } else {
      AddressLiteral fd_env(fd->env());
      load_const_from_method_toc(R11, fd_env, toc);
    }
    AddressLiteral fd_toc(fd->toc());
    load_toc_from_toc(R2_TOC, fd_toc, toc);
    // R2_TOC is killed.
    bctrl();
    _last_calls_return_pc = pc();
  } else {
    // It's a friend function, load the entry point and don't care about
    // toc and env. Use an optimizable call instruction, but ensure the
    // same code-size as in the case of a non-friend function.
    nop();
    bl64_patchable(fd->entry(), rt);
    _last_calls_return_pc = pc();
  }
  return _last_calls_return_pc;
}
#endif // ABI_ELFv2

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register last_java_sp,
                                  address  entry_point,
                                  bool     check_exceptions) {
  BLOCK_COMMENT("call_VM {");
  // Determine last_java_sp register.
  if (!last_java_sp->is_valid()) {
    last_java_sp = R1_SP;
  }
  set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);

  // ARG1 must hold thread address.
  mr(R3_ARG1, R16_thread);
#if defined(ABI_ELFv2)
  address return_pc = call_c(entry_point, relocInfo::none);
#else
  address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
#endif

  reset_last_Java_frame();

  // Check for pending exceptions.
  if (check_exceptions) {
    // We don't check for exceptions here.
    ShouldNotReachHere();
  }

  // Get oop result if there is one and reset the value in the thread.
  if (oop_result->is_valid()) {
    get_vm_result(oop_result);
  }

  _last_calls_return_pc = return_pc;
  BLOCK_COMMENT("} call_VM");
}

void MacroAssembler::call_VM_leaf_base(address entry_point) {
  BLOCK_COMMENT("call_VM_leaf {");
#if defined(ABI_ELFv2)
  call_c(entry_point, relocInfo::none);
#else
  call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
#endif
  BLOCK_COMMENT("} call_VM_leaf");
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
  call_VM_base(oop_result, noreg, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread.
  mr_if_needed(R4_ARG2, arg_1);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  mr_if_needed(R6_ARG4, arg_3);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM_leaf(address entry_point) {
  call_VM_leaf_base(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
  mr_if_needed(R3_ARG1, arg_1);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_3);
  call_VM_leaf(entry_point);
}

// Check whether instruction is a read access to the polling page
// which was emitted by load_from_polling_page(..).
bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
                                               address* polling_address_ptr) {
  if (!is_ld(instruction))
    return false; // It's not a ld. Fail.

  int rt = inv_rt_field(instruction);
  int ra = inv_ra_field(instruction);
  int ds = inv_ds_field(instruction);
  if (!(ds == 0 && ra != 0 && rt == 0)) {
    return false; // It's not a ld(r0, X, ra). Fail.
  }

  if (!ucontext) {
    // Set polling address.
    if (polling_address_ptr != NULL) {
      *polling_address_ptr = NULL;
    }
    return true; // No ucontext given. Can't check value of ra. Assume true.
  }

#ifdef LINUX
  // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
  ucontext_t* uc = (ucontext_t*) ucontext;
  // Set polling address.
  address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
  if (polling_address_ptr != NULL) {
    *polling_address_ptr = addr;
  }
  return os::is_poll_address(addr);
#else
  // Not on Linux, ucontext must be NULL.
  ShouldNotReachHere();
  return false;
#endif
}

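// Check whether the instruction is a write access to the memory
// serialization page (stw/stwu/stwx/stwux), resolving register contents
// via the ucontext.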
bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;

  if (is_stwx(instruction) || is_stwux(instruction)) {
    int ra = inv_ra_field(instruction);
    int rb = inv_rb_field(instruction);

    // look up content of ra and rb in ucontext
    address ra_val = (address)uc->uc_mcontext.regs->gpr[ra];
    long    rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    return os::is_memory_serialize_page(thread, ra_val + rb_val);
  } else if (is_stw(instruction) || is_stwu(instruction)) {
    int ra = inv_ra_field(instruction);
    int d1 = inv_d1_field(instruction);

    // look up content of ra in ucontext
    address ra_val = (address)uc->uc_mcontext.regs->gpr[ra];
    return os::is_memory_serialize_page(thread, ra_val + d1);
  } else {
    return false;
  }
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return false;
#endif
}

void MacroAssembler::bang_stack_with_offset(int offset) {
  // When increasing the stack, the old stack pointer will be written
  // to the new top of stack according to the PPC64 abi.
  // Therefore, stack banging is not necessary when increasing
  // the stack by <= os::vm_page_size() bytes.
  // When increasing the stack by a larger amount, this method is
  // called repeatedly to bang the intermediate pages.

  // Stack grows down, caller passes positive offset.
  assert(offset > 0, "must bang with positive offset");

  long stdoffset = -offset;

  if (is_simm(stdoffset, 16)) {
    // Signed 16 bit offset, a simple std is ok.
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, (int)(signed short)stdoffset, R1_SP);
    } else {
      std(R0, (int)(signed short)stdoffset, R1_SP);
    }
  } else if (is_simm(stdoffset, 31)) {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);

    Register tmp = R11;
    addis(tmp, R1_SP, hi);
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, lo, tmp);
    } else {
      std(R0, lo, tmp);
    }
  } else {
    ShouldNotReachHere();
  }
}

// If instruction is a stack bang of the form
//    std    R0,    x(Ry),       (see bang_stack_with_offset())
//    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
// or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
// return the banged address. Otherwise, return 0.
address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;
  int rs = inv_rs_field(instruction);
  int ra = inv_ra_field(instruction);
  if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
      || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
      || (is_stdu(instruction) && rs == 1)) {
    int ds = inv_ds_field(instruction);
    // return banged address
    return ds+(address)uc->uc_mcontext.regs->gpr[ra];
  } else if (is_stdux(instruction) && rs == 1) {
    int rb = inv_rb_field(instruction);
    address sp = (address)uc->uc_mcontext.regs->gpr[1];
    long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
                                  : sp + rb_val; // banged address
  }
  return NULL; // not a stack bang
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return NULL;
#endif
}

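// Performs an atomic compare-and-exchange of a 32-bit word via a
// lwarx/stwcx_ emulation loop; see cmpxchgd below for the register and
// flag contract. int_flag_success, if given, is set to 1 on success and
// 0 on failure.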
// CmpxchgX sets condition register to cmpX(current, compare).
void MacroAssembler::cmpxchgw(ConditionRegister flag, Register dest_current_value,
                              Register compare_value, Register exchange_value,
                              Register addr_base, int semantics, bool cmpxchgx_hint,
                              Register int_flag_success, bool contention_hint) {
  Label retry;
  Label failed;
  Label done;

  // Save one branch if result is returned via register and
  // result register is different from the other ones.
  bool use_result_reg    = (int_flag_success != noreg);
  bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
                            int_flag_success != exchange_value && int_flag_success != addr_base);

  // release/fence semantics
  if (semantics & MemBarRel) {
    release();
  }

  if (use_result_reg && preset_result_reg) {
    li(int_flag_success, 0); // preset (assume cas failed)
  }

  // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
  if (contention_hint) { // Don't try to reserve if cmp fails.
    lwz(dest_current_value, 0, addr_base);
    cmpw(flag, dest_current_value, compare_value);
    bne(flag, failed);
  }

  // atomic emulation loop
  bind(retry);

  lwarx(dest_current_value, addr_base, cmpxchgx_hint);
  cmpw(flag, dest_current_value, compare_value);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(flag, failed);
  } else {
    bne(                  flag, failed);
  }
  // branch to done  => (flag == ne), (dest_current_value != compare_value)
  // fall through    => (flag == eq), (dest_current_value == compare_value)

  stwcx_(exchange_value, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
  // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)

  // Result in register (must do this at the end because int_flag_success can be the
  // same register as one above).
  if (use_result_reg) {
    li(int_flag_success, 1);
  }

  if (semantics & MemBarFenceAfter) {
    fence();
  } else if (semantics & MemBarAcq) {
    isync();
  }

  if (use_result_reg && !preset_result_reg) {
    b(done);
  }

  bind(failed);
  if (use_result_reg && !preset_result_reg) {
    li(int_flag_success, 0);
  }

  bind(done);
  // (flag == ne) => (dest_current_value != compare_value), (!swapped)
  // (flag == eq) => (dest_current_value == compare_value), ( swapped)
}

// Performs atomic compare exchange:
//   if (compare_value == *addr_base)
//     *addr_base = exchange_value
//     int_flag_success = 1;
//   else
//     int_flag_success = 0;
//
// ConditionRegister flag       = cmp(compare_value, *addr_base)
// Register dest_current_value  = *addr_base
// Register compare_value       Used to compare with value in memory
// Register exchange_value      Written to memory if compare_value == *addr_base
// Register addr_base           The memory location to compareXChange
// Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
//
// To avoid the costly compare exchange the value is tested beforehand.
// Several special cases exist to avoid generating unnecessary code.
//
void MacroAssembler::cmpxchgd(ConditionRegister flag,
                              Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
                              Register addr_base, int semantics, bool cmpxchgx_hint,
                              Register int_flag_success, Label* failed_ext, bool contention_hint) {
  Label retry;
  Label failed_int;
  Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
  Label done;

  // Save one branch if result is returned via register and result register is different from the other ones.
  bool use_result_reg    = (int_flag_success != noreg);
  bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value.register_or_noreg() &&
                            int_flag_success != exchange_value && int_flag_success != addr_base);
  assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");

  // release/fence semantics
  if (semantics & MemBarRel) {
    release();
  }

  if (use_result_reg && preset_result_reg) {
    li(int_flag_success, 0); // preset (assume cas failed)
  }

  // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
  if (contention_hint) { // Don't try to reserve if cmp fails.
    ld(dest_current_value, 0, addr_base);
    cmpd(flag, compare_value, dest_current_value);
    bne(flag, failed);
  }

  // atomic emulation loop
  bind(retry);

  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  cmpd(flag, compare_value, dest_current_value);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(flag, failed);
  } else {
    bne(                  flag, failed);
  }

  stdcx_(exchange_value, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
  } else {
    bne(                  CCR0, retry); // stXcx_ sets CCR0
  }

  // result in register (must do this at the end because int_flag_success can be the same register as one above)
  if (use_result_reg) {
    li(int_flag_success, 1);
  }

  // POWER6 doesn't need isync in CAS.
  // Always emit isync to be on the safe side.
  if (semantics & MemBarFenceAfter) {
    fence();
  } else if (semantics & MemBarAcq) {
    isync();
  }

  if (use_result_reg && !preset_result_reg) {
    b(done);
  }

  bind(failed_int);
  if (use_result_reg && !preset_result_reg) {
    li(int_flag_success, 0);
  }

  bind(done);
  // (flag == ne) => (dest_current_value != compare_value), (!swapped)
  // (flag == eq) => (dest_current_value == compare_value), ( swapped)
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Register sethi_temp,
                                             Label& L_no_such_interface) {
  assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable).
  int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int log_vte_size= exact_log2(vtableEntry::size() * wordSize);

  lwz(scan_temp, InstanceKlass::vtable_length_offset() * wordSize, recv_klass);
  // %%% We should store the aligned, prescaled offset in the klassoop.
  // Then the next several instructions would fold away.

  sldi(scan_temp, scan_temp, log_vte_size);
  addi(scan_temp, scan_temp, vtable_base);
  add(scan_temp, recv_klass, scan_temp);

  // Adjust recv_klass by scaled itable_index, so we can free itable_index.
  if (itable_index.is_register()) {
    Register itable_offset = itable_index.as_register();
    sldi(itable_offset, itable_offset, logMEsize);
    if (itentry_off) addi(itable_offset, itable_offset, itentry_off);
    add(recv_klass, itable_offset, recv_klass);
  } else {
    long itable_offset = (long)itable_index.as_constant();
    load_const_optimized(sethi_temp, (itable_offset<<logMEsize)+itentry_off); // static address, no relocation
    add(recv_klass, sethi_temp, recv_klass);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  for (int peel = 1; peel >= 0; peel--) {
    // %%%% Could load both offset and interface in one ldx, if they were
    // in the opposite order. This would save a load.
    ld(method_result, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);

    // Check that this entry is non-null. A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cmpd(CCR0, method_result, intf_klass);

    if (peel) {
      beq(CCR0, found_method);
    } else {
      bne(CCR0, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel) break;

    bind(search);

    cmpdi(CCR0, method_result, 0);
    beq(CCR0, L_no_such_interface);
    addi(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
  lwz(scan_temp, ito_offset, scan_temp);
  ldx(method_result, scan_temp, recv_klass);
}

1592 cmpd(CCR0, method_result, intf_klass); 1593 1594 if (peel) { 1595 beq(CCR0, found_method); 1596 } else { 1597 bne(CCR0, search); 1598 // (invert the test to fall through to found_method...) 1599 } 1600 1601 if (!peel) break; 1602 1603 bind(search); 1604 1605 cmpdi(CCR0, method_result, 0); 1606 beq(CCR0, L_no_such_interface); 1607 addi(scan_temp, scan_temp, scan_step); 1608 } 1609 1610 bind(found_method); 1611 1612 // Got a hit. 1613 int ito_offset = itableOffsetEntry::offset_offset_in_bytes(); 1614 lwz(scan_temp, ito_offset, scan_temp); 1615 ldx(method_result, scan_temp, recv_klass); 1616 } 1617 1618 // virtual method calling 1619 void MacroAssembler::lookup_virtual_method(Register recv_klass, 1620 RegisterOrConstant vtable_index, 1621 Register method_result) { 1622 1623 assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg()); 1624 1625 const int base = InstanceKlass::vtable_start_offset() * wordSize; 1626 assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 1627 1628 if (vtable_index.is_register()) { 1629 sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord); 1630 add(recv_klass, vtable_index.as_register(), recv_klass); 1631 } else { 1632 addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord); 1633 } 1634 ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass); 1635 } 1636 1637 /////////////////////////////////////////// subtype checking //////////////////////////////////////////// 1638 1639 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 1640 Register super_klass, 1641 Register temp1_reg, 1642 Register temp2_reg, 1643 Label& L_success, 1644 Label& L_failure) { 1645 1646 const Register check_cache_offset = temp1_reg; 1647 const Register cached_super = temp2_reg; 1648 1649 assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super); 1650 1651 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1652 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1653 1654 // If the pointers are equal, we are done (e.g., String[] elements). 1655 // This self-check enables sharing of secondary supertype arrays among 1656 // non-primary types such as array-of-interface. Otherwise, each such 1657 // type would need its own customized SSA. 1658 // We move this check to the front of the fast path because many 1659 // type checks are in fact trivially successful in this manner, 1660 // so we get a nicely predicted branch right at the start of the check. 1661 cmpd(CCR0, sub_klass, super_klass); 1662 beq(CCR0, L_success); 1663 1664 // Check the supertype display: 1665 lwz(check_cache_offset, sco_offset, super_klass); 1666 // The loaded value is the offset from KlassOopDesc. 1667 1668 ldx(cached_super, check_cache_offset, sub_klass); 1669 cmpd(CCR0, cached_super, super_klass); 1670 beq(CCR0, L_success); 1671 1672 // This check has worked decisively for primary supers. 1673 // Secondary supers are sought in the super_cache ('super_cache_addr'). 1674 // (Secondary supers are interfaces and very deeply nested subtypes.) 1675 // This works in the same check above because of a tricky aliasing 1676 // between the super_cache and the primary super display elements. 1677 // (The 'super_check_addr' can address either, as the case requires.) 1678 // Note that the cache is updated below if it does not help us find 1679 // what we need immediately. 1680 // So if it was a primary super, we can just fail immediately. 
1681 // Otherwise, it's the slow path for us (no success at this point). 1682 1683 cmpwi(CCR0, check_cache_offset, sc_offset); 1684 bne(CCR0, L_failure); 1685 // bind(slow_path); // fallthru 1686 } 1687 1688 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 1689 Register super_klass, 1690 Register temp1_reg, 1691 Register temp2_reg, 1692 Label* L_success, 1693 Register result_reg) { 1694 const Register array_ptr = temp1_reg; // current value from cache array 1695 const Register temp = temp2_reg; 1696 1697 assert_different_registers(sub_klass, super_klass, array_ptr, temp); 1698 1699 int source_offset = in_bytes(Klass::secondary_supers_offset()); 1700 int target_offset = in_bytes(Klass::secondary_super_cache_offset()); 1701 1702 int length_offset = Array<Klass*>::length_offset_in_bytes(); 1703 int base_offset = Array<Klass*>::base_offset_in_bytes(); 1704 1705 Label hit, loop, failure, fallthru; 1706 1707 ld(array_ptr, source_offset, sub_klass); 1708 1709 //assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated."); 1710 lwz(temp, length_offset, array_ptr); 1711 cmpwi(CCR0, temp, 0); 1712 beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0 1713 1714 mtctr(temp); // load ctr 1715 1716 bind(loop); 1717 // Oops in table are NO MORE compressed. 1718 ld(temp, base_offset, array_ptr); 1719 cmpd(CCR0, temp, super_klass); 1720 beq(CCR0, hit); 1721 addi(array_ptr, array_ptr, BytesPerWord); 1722 bdnz(loop); 1723 1724 bind(failure); 1725 if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss) 1726 b(fallthru); 1727 1728 bind(hit); 1729 std(super_klass, target_offset, sub_klass); // save result to cache 1730 if (result_reg != noreg) li(result_reg, 0); // load zero result (indicates a hit) 1731 if (L_success != NULL) b(*L_success); 1732 1733 bind(fallthru); 1734 } 1735 1736 // Try fast path, then go to slow one if not successful 1737 void MacroAssembler::check_klass_subtype(Register sub_klass, 1738 Register super_klass, 1739 Register temp1_reg, 1740 Register temp2_reg, 1741 Label& L_success) { 1742 Label L_failure; 1743 check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, L_success, L_failure); 1744 check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success); 1745 bind(L_failure); // Fallthru if not successful. 1746 } 1747 1748 void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg, 1749 Register temp_reg, 1750 Label& wrong_method_type) { 1751 assert_different_registers(mtype_reg, mh_reg, temp_reg); 1752 // Compare method type against that of the receiver. 1753 load_heap_oop_not_null(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg); 1754 cmpd(CCR0, temp_reg, mtype_reg); 1755 bne(CCR0, wrong_method_type); 1756 } 1757 1758 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot, 1759 Register temp_reg, 1760 int extra_slot_offset) { 1761 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 
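// A sketch of the computation (not the emitted code):
//   offset = (arg_slot + extra_slot_offset) * Interpreter::stackElementSize
// If arg_slot is a constant the result is returned as a constant; otherwise
// it is computed into temp_reg.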
1762 int stackElementSize = Interpreter::stackElementSize; 1763 int offset = extra_slot_offset * stackElementSize; 1764 if (arg_slot.is_constant()) { 1765 offset += arg_slot.as_constant() * stackElementSize; 1766 return offset; 1767 } else { 1768 assert(temp_reg != noreg, "must specify"); 1769 sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize)); 1770 if (offset != 0) 1771 addi(temp_reg, temp_reg, offset); 1772 return temp_reg; 1773 } 1774 } 1775 1776 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg, 1777 Register mark_reg, Register temp_reg, 1778 Register temp2_reg, Label& done, Label* slow_case) { 1779 assert(UseBiasedLocking, "why call this otherwise?"); 1780 1781 #ifdef ASSERT 1782 assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg); 1783 #endif 1784 1785 Label cas_label; 1786 1787 // Branch to done if fast path fails and no slow_case provided. 1788 Label *slow_case_int = (slow_case != NULL) ? slow_case : &done; 1789 1790 // Biased locking 1791 // See whether the lock is currently biased toward our thread and 1792 // whether the epoch is still valid 1793 // Note that the runtime guarantees sufficient alignment of JavaThread 1794 // pointers to allow age to be placed into low bits 1795 assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, 1796 "biased locking makes assumptions about bit layout"); 1797 1798 if (PrintBiasedLockingStatistics) { 1799 load_const(temp_reg, (address) BiasedLocking::total_entry_count_addr(), temp2_reg); 1800 lwz(temp2_reg, 0, temp_reg); 1801 addi(temp2_reg, temp2_reg, 1); 1802 stw(temp2_reg, 0, temp_reg); 1803 } 1804 1805 andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place); 1806 cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern); 1807 bne(cr_reg, cas_label); 1808 1809 load_klass(temp_reg, obj_reg); 1810 1811 load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place)); 1812 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 1813 orr(temp_reg, R16_thread, temp_reg); 1814 xorr(temp_reg, mark_reg, temp_reg); 1815 andr(temp_reg, temp_reg, temp2_reg); 1816 cmpdi(cr_reg, temp_reg, 0); 1817 if (PrintBiasedLockingStatistics) { 1818 Label l; 1819 bne(cr_reg, l); 1820 load_const(mark_reg, (address) BiasedLocking::biased_lock_entry_count_addr()); 1821 lwz(temp2_reg, 0, mark_reg); 1822 addi(temp2_reg, temp2_reg, 1); 1823 stw(temp2_reg, 0, mark_reg); 1824 // restore mark_reg 1825 ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg); 1826 bind(l); 1827 } 1828 beq(cr_reg, done); 1829 1830 Label try_revoke_bias; 1831 Label try_rebias; 1832 1833 // At this point we know that the header has the bias pattern and 1834 // that we are not the bias owner in the current epoch. We need to 1835 // figure out more details about the state of the header in order to 1836 // know what operations can be legally performed on the object's 1837 // header. 1838 1839 // If the low three bits in the xor result aren't clear, that means 1840 // the prototype header is no longer biased and we have to revoke 1841 // the bias on this object. 1842 andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 1843 cmpwi(cr_reg, temp2_reg, 0); 1844 bne(cr_reg, try_revoke_bias); 1845 1846 // Biasing is still enabled for this data type. See whether the 1847 // epoch of the current bias is still valid, meaning that the epoch 1848 // bits of the mark word are equal to the epoch bits of the 1849 // prototype header. 
(Note that the prototype header's epoch bits 1850 // only change at a safepoint.) If not, attempt to rebias the object 1851 // toward the current thread. Note that we must be absolutely sure 1852 // that the current epoch is invalid in order to do this because 1853 // otherwise the manipulations it performs on the mark word are 1854 // illegal. 1855 1856 int shift_amount = 64 - markOopDesc::epoch_shift; 1857 // rotate epoch bits to right (little) end and set other bits to 0 1858 // [ big part | epoch | little part ] -> [ 0..0 | epoch ] 1859 rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits); 1860 // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented 1861 bne(CCR0, try_rebias); 1862 1863 // The epoch of the current bias is still valid but we know nothing 1864 // about the owner; it might be set or it might be clear. Try to 1865 // acquire the bias of the object using an atomic operation. If this 1866 // fails we will go in to the runtime to revoke the object's bias. 1867 // Note that we first construct the presumed unbiased header so we 1868 // don't accidentally blow away another thread's valid bias. 1869 andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place | 1870 markOopDesc::age_mask_in_place | 1871 markOopDesc::epoch_mask_in_place)); 1872 orr(temp_reg, R16_thread, mark_reg); 1873 1874 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 1875 1876 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg). 1877 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 1878 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 1879 /*where=*/obj_reg, 1880 MacroAssembler::MemBarAcq, 1881 MacroAssembler::cmpxchgx_hint_acquire_lock(), 1882 noreg, slow_case_int); // bail out if failed 1883 1884 // If the biasing toward our thread failed, this means that 1885 // another thread succeeded in biasing it toward itself and we 1886 // need to revoke that bias. The revocation will occur in the 1887 // interpreter runtime in the slow case. 1888 if (PrintBiasedLockingStatistics) { 1889 load_const(temp_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp2_reg); 1890 lwz(temp2_reg, 0, temp_reg); 1891 addi(temp2_reg, temp2_reg, 1); 1892 stw(temp2_reg, 0, temp_reg); 1893 } 1894 b(done); 1895 1896 bind(try_rebias); 1897 // At this point we know the epoch has expired, meaning that the 1898 // current "bias owner", if any, is actually invalid. Under these 1899 // circumstances _only_, we are allowed to use the current header's 1900 // value as the comparison value when doing the cas to acquire the 1901 // bias in the current epoch. In other words, we allow transfer of 1902 // the bias from one thread to another directly in this situation. 1903 andi(temp_reg, mark_reg, markOopDesc::age_mask_in_place); 1904 orr(temp_reg, R16_thread, temp_reg); 1905 load_klass(temp2_reg, obj_reg); 1906 ld(temp2_reg, in_bytes(Klass::prototype_header_offset()), temp2_reg); 1907 orr(temp_reg, temp_reg, temp2_reg); 1908 1909 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 1910 1911 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg). 
1912 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 1913 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 1914 /*where=*/obj_reg, 1915 MacroAssembler::MemBarAcq, 1916 MacroAssembler::cmpxchgx_hint_acquire_lock(), 1917 noreg, slow_case_int); // bail out if failed 1918 1919 // If the biasing toward our thread failed, this means that 1920 // another thread succeeded in biasing it toward itself and we 1921 // need to revoke that bias. The revocation will occur in the 1922 // interpreter runtime in the slow case. 1923 if (PrintBiasedLockingStatistics) { 1924 load_const(temp_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp2_reg); 1925 lwz(temp2_reg, 0, temp_reg); 1926 addi(temp2_reg, temp2_reg, 1); 1927 stw(temp2_reg, 0, temp_reg); 1928 } 1929 b(done); 1930 1931 bind(try_revoke_bias); 1932 // The prototype mark in the klass doesn't have the bias bit set any 1933 // more, indicating that objects of this data type are not supposed 1934 // to be biased any more. We are going to try to reset the mark of 1935 // this object to the prototype value and fall through to the 1936 // CAS-based locking scheme. Note that if our CAS fails, it means 1937 // that another thread raced us for the privilege of revoking the 1938 // bias of this particular object, so it's okay to continue in the 1939 // normal locking code. 1940 load_klass(temp_reg, obj_reg); 1941 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 1942 andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place); 1943 orr(temp_reg, temp_reg, temp2_reg); 1944 1945 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 1946 1947 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg). 1948 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 1949 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 1950 /*where=*/obj_reg, 1951 MacroAssembler::MemBarAcq, 1952 MacroAssembler::cmpxchgx_hint_acquire_lock()); 1953 1954 // reload markOop in mark_reg before continuing with lightweight locking 1955 ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg); 1956 1957 // Fall through to the normal CAS-based lock, because no matter what 1958 // the result of the above CAS, some thread must have succeeded in 1959 // removing the bias bit from the object's header. 1960 if (PrintBiasedLockingStatistics) { 1961 Label l; 1962 bne(cr_reg, l); 1963 load_const(temp_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp2_reg); 1964 lwz(temp2_reg, 0, temp_reg); 1965 addi(temp2_reg, temp2_reg, 1); 1966 stw(temp2_reg, 0, temp_reg); 1967 bind(l); 1968 } 1969 1970 bind(cas_label); 1971 } 1972 1973 void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) { 1974 // Check for biased locking unlock case, which is a no-op 1975 // Note: we do not have to check the thread ID for two reasons. 1976 // First, the interpreter checks for IllegalMonitorStateException at 1977 // a higher level. Second, if the bias was revoked while we held the 1978 // lock, the object could not be rebiased toward another thread, so 1979 // the bias bit would be clear. 1980 1981 ld(temp_reg, 0, mark_addr); 1982 andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 1983 1984 cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern); 1985 beq(cr_reg, done); 1986 } 1987 1988 // TM on PPC64. 
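// The next two helpers emit classic load-reserve/store-conditional loops.
// Conceptually (a sketch, not the emitted instruction sequence):
//   do { tmp = *addr; tmp = op(tmp); } while (!store_conditional(addr, tmp));
// with op being '+ simm16' for atomic_inc_ptr and '| uimm16' for
// atomic_ori_int; 'result' ends up holding the newly stored value.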
1989 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) { 1990 Label retry; 1991 bind(retry); 1992 ldarx(result, addr, /*hint*/ false); 1993 addi(result, result, simm16); 1994 stdcx_(result, addr); 1995 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1996 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 1997 } else { 1998 bne( CCR0, retry); // stXcx_ sets CCR0 1999 } 2000 } 2001 2002 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) { 2003 Label retry; 2004 bind(retry); 2005 lwarx(result, addr, /*hint*/ false); 2006 ori(result, result, uimm16); 2007 stwcx_(result, addr); 2008 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2009 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2010 } else { 2011 bne( CCR0, retry); // stXcx_ sets CCR0 2012 } 2013 } 2014 2015 #if INCLUDE_RTM_OPT 2016 2017 // Update rtm_counters based on abort status 2018 // input: abort_status 2019 // rtm_counters (RTMLockingCounters*) 2020 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) { 2021 // Mapping to keep PreciseRTMLockingStatistics similar to x86. 2022 // x86 ppc (! means inverted, ? means not the same) 2023 // 0 31 Set if abort caused by XABORT instruction. 2024 // 1 ! 7 If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set. 2025 // 2 13 Set if another logical processor conflicted with a memory address that was part of the transaction that aborted. 2026 // 3 10 Set if an internal buffer overflowed. 2027 // 4 ?12 Set if a debug breakpoint was hit. 2028 // 5 ?32 Set if an abort occurred during execution of a nested transaction. 2029 const int tm_failure_bit[] = {Assembler::tm_tabort, // Note: Seems like signal handler sets this, too. 2030 Assembler::tm_failure_persistent, // inverted: transient 2031 Assembler::tm_trans_cf, 2032 Assembler::tm_footprint_of, 2033 Assembler::tm_non_trans_cf, 2034 Assembler::tm_suspended}; 2035 const bool tm_failure_inv[] = {false, true, false, false, false, false}; 2036 assert(sizeof(tm_failure_bit)/sizeof(int) == RTMLockingCounters::ABORT_STATUS_LIMIT, "adapt mapping!"); 2037 2038 const Register addr_Reg = R0; 2039 // Keep track of offset to where rtm_counters_Reg had pointed to. 
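// addr_Reg walks across the counter array below; the accumulated displacement
// is subtracted again at the end to restore rtm_counters_Reg.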
2040 int counters_offs = RTMLockingCounters::abort_count_offset(); 2041 addi(addr_Reg, rtm_counters_Reg, counters_offs); 2042 const Register temp_Reg = rtm_counters_Reg; 2043 2044 //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically 2045 ldx(temp_Reg, addr_Reg); 2046 addi(temp_Reg, temp_Reg, 1); 2047 stdx(temp_Reg, addr_Reg); 2048 2049 if (PrintPreciseRTMLockingStatistics) { 2050 int counters_offs_delta = RTMLockingCounters::abortX_count_offset() - counters_offs; 2051 2052 //mftexasr(abort_status); done by caller 2053 for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) { 2054 counters_offs += counters_offs_delta; 2055 li(temp_Reg, counters_offs_delta); // can't use addi with R0 2056 add(addr_Reg, addr_Reg, temp_Reg); // point to next counter 2057 counters_offs_delta = sizeof(uintx); 2058 2059 Label check_abort; 2060 rldicr_(temp_Reg, abort_status, tm_failure_bit[i], 0); 2061 if (tm_failure_inv[i]) { 2062 bne(CCR0, check_abort); 2063 } else { 2064 beq(CCR0, check_abort); 2065 } 2066 //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically 2067 ldx(temp_Reg, addr_Reg); 2068 addi(temp_Reg, temp_Reg, 1); 2069 stdx(temp_Reg, addr_Reg); 2070 bind(check_abort); 2071 } 2072 } 2073 li(temp_Reg, -counters_offs); // can't use addi with R0 2074 add(rtm_counters_Reg, addr_Reg, temp_Reg); // restore 2075 } 2076 2077 // Branch if (random & (count-1) != 0), count is 2^n 2078 // tmp and CR0 are killed 2079 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) { 2080 mftb(tmp); 2081 andi_(tmp, tmp, count-1); 2082 bne(CCR0, brLabel); 2083 } 2084 2085 // Perform abort ratio calculation, set no_rtm bit if high ratio. 2086 // input: rtm_counters_Reg (RTMLockingCounters* address) - KILLED 2087 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg, 2088 RTMLockingCounters* rtm_counters, 2089 Metadata* method_data) { 2090 Label L_done, L_check_always_rtm1, L_check_always_rtm2; 2091 2092 if (RTMLockingCalculationDelay > 0) { 2093 // Delay calculation. 2094 ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr()); 2095 cmpdi(CCR0, rtm_counters_Reg, 0); 2096 beq(CCR0, L_done); 2097 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2098 } 2099 // Abort ratio calculation only if abort_count > RTMAbortThreshold. 2100 // Aborted transactions = abort_count * 100 2101 // All transactions = total_count * RTMTotalCountIncrRate 2102 // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio) 2103 ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg); 2104 cmpdi(CCR0, R0, RTMAbortThreshold); 2105 blt(CCR0, L_check_always_rtm2); 2106 mulli(R0, R0, 100); 2107 2108 const Register tmpReg = rtm_counters_Reg; 2109 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2110 mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); 2111 mulli(tmpReg, tmpReg, RTMAbortRatio); 2112 cmpd(CCR0, R0, tmpReg); 2113 blt(CCR0, L_check_always_rtm1); // jump to reload 2114 if (method_data != NULL) { 2115 // Set rtm_state to "no rtm" in MDO. 2116 // Not using a metadata relocation. Method and Class Loader are kept alive anyway. 2117 // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.) 
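// Illustrative numbers only: with RTMAbortRatio == 50 this path is taken once
// abort_count * 100 >= total_count * RTMTotalCountIncrRate * 50, i.e. once at
// least half of the (estimated) total transactions have aborted.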
2118 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2119 atomic_ori_int(R0, tmpReg, NoRTM); 2120 } 2121 b(L_done); 2122 2123 bind(L_check_always_rtm1); 2124 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2125 bind(L_check_always_rtm2); 2126 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2127 cmpdi(CCR0, tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate); 2128 blt(CCR0, L_done); 2129 if (method_data != NULL) { 2130 // Set rtm_state to "always rtm" in MDO. 2131 // Not using a metadata relocation. See above. 2132 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2133 atomic_ori_int(R0, tmpReg, UseRTM); 2134 } 2135 bind(L_done); 2136 } 2137 2138 // Update counters and perform abort ratio calculation. 2139 // input: abort_status_Reg 2140 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg, 2141 RTMLockingCounters* rtm_counters, 2142 Metadata* method_data, 2143 bool profile_rtm) { 2144 2145 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2146 // Update rtm counters based on state at abort. 2147 // Reads abort_status_Reg, updates flags. 2148 assert_different_registers(abort_status_Reg, temp_Reg); 2149 load_const_optimized(temp_Reg, (address)rtm_counters, R0); 2150 rtm_counters_update(abort_status_Reg, temp_Reg); 2151 if (profile_rtm) { 2152 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2153 rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data); 2154 } 2155 } 2156 2157 // Retry on abort if abort's status indicates non-persistent failure. 2158 // inputs: retry_count_Reg 2159 // : abort_status_Reg 2160 // output: retry_count_Reg decremented by 1 2161 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, 2162 Label& retryLabel, Label* checkRetry) { 2163 Label doneRetry; 2164 rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0); 2165 bne(CCR0, doneRetry); 2166 if (checkRetry) { bind(*checkRetry); } 2167 addic_(retry_count_Reg, retry_count_Reg, -1); 2168 blt(CCR0, doneRetry); 2169 smt_yield(); // Can't use wait(). No permission (SIGILL). 2170 b(retryLabel); 2171 bind(doneRetry); 2172 } 2173 2174 // Spin and retry if lock is busy. 2175 // inputs: box_Reg (monitor address) 2176 // : retry_count_Reg 2177 // output: retry_count_Reg decremented by 1 2178 // CTR is killed 2179 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) { 2180 Label SpinLoop, doneRetry; 2181 addic_(retry_count_Reg, retry_count_Reg, -1); 2182 blt(CCR0, doneRetry); 2183 li(R0, RTMSpinLoopCount); 2184 mtctr(R0); 2185 2186 bind(SpinLoop); 2187 smt_yield(); // Can't use waitrsv(). No permission (SIGILL). 2188 bdz(retryLabel); 2189 ld(R0, 0, owner_addr_Reg); 2190 cmpdi(CCR0, R0, 0); 2191 bne(CCR0, SpinLoop); 2192 b(retryLabel); 2193 2194 bind(doneRetry); 2195 } 2196 2197 // Use RTM for normal stack locks. 
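// Rough control flow (a sketch): begin a transaction; if the mark word reads
// 'unlocked' inside the transaction, the lock is elided and we fall through
// to DONE_LABEL; inflated monitors branch to IsInflated; on abort we profile
// and, for transient failures, retry up to RTMRetryCount times.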
2198 // Input: objReg (object to lock) 2199 void MacroAssembler::rtm_stack_locking(ConditionRegister flag, 2200 Register obj, Register mark_word, Register tmp, 2201 Register retry_on_abort_count_Reg, 2202 RTMLockingCounters* stack_rtm_counters, 2203 Metadata* method_data, bool profile_rtm, 2204 Label& DONE_LABEL, Label& IsInflated) { 2205 assert(UseRTMForStackLocks, "why call this otherwise?"); 2206 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); 2207 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2208 2209 if (RTMRetryCount > 0) { 2210 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 2211 bind(L_rtm_retry); 2212 } 2213 andi_(R0, mark_word, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased 2214 bne(CCR0, IsInflated); 2215 2216 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2217 Label L_noincrement; 2218 if (RTMTotalCountIncrRate > 1) { 2219 branch_on_random_using_tb(tmp, (int)RTMTotalCountIncrRate, L_noincrement); 2220 } 2221 assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM"); 2222 load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0); 2223 //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically 2224 ldx(mark_word, tmp); 2225 addi(mark_word, mark_word, 1); 2226 stdx(mark_word, tmp); 2227 bind(L_noincrement); 2228 } 2229 tbegin_(); 2230 beq(CCR0, L_on_abort); 2231 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // Reload in transaction, conflicts need to be tracked. 2232 andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits 2233 cmpwi(flag, R0, markOopDesc::unlocked_value); // bits = 001 unlocked 2234 beq(flag, DONE_LABEL); // all done if unlocked 2235 2236 if (UseRTMXendForLockBusy) { 2237 tend_(); 2238 b(L_decrement_retry); 2239 } else { 2240 tabort_(); 2241 } 2242 bind(L_on_abort); 2243 const Register abort_status_Reg = tmp; 2244 mftexasr(abort_status_Reg); 2245 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2246 rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm); 2247 } 2248 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload 2249 if (RTMRetryCount > 0) { 2250 // Retry on lock abort if abort status is not permanent. 2251 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry); 2252 } else { 2253 bind(L_decrement_retry); 2254 } 2255 } 2256 2257 // Use RTM for inflating locks 2258 // inputs: obj (object to lock) 2259 // mark_word (current header - KILLED) 2260 // boxReg (on-stack box address (displaced header location) - KILLED) 2261 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag, 2262 Register obj, Register mark_word, Register boxReg, 2263 Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg, 2264 RTMLockingCounters* rtm_counters, 2265 Metadata* method_data, bool profile_rtm, 2266 Label& DONE_LABEL) { 2267 assert(UseRTMLocking, "why call this otherwise?"); 2268 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2269 // Clean monitor_value bit to get valid pointer. 2270 int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value; 2271 2272 // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark(). 
2273 std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg); 2274 const Register tmpReg = boxReg; 2275 const Register owner_addr_Reg = mark_word; 2276 addi(owner_addr_Reg, mark_word, owner_offset); 2277 2278 if (RTMRetryCount > 0) { 2279 load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy. 2280 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort. 2281 bind(L_rtm_retry); 2282 } 2283 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2284 Label L_noincrement; 2285 if (RTMTotalCountIncrRate > 1) { 2286 branch_on_random_using_tb(R0, (int)RTMTotalCountIncrRate, L_noincrement); 2287 } 2288 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2289 load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg); 2290 //atomic_inc_ptr(R0, tmpReg); We don't increment atomically 2291 ldx(tmpReg, R0); 2292 addi(tmpReg, tmpReg, 1); 2293 stdx(tmpReg, R0); 2294 bind(L_noincrement); 2295 } 2296 tbegin_(); 2297 beq(CCR0, L_on_abort); 2298 // We don't reload mark word. Will only be reset at safepoint. 2299 ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked. 2300 cmpdi(flag, R0, 0); 2301 beq(flag, DONE_LABEL); 2302 2303 if (UseRTMXendForLockBusy) { 2304 tend_(); 2305 b(L_decrement_retry); 2306 } else { 2307 tabort_(); 2308 } 2309 bind(L_on_abort); 2310 const Register abort_status_Reg = tmpReg; 2311 mftexasr(abort_status_Reg); 2312 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2313 rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm); 2314 // Restore owner_addr_Reg 2315 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); 2316 #ifdef ASSERT 2317 andi_(R0, mark_word, markOopDesc::monitor_value); 2318 asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint. 2319 #endif 2320 addi(owner_addr_Reg, mark_word, owner_offset); 2321 } 2322 if (RTMRetryCount > 0) { 2323 // Retry on lock abort if abort status is not permanent. 2324 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); 2325 } 2326 2327 // Appears unlocked - try to swing _owner from null to non-null. 2328 cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg, 2329 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2330 MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true); 2331 2332 if (RTMRetryCount > 0) { 2333 // success done else retry 2334 b(DONE_LABEL); 2335 bind(L_decrement_retry); 2336 // Spin and retry if lock is busy. 2337 rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry); 2338 } else { 2339 bind(L_decrement_retry); 2340 } 2341 } 2342 2343 #endif // INCLUDE_RTM_OPT 2344 2345 // "The box" is the space on the stack where we copy the object mark. 2346 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box, 2347 Register temp, Register displaced_header, Register current_header, 2348 bool try_bias, 2349 RTMLockingCounters* rtm_counters, 2350 RTMLockingCounters* stack_rtm_counters, 2351 Metadata* method_data, 2352 bool use_rtm, bool profile_rtm) { 2353 assert_different_registers(oop, box, temp, displaced_header, current_header); 2354 assert(flag != CCR0, "bad condition register"); 2355 Label cont; 2356 Label object_has_monitor; 2357 Label cas_failed; 2358 2359 // Load markOop from object into displaced_header. 
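// (The mark word's low bits encode the lock state; see markOopDesc.)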
2360 ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop); 2361 2362 2363 // Always do locking in runtime. 2364 if (EmitSync & 0x01) { 2365 cmpdi(flag, oop, 0); // Oop can't be 0 here => always false. 2366 return; 2367 } 2368 2369 if (try_bias) { 2370 biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont); 2371 } 2372 2373 #if INCLUDE_RTM_OPT 2374 if (UseRTMForStackLocks && use_rtm) { 2375 rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header, 2376 stack_rtm_counters, method_data, profile_rtm, 2377 cont, object_has_monitor); 2378 } 2379 #endif // INCLUDE_RTM_OPT 2380 2381 // Handle existing monitor. 2382 if ((EmitSync & 0x02) == 0) { 2383 // The object has an existing monitor iff (mark & monitor_value) != 0. 2384 andi_(temp, displaced_header, markOopDesc::monitor_value); 2385 bne(CCR0, object_has_monitor); 2386 } 2387 2388 // Set displaced_header to be (markOop of object | UNLOCK_VALUE). 2389 ori(displaced_header, displaced_header, markOopDesc::unlocked_value); 2390 2391 // Load Compare Value application register. 2392 2393 // Initialize the box. (Must happen before we update the object mark!) 2394 std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2395 2396 // Must fence, otherwise preceding store(s) may float below cmpxchg. 2397 // Compare object markOop with mark and if equal exchange scratch1 with object markOop. 2398 // CmpxchgX sets cr_reg to cmpX(current, displaced). 2399 membar(Assembler::StoreStore); 2400 cmpxchgd(/*flag=*/flag, 2401 /*current_value=*/current_header, 2402 /*compare_value=*/displaced_header, 2403 /*exchange_value=*/box, 2404 /*where=*/oop, 2405 MacroAssembler::MemBarAcq, 2406 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2407 noreg, 2408 &cas_failed); 2409 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2410 2411 // If the compare-and-exchange succeeded, then we found an unlocked 2412 // object and we have now locked it. 2413 b(cont); 2414 2415 bind(cas_failed); 2416 // We did not see an unlocked object so try the fast recursive case. 2417 2418 // Check if the owner is self by comparing the value in the markOop of object 2419 // (current_header) with the stack pointer. 2420 sub(current_header, current_header, R1_SP); 2421 load_const_optimized(temp, (address) (~(os::vm_page_size()-1) | 2422 markOopDesc::lock_mask_in_place)); 2423 2424 and_(R0/*==0?*/, current_header, temp); 2425 // If the condition is true we are done (label cont) and can store 0 as the 2426 // displaced header in the box, which indicates that it is a recursive lock. 2427 mcrf(flag,CCR0); 2428 std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box); 2429 2430 // Handle existing monitor. 2431 if ((EmitSync & 0x02) == 0) { 2432 b(cont); 2433 2434 bind(object_has_monitor); 2435 // The object's monitor m is unlocked iff m->owner == NULL, 2436 // otherwise m->owner may contain a thread or a stack address. 2437 2438 #if INCLUDE_RTM_OPT 2439 // Use the same RTM locking code in 32- and 64-bit VM. 2440 if (use_rtm) { 2441 rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header, 2442 rtm_counters, method_data, profile_rtm, cont); 2443 } else { 2444 #endif // INCLUDE_RTM_OPT 2445 2446 // Try to CAS m->owner from NULL to current thread. 2447 addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value); 2448 li(displaced_header, 0); 2449 // CmpxchgX sets flag to cmpX(current, displaced).
2450 cmpxchgd(/*flag=*/flag, 2451 /*current_value=*/current_header, 2452 /*compare_value=*/(intptr_t)0, 2453 /*exchange_value=*/R16_thread, 2454 /*where=*/temp, 2455 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2456 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2457 2458 // Store a non-null value into the box. 2459 std(box, BasicLock::displaced_header_offset_in_bytes(), box); 2460 2461 # ifdef ASSERT 2462 bne(flag, cont); 2463 // We have acquired the monitor, check some invariants. 2464 addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes()); 2465 // Invariant 1: _recursions should be 0. 2466 //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size"); 2467 asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp, 2468 "monitor->_recursions should be 0", -1); 2469 // Invariant 2: OwnerIsThread shouldn't be 0. 2470 //assert(ObjectMonitor::OwnerIsThread_size_in_bytes() == 4, "unexpected size"); 2471 //asm_assert_mem4_isnot_zero(ObjectMonitor::OwnerIsThread_offset_in_bytes(), temp, 2472 // "monitor->OwnerIsThread shouldn't be 0", -1); 2473 # endif 2474 2475 #if INCLUDE_RTM_OPT 2476 } // use_rtm() 2477 #endif 2478 } 2479 2480 bind(cont); 2481 // flag == EQ indicates success 2482 // flag == NE indicates failure 2483 } 2484 2485 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box, 2486 Register temp, Register displaced_header, Register current_header, 2487 bool try_bias, bool use_rtm) { 2488 assert_different_registers(oop, box, temp, displaced_header, current_header); 2489 assert(flag != CCR0, "bad condition register"); 2490 Label cont; 2491 Label object_has_monitor; 2492 2493 // Always do locking in runtime. 2494 if (EmitSync & 0x01) { 2495 cmpdi(flag, oop, 0); // Oop can't be 0 here => always false. 2496 return; 2497 } 2498 2499 if (try_bias) { 2500 biased_locking_exit(flag, oop, current_header, cont); 2501 } 2502 2503 #if INCLUDE_RTM_OPT 2504 if (UseRTMForStackLocks && use_rtm) { 2505 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); 2506 Label L_regular_unlock; 2507 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); // fetch markword 2508 andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits 2509 cmpwi(flag, R0, markOopDesc::unlocked_value); // bits = 001 unlocked 2510 bne(flag, L_regular_unlock); // else RegularLock 2511 tend_(); // otherwise end... 2512 b(cont); // ... and we're done 2513 bind(L_regular_unlock); 2514 } 2515 #endif 2516 2517 // Find the lock address and load the displaced header from the stack. 2518 ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2519 2520 // If the displaced header is 0, we have a recursive unlock. 2521 cmpdi(flag, displaced_header, 0); 2522 beq(flag, cont); 2523 2524 // Handle existing monitor. 2525 if ((EmitSync & 0x02) == 0) { 2526 // The object has an existing monitor iff (mark & monitor_value) != 0. 2527 RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done 2528 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); 2529 andi_(R0, current_header, markOopDesc::monitor_value); 2530 bne(CCR0, object_has_monitor); 2531 } 2532 2533 // Check if it is still a lightweight lock; this is true if we see 2534 // the stack address of the basicLock in the markOop of the object. 2535 // Cmpxchg sets flag to cmpd(current_header, box).
2536 cmpxchgd(/*flag=*/flag, 2537 /*current_value=*/current_header, 2538 /*compare_value=*/box, 2539 /*exchange_value=*/displaced_header, 2540 /*where=*/oop, 2541 MacroAssembler::MemBarRel, 2542 MacroAssembler::cmpxchgx_hint_release_lock(), 2543 noreg, 2544 &cont); 2545 2546 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2547 2548 // Handle existing monitor. 2549 if ((EmitSync & 0x02) == 0) { 2550 b(cont); 2551 2552 bind(object_has_monitor); 2553 addi(current_header, current_header, -markOopDesc::monitor_value); // monitor 2554 ld(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); 2555 2556 // It's inflated. 2557 #if INCLUDE_RTM_OPT 2558 if (use_rtm) { 2559 Label L_regular_inflated_unlock; 2560 // Clean monitor_value bit to get valid pointer 2561 cmpdi(flag, temp, 0); 2562 bne(flag, L_regular_inflated_unlock); 2563 tend_(); 2564 b(cont); 2565 bind(L_regular_inflated_unlock); 2566 } 2567 #endif 2568 2569 ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header); 2570 xorr(temp, R16_thread, temp); // Will be 0 if we are the owner. 2571 orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions. 2572 cmpdi(flag, temp, 0); 2573 bne(flag, cont); 2574 2575 ld(temp, ObjectMonitor::EntryList_offset_in_bytes(), current_header); 2576 ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header); 2577 orr(temp, temp, displaced_header); // Will be 0 if both are 0. 2578 cmpdi(flag, temp, 0); 2579 bne(flag, cont); 2580 release(); 2581 std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); 2582 } 2583 2584 bind(cont); 2585 // flag == EQ indicates success 2586 // flag == NE indicates failure 2587 } 2588 2589 // Write serialization page so VM thread can do a pseudo remote membar. 2590 // We use the current thread pointer to calculate a thread specific 2591 // offset to write to within the page. This minimizes bus traffic 2592 // due to cache line collision. 2593 void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) { 2594 srdi(tmp2, thread, os::get_serialize_page_shift_count()); 2595 2596 int mask = os::vm_page_size() - sizeof(int); 2597 if (Assembler::is_simm(mask, 16)) { 2598 andi(tmp2, tmp2, mask); 2599 } else { 2600 lis(tmp1, (int)((signed short) (mask >> 16))); 2601 ori(tmp1, tmp1, mask & 0x0000ffff); 2602 andr(tmp2, tmp2, tmp1); 2603 } 2604 2605 load_const(tmp1, (long) os::get_memory_serialize_page()); 2606 release(); 2607 stwx(R0, tmp1, tmp2); 2608 } 2609 2610 2611 // GC barrier helper macros 2612 2613 // Write the card table byte if needed. 2614 void MacroAssembler::card_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp) { 2615 CardTableModRefBS* bs = 2616 barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set()); 2617 assert(bs->kind() == BarrierSet::CardTableForRS || 2618 bs->kind() == BarrierSet::CardTableExtension, "wrong barrier"); 2619 #ifdef ASSERT 2620 cmpdi(CCR0, Rnew_val, 0); 2621 asm_assert_ne("null oop not allowed", 0x321); 2622 #endif 2623 card_table_write(bs->byte_map_base, Rtmp, Rstore_addr); 2624 } 2625 2626 // Write the card table byte. 
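// A sketch of the effect (not the emitted code):
//   byte_map_base[oop_addr >> card_shift] = 0; // 0 == dirty_card_val
// byte_map_base is set up so that indexing it with the shifted address hits
// the right card, which is why a single stbx suffices.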
2627 void MacroAssembler::card_table_write(jbyte* byte_map_base, Register Rtmp, Register Robj) { 2628 assert_different_registers(Robj, Rtmp, R0); 2629 load_const_optimized(Rtmp, (address)byte_map_base, R0); 2630 srdi(Robj, Robj, CardTableModRefBS::card_shift); 2631 li(R0, 0); // dirty 2632 if (UseConcMarkSweepGC) membar(Assembler::StoreStore); 2633 stbx(R0, Rtmp, Robj); 2634 } 2635 2636 #if INCLUDE_ALL_GCS 2637 // General G1 pre-barrier generator. 2638 // Goal: record the previous value if it is not null. 2639 void MacroAssembler::g1_write_barrier_pre(Register Robj, RegisterOrConstant offset, Register Rpre_val, 2640 Register Rtmp1, Register Rtmp2, bool needs_frame) { 2641 Label runtime, filtered; 2642 2643 // Is marking active? 2644 if (in_bytes(PtrQueue::byte_width_of_active()) == 4) { 2645 lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_active()), R16_thread); 2646 } else { 2647 guarantee(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption"); 2648 lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_active()), R16_thread); 2649 } 2650 cmpdi(CCR0, Rtmp1, 0); 2651 beq(CCR0, filtered); 2652 2653 // Do we need to load the previous value? 2654 if (Robj != noreg) { 2655 // Load the previous value... 2656 if (UseCompressedOops) { 2657 lwz(Rpre_val, offset, Robj); 2658 } else { 2659 ld(Rpre_val, offset, Robj); 2660 } 2661 // Previous value has been loaded into Rpre_val. 2662 } 2663 assert(Rpre_val != noreg, "must have a real register"); 2664 2665 // Is the previous value null? 2666 cmpdi(CCR0, Rpre_val, 0); 2667 beq(CCR0, filtered); 2668 2669 if (Robj != noreg && UseCompressedOops) { 2670 decode_heap_oop_not_null(Rpre_val); 2671 } 2672 2673 // OK, it's not filtered, so we'll need to call enqueue. Try to record the 2674 // previous value in the thread's local SATB buffer first; only when the 2675 // buffer is full (index == 0) do we fall back to the runtime call 2676 // (SharedRuntime::g1_wb_pre). 2677 2678 // Can we store original value in the thread's buffer? 2679 // Is index == 0? 2680 // (The index field is typed as size_t.) 2681 const Register Rbuffer = Rtmp1, Rindex = Rtmp2; 2682 2683 ld(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_index()), R16_thread); 2684 cmpdi(CCR0, Rindex, 0); 2685 beq(CCR0, runtime); // If index == 0, goto runtime. 2686 ld(Rbuffer, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_buf()), R16_thread); 2687 2688 addi(Rindex, Rindex, -wordSize); // Decrement index. 2689 std(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_index()), R16_thread); 2690 2691 // Record the previous value. 2692 stdx(Rpre_val, Rbuffer, Rindex); 2693 b(filtered); 2694 2695 bind(runtime); 2696 2697 // The VM call needs a frame in order to preserve LR/CR and access volatile registers. 2698 if (needs_frame) { 2699 save_LR_CR(Rtmp1); 2700 push_frame_reg_args(0, Rtmp2); 2701 } 2702 2703 if (Rpre_val->is_volatile() && Robj == noreg) mr(R31, Rpre_val); // Save pre_val across C call if it was preloaded. 2704 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), Rpre_val, R16_thread); 2705 if (Rpre_val->is_volatile() && Robj == noreg) mr(Rpre_val, R31); // restore 2706 2707 if (needs_frame) { 2708 pop_frame(); 2709 restore_LR_CR(Rtmp1); 2710 } 2711 2712 bind(filtered); 2713 } 2714 2715 // General G1 post-barrier generator. 2716 // Store cross-region card.
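// Filtering sketch (not the emitted code):
//   if (((store_addr ^ new_val) >> LogOfHRGrainBytes) == 0) return; // same region
//   if (card == g1_young_card_val) return;                          // young region
//   StoreLoad; if (card == dirty_card_val) return;                  // already dirty
//   card = dirty_card_val; enqueue(card_addr);                      // log for refinement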
2717 void MacroAssembler::g1_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp1, Register Rtmp2, Register Rtmp3, Label *filtered_ext) { 2718 Label runtime, filtered_int; 2719 Label& filtered = (filtered_ext != NULL) ? *filtered_ext : filtered_int; 2720 assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2); 2721 2722 G1SATBCardTableLoggingModRefBS* bs = 2723 barrier_set_cast<G1SATBCardTableLoggingModRefBS>(Universe::heap()->barrier_set()); 2724 2725 // Does store cross heap regions? 2726 if (G1RSBarrierRegionFilter) { 2727 xorr(Rtmp1, Rstore_addr, Rnew_val); 2728 srdi_(Rtmp1, Rtmp1, HeapRegion::LogOfHRGrainBytes); 2729 beq(CCR0, filtered); 2730 } 2731 2732 // Crosses regions, storing NULL? 2733 #ifdef ASSERT 2734 cmpdi(CCR0, Rnew_val, 0); 2735 asm_assert_ne("null oop not allowed (G1)", 0x322); // Checked by caller on PPC64, so following branch is obsolete: 2736 //beq(CCR0, filtered); 2737 #endif 2738 2739 // Storing region crossing non-NULL, is card already dirty? 2740 assert(sizeof(*bs->byte_map_base) == sizeof(jbyte), "adjust this code"); 2741 const Register Rcard_addr = Rtmp1; 2742 Register Rbase = Rtmp2; 2743 load_const_optimized(Rbase, (address)bs->byte_map_base, /*temp*/ Rtmp3); 2744 2745 srdi(Rcard_addr, Rstore_addr, CardTableModRefBS::card_shift); 2746 2747 // Get the address of the card. 2748 lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr); 2749 cmpwi(CCR0, Rtmp3, (int)G1SATBCardTableModRefBS::g1_young_card_val()); 2750 beq(CCR0, filtered); 2751 2752 membar(Assembler::StoreLoad); 2753 lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr); // Reload after membar. 2754 cmpwi(CCR0, Rtmp3 /* card value */, CardTableModRefBS::dirty_card_val()); 2755 beq(CCR0, filtered); 2756 2757 // Storing a region crossing, non-NULL oop, card is clean. 2758 // Dirty card and log. 2759 li(Rtmp3, CardTableModRefBS::dirty_card_val()); 2760 //release(); // G1: oops are allowed to get visible after dirty marking. 2761 stbx(Rtmp3, Rbase, Rcard_addr); 2762 2763 add(Rcard_addr, Rbase, Rcard_addr); // This is the address which needs to get enqueued. 2764 Rbase = noreg; // end of lifetime 2765 2766 const Register Rqueue_index = Rtmp2, 2767 Rqueue_buf = Rtmp3; 2768 ld(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + PtrQueue::byte_offset_of_index()), R16_thread); 2769 cmpdi(CCR0, Rqueue_index, 0); 2770 beq(CCR0, runtime); // index == 0 then jump to runtime 2771 ld(Rqueue_buf, in_bytes(JavaThread::dirty_card_queue_offset() + PtrQueue::byte_offset_of_buf()), R16_thread); 2772 2773 addi(Rqueue_index, Rqueue_index, -wordSize); // decrement index 2774 std(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + PtrQueue::byte_offset_of_index()), R16_thread); 2775 2776 stdx(Rcard_addr, Rqueue_buf, Rqueue_index); // store card 2777 b(filtered); 2778 2779 bind(runtime); 2780 2781 // Save the live input values. 2782 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), Rcard_addr, R16_thread); 2783 2784 bind(filtered_int); 2785 } 2786 #endif // INCLUDE_ALL_GCS 2787 2788 // Values for last_Java_pc and last_Java_sp must comply with the rules 2789 // in frame_ppc.hpp. 2790 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) { 2791 // Always set last_Java_pc and flags first, because once last_Java_sp 2792 // is visible, has_last_Java_frame is true and users will look at the 2793 // rest of the fields. (Note: flags should always be zero before we 2794 // get here, so they don't need to be set.)
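// Resulting store order: last_Java_pc first, last_Java_sp last; readers treat
// the frame anchor as walkable only once last_Java_sp is non-zero.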
2795 2796 // Verify that last_Java_pc was zeroed on return to Java 2797 asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread, 2798 "last_Java_pc not zeroed before leaving Java", 0x200); 2799 2800 // When returning from calling out from Java mode the frame anchor's 2801 // last_Java_pc will always be set to NULL. It is set here so that 2802 // if we are doing a call to native (not VM) that we capture the 2803 // known pc and don't have to rely on the native call having a 2804 // standard frame linkage where we can find the pc. 2805 if (last_Java_pc != noreg) 2806 std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 2807 2808 // Set last_Java_sp last. 2809 std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 2810 } 2811 2812 void MacroAssembler::reset_last_Java_frame(void) { 2813 asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()), 2814 R16_thread, "SP was not set, still zero", 0x202); 2815 2816 BLOCK_COMMENT("reset_last_Java_frame {"); 2817 li(R0, 0); 2818 2819 // _last_Java_sp = 0 2820 std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 2821 2822 // _last_Java_pc = 0 2823 std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 2824 BLOCK_COMMENT("} reset_last_Java_frame"); 2825 } 2826 2827 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) { 2828 assert_different_registers(sp, tmp1); 2829 2830 // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via 2831 // TOP_IJAVA_FRAME_ABI. 2832 // FIXME: assert that we really have a TOP_IJAVA_FRAME here! 2833 #ifdef CC_INTERP 2834 ld(tmp1/*pc*/, _top_ijava_frame_abi(frame_manager_lr), sp); 2835 #else 2836 address entry = pc(); 2837 load_const_optimized(tmp1, entry); 2838 #endif 2839 2840 set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1); 2841 } 2842 2843 void MacroAssembler::get_vm_result(Register oop_result) { 2844 // Read: 2845 // R16_thread 2846 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 2847 // 2848 // Updated: 2849 // oop_result 2850 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 2851 2852 verify_thread(); 2853 2854 ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread); 2855 li(R0, 0); 2856 std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread); 2857 2858 verify_oop(oop_result); 2859 } 2860 2861 void MacroAssembler::get_vm_result_2(Register metadata_result) { 2862 // Read: 2863 // R16_thread 2864 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 2865 // 2866 // Updated: 2867 // metadata_result 2868 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 2869 2870 ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 2871 li(R0, 0); 2872 std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 2873 } 2874 2875 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) { 2876 Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided. 2877 if (Universe::narrow_klass_base() != 0) { 2878 // Use dst as temp if it is free. 
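// Encoding sketch: compressed = (klass - narrow_klass_base) >> narrow_klass_shift;
// either step is skipped when the base or the shift is zero.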
2879 sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0); 2880 current = dst; 2881 } 2882 if (Universe::narrow_klass_shift() != 0) { 2883 srdi(dst, current, Universe::narrow_klass_shift()); 2884 current = dst; 2885 } 2886 return current; 2887 } 2888 2889 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) { 2890 if (UseCompressedClassPointers) { 2891 Register compressedKlass = encode_klass_not_null(ck, klass); 2892 stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop); 2893 } else { 2894 std(klass, oopDesc::klass_offset_in_bytes(), dst_oop); 2895 } 2896 } 2897 2898 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) { 2899 if (UseCompressedClassPointers) { 2900 if (val == noreg) { 2901 val = R0; 2902 li(val, 0); 2903 } 2904 stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed 2905 } 2906 } 2907 2908 int MacroAssembler::instr_size_for_decode_klass_not_null() { 2909 if (!UseCompressedClassPointers) return 0; 2910 int num_instrs = 1; // shift or move 2911 if (Universe::narrow_klass_base() != 0) num_instrs = 7; // shift + load const + add 2912 return num_instrs * BytesPerInstWord; 2913 } 2914 2915 void MacroAssembler::decode_klass_not_null(Register dst, Register src) { 2916 assert(dst != R0, "Dst reg may not be R0, as R0 is used here."); 2917 if (src == noreg) src = dst; 2918 Register shifted_src = src; 2919 if (Universe::narrow_klass_shift() != 0 || 2920 Universe::narrow_klass_base() == 0 && src != dst) { // Move required. 2921 shifted_src = dst; 2922 sldi(shifted_src, src, Universe::narrow_klass_shift()); 2923 } 2924 if (Universe::narrow_klass_base() != 0) { 2925 add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0); 2926 } 2927 } 2928 2929 void MacroAssembler::load_klass(Register dst, Register src) { 2930 if (UseCompressedClassPointers) { 2931 lwz(dst, oopDesc::klass_offset_in_bytes(), src); 2932 // Attention: no null check here! 2933 decode_klass_not_null(dst, dst); 2934 } else { 2935 ld(dst, oopDesc::klass_offset_in_bytes(), src); 2936 } 2937 } 2938 2939 void MacroAssembler::load_klass_with_trap_null_check(Register dst, Register src) { 2940 if (!os::zero_page_read_protected()) { 2941 if (TrapBasedNullChecks) { 2942 trap_null_check(src); 2943 } 2944 } 2945 load_klass(dst, src); 2946 } 2947 2948 void MacroAssembler::reinit_heapbase(Register d, Register tmp) { 2949 if (Universe::heap() != NULL) { 2950 load_const_optimized(R30, Universe::narrow_ptrs_base(), tmp); 2951 } else { 2952 // Heap not yet allocated. Load indirectly. 2953 int simm16_offset = load_const_optimized(R30, Universe::narrow_ptrs_base_addr(), tmp, true); 2954 ld(R30, simm16_offset, R30); 2955 } 2956 } 2957 2958 // Clear Array 2959 // Kills both input registers. tmp == R0 is allowed. 2960 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp) { 2961 // Procedure for large arrays (uses data cache block zero instruction). 2962 Label startloop, fast, fastloop, small_rest, restloop, done; 2963 const int cl_size = VM_Version::get_cache_line_size(), 2964 cl_dwords = cl_size>>3, 2965 cl_dw_addr_bits = exact_log2(cl_dwords), 2966 dcbz_min = 1; // Min count of dcbz executions, needs to be >0. 2967 2968 //2: 2969 cmpdi(CCR1, cnt_dwords, ((dcbz_min+1)<<cl_dw_addr_bits)-1); // Big enough? (ensure >=dcbz_min lines included). 2970 blt(CCR1, small_rest); // Too small. 2971 rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line. 
2972 beq(CCR0, fast); // Already 128byte aligned. 2973 2974 subfic(tmp, tmp, cl_dwords); 2975 mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords). 2976 subf(cnt_dwords, tmp, cnt_dwords); // rest. 2977 li(tmp, 0); 2978 //10: 2979 bind(startloop); // Clear at the beginning to reach 128byte boundary. 2980 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 2981 addi(base_ptr, base_ptr, 8); 2982 bdnz(startloop); 2983 //13: 2984 bind(fast); // Clear 128byte blocks. 2985 srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0). 2986 andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords. 2987 mtctr(tmp); // Load counter. 2988 //16: 2989 bind(fastloop); 2990 dcbz(base_ptr); // Clear 128byte aligned block. 2991 addi(base_ptr, base_ptr, cl_size); 2992 bdnz(fastloop); 2993 if (InsertEndGroupPPC64) { endgroup(); } else { nop(); } 2994 //20: 2995 bind(small_rest); 2996 cmpdi(CCR0, cnt_dwords, 0); // size 0? 2997 beq(CCR0, done); // rest == 0 2998 li(tmp, 0); 2999 mtctr(cnt_dwords); // Load counter. 3000 //24: 3001 bind(restloop); // Clear rest. 3002 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3003 addi(base_ptr, base_ptr, 8); 3004 bdnz(restloop); 3005 //27: 3006 bind(done); 3007 } 3008 3009 /////////////////////////////////////////// String intrinsics //////////////////////////////////////////// 3010 3011 // Search for a single jchar in a jchar[]. 3012 // 3013 // Assumes that result differs from all other registers. 3014 // 3015 // Haystack, needle are the addresses of jchar-arrays. 3016 // NeedleChar is needle[0] if it is known at compile time. 3017 // Haycnt is the length of the haystack. We assume haycnt >= 1. 3018 // 3019 // Preserves haystack, haycnt, kills all other registers. 3020 // 3021 // If needle == R0, we search for the constant needleChar. 3022 void MacroAssembler::string_indexof_1(Register result, Register haystack, Register haycnt, 3023 Register needle, jchar needleChar, 3024 Register tmp1, Register tmp2) { 3025 3026 assert_different_registers(result, haystack, haycnt, needle, tmp1, tmp2); 3027 3028 Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_Found3, L_NotFound, L_End; 3029 Register needle0 = needle, // Contains needle[0]. 3030 addr = tmp1, 3031 ch1 = tmp2, 3032 ch2 = R0; 3033 3034 //2 (variable) or 3 (const): 3035 if (needle != R0) lhz(needle0, 0, needle); // Preload needle character, needle has len==1. 3036 dcbtct(haystack, 0x00); // Indicate R/O access to haystack. 3037 3038 srwi_(tmp2, haycnt, 1); // Shift right by exact_log2(UNROLL_FACTOR). 3039 mr(addr, haystack); 3040 beq(CCR0, L_FinalCheck); 3041 mtctr(tmp2); // Move to count register. 3042 //8: 3043 bind(L_InnerLoop); // Main work horse (2x unrolled search loop). 3044 lhz(ch1, 0, addr); // Load characters from haystack. 3045 lhz(ch2, 2, addr); 3046 (needle != R0) ? cmpw(CCR0, ch1, needle0) : cmplwi(CCR0, ch1, needleChar); 3047 (needle != R0) ? cmpw(CCR1, ch2, needle0) : cmplwi(CCR1, ch2, needleChar); 3048 beq(CCR0, L_Found1); // Did we find the needle? 3049 beq(CCR1, L_Found2); 3050 addi(addr, addr, 4); 3051 bdnz(L_InnerLoop); 3052 //16: 3053 bind(L_FinalCheck); 3054 andi_(R0, haycnt, 1); 3055 beq(CCR0, L_NotFound); 3056 lhz(ch1, 0, addr); // One position left at which we have to compare. 3057 (needle != R0) ? cmpw(CCR1, ch1, needle0) : cmplwi(CCR1, ch1, needleChar); 3058 beq(CCR1, L_Found3); 3059 //21: 3060 bind(L_NotFound); 3061 li(result, -1); // Not found.
3062 b(L_End);
3063
3064 bind(L_Found2);
3065 addi(addr, addr, 2);
3066 //24:
3067 bind(L_Found1);
3068 bind(L_Found3); // Return index ...
3069 subf(addr, haystack, addr); // relative to haystack,
3070 srdi(result, addr, 1); // in characters.
3071 bind(L_End);
3072 }
3073
3074
3075 // Implementation of IndexOf for jchar arrays.
3076 //
3077 // The lengths of haystack and needle are not constant, i.e. they are passed in registers.
3078 //
3079 // Preserves registers haystack, needle.
3080 // Kills registers haycnt, needlecnt.
3081 // Assumes that result differs from all other registers.
3082 // Haystack, needle are the addresses of jchar-arrays.
3083 // Haycnt, needlecnt are their respective lengths.
3084 //
3085 // Needlecntval must be zero or a 15-bit unsigned immediate and > 1.
3086 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
3087 Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
3088 Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
3089
3090 // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!
3091 Label L_TooShort, L_Found, L_NotFound, L_End;
3092 Register last_addr = haycnt, // Kill haycnt at the beginning.
3093 addr = tmp1,
3094 n_start = tmp2,
3095 ch1 = tmp3,
3096 ch2 = R0;
3097
3098 // **************************************************************************************************
3099 // Prepare for main loop: optimized for needle count >=2, bail out otherwise.
3100 // **************************************************************************************************
3101
3102 //1 (variable) or 3 (const):
3103 dcbtct(needle, 0x00); // Indicate R/O access to str1.
3104 dcbtct(haystack, 0x00); // Indicate R/O access to str2.
3105
3106 // Compute last haystack addr to use if no match gets found.
3107 if (needlecntval == 0) { // variable needlecnt
3108 //3:
3109 subf(ch1, needlecnt, haycnt); // Last character index to compare is haycnt-needlecnt.
3110 addi(addr, haystack, -2); // Accesses use pre-increment.
3111 cmpwi(CCR6, needlecnt, 2);
3112 blt(CCR6, L_TooShort); // Variable needlecnt: handle short needle separately.
3113 slwi(ch1, ch1, 1); // Scale to number of bytes.
3114 lwz(n_start, 0, needle); // Load first 2 characters of needle.
3115 add(last_addr, haystack, ch1); // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3116 addi(needlecnt, needlecnt, -2); // Rest of needle.
3117 } else { // constant needlecnt
3118 guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
3119 assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
3120 //5:
3121 addi(ch1, haycnt, -needlecntval); // Last character index to compare is haycnt-needlecnt.
3122 lwz(n_start, 0, needle); // Load first 2 characters of needle.
3123 addi(addr, haystack, -2); // Accesses use pre-increment.
3124 slwi(ch1, ch1, 1); // Scale to number of bytes.
3125 add(last_addr, haystack, ch1); // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3126 li(needlecnt, needlecntval-2); // Rest of needle.
3127 }
3128
3129 // Main Loop (now we have at least 3 characters).
3130 //11:
3131 Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2, L_Comp3;
3132 bind(L_OuterLoop); // Search for 1st 2 characters.
3133 Register addr_diff = tmp4;
3134 subf(addr_diff, addr, last_addr); // Difference between already checked address and last address to check.
3135 addi(addr, addr, 2); // This is the new address we want to use for comparing.
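// The srdi_ below computes addr_diff / 4, the number of 2x-unrolled
// iterations that are still possible; its record form sets CR0 for the
// branch to the final single-character check.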
3136 srdi_(ch2, addr_diff, 2);
3137 beq(CCR0, L_FinalCheck); // 2 characters left?
3138 mtctr(ch2); // addr_diff/4
3139 //16:
3140 bind(L_InnerLoop); // Main work horse (2x unrolled search loop)
3141 lwz(ch1, 0, addr); // Load 2 characters of haystack (ignore alignment).
3142 lwz(ch2, 2, addr);
3143 cmpw(CCR0, ch1, n_start); // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).
3144 cmpw(CCR1, ch2, n_start);
3145 beq(CCR0, L_Comp1); // Did we find the needle start?
3146 beq(CCR1, L_Comp2);
3147 addi(addr, addr, 4);
3148 bdnz(L_InnerLoop);
3149 //24:
3150 bind(L_FinalCheck);
3151 rldicl_(addr_diff, addr_diff, 64-1, 63); // Remaining characters not covered by InnerLoop: (addr_diff>>1)&1.
3152 beq(CCR0, L_NotFound);
3153 lwz(ch1, 0, addr); // One position left at which we have to compare.
3154 cmpw(CCR1, ch1, n_start);
3155 beq(CCR1, L_Comp3);
3156 //29:
3157 bind(L_NotFound);
3158 li(result, -1); // not found
3159 b(L_End);
3160
3161
3162 // **************************************************************************************************
3163 // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
3164 // **************************************************************************************************
3165 //31:
3166 if ((needlecntval >> 1) != 1) { // Const needlecnt is 2 or 3? Reduce code size.
3167 int nopcnt = 5;
3168 if (needlecntval != 0) ++nopcnt; // Balance alignment (other case: see below).
3169 if (needlecntval == 0) { // We have to handle these cases separately.
3170 Label L_OneCharLoop;
3171 bind(L_TooShort);
3172 mtctr(haycnt);
3173 lhz(n_start, 0, needle); // First character of needle
3174 bind(L_OneCharLoop);
3175 lhzu(ch1, 2, addr);
3176 cmpw(CCR1, ch1, n_start);
3177 beq(CCR1, L_Found); // Did we find the one character needle?
3178 bdnz(L_OneCharLoop);
3179 li(result, -1); // Not found.
3180 b(L_End);
3181 } // 8 instructions, so no impact on alignment.
3182 for (int x = 0; x < nopcnt; ++x) nop();
3183 }
3184
3185 // **************************************************************************************************
3186 // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
3187 // **************************************************************************************************
3188
3189 // Compare the rest
3190 //36 if needlecntval==0, else 37:
3191 bind(L_Comp2);
3192 addi(addr, addr, 2); // First comparison has failed, 2nd one hit.
3193 bind(L_Comp1); // Addr points to possible needle start.
3194 bind(L_Comp3); // We could have created a copy and used a different return address, but we save code size this way.
3195 if (needlecntval != 2) { // Const needlecnt==2?
3196 if (needlecntval != 3) {
3197 if (needlecntval == 0) beq(CCR6, L_Found); // Variable needlecnt==2?
3198 Register ind_reg = tmp4;
3199 li(ind_reg, 2*2); // First 2 characters are already compared, use index 2.
3200 mtctr(needlecnt); // Decremented by 2, still > 0.
3201 //40:
3202 Label L_CompLoop;
3203 bind(L_CompLoop);
3204 lhzx(ch2, needle, ind_reg);
3205 lhzx(ch1, addr, ind_reg);
3206 cmpw(CCR1, ch1, ch2);
3207 bne(CCR1, L_OuterLoop);
3208 addi(ind_reg, ind_reg, 2);
3209 bdnz(L_CompLoop);
3210 } else { // No loop required if there's only one needle character left.
3211 lhz(ch2, 2*2, needle);
3212 lhz(ch1, 2*2, addr);
3213 cmpw(CCR1, ch1, ch2);
3214 bne(CCR1, L_OuterLoop);
3215 }
3216 }
3217 // Return index ...
3218 //46:
3219 bind(L_Found);
3220 subf(addr, haystack, addr); // relative to haystack, ...
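// At this point addr holds the match position as a byte offset from the
// start of the haystack; the shift below converts it into a jchar index.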
3221 srdi(result, addr, 1); // in characters.
3222 //48:
3223 bind(L_End);
3224 }
3225
3226 // Implementation of Compare for jchar arrays.
3227 //
3228 // Kills the registers str1, str2, cnt1, cnt2.
3229 // Kills cr0, ctr.
3230 // Assumes that result differs from the input registers.
3231 void MacroAssembler::string_compare(Register str1_reg, Register str2_reg, Register cnt1_reg, Register cnt2_reg,
3232 Register result_reg, Register tmp_reg) {
3233 assert_different_registers(result_reg, str1_reg, str2_reg, cnt1_reg, cnt2_reg, tmp_reg);
3234
3235 Label Ldone, Lslow_case, Lslow_loop, Lfast_loop;
3236 Register cnt_diff = R0,
3237 limit_reg = cnt1_reg,
3238 chr1_reg = result_reg,
3239 chr2_reg = cnt2_reg,
3240 addr_diff = str2_reg;
3241
3242 // Offset 0 should be 32 byte aligned.
3243 //-4:
3244 dcbtct(str1_reg, 0x00); // Indicate R/O access to str1.
3245 dcbtct(str2_reg, 0x00); // Indicate R/O access to str2.
3246 //-2:
3247 // Compute min(cnt1, cnt2) and check if 0 (bail out if we don't need to compare characters).
3248 subf(result_reg, cnt2_reg, cnt1_reg); // difference between cnt1/2
3249 subf_(addr_diff, str1_reg, str2_reg); // alias?
3250 beq(CCR0, Ldone); // return cnt difference if both ones are identical
3251 srawi(limit_reg, result_reg, 31); // generate signmask (cnt1/2 must be non-negative so cnt_diff can't overflow)
3252 mr(cnt_diff, result_reg);
3253 andr(limit_reg, result_reg, limit_reg); // difference or zero (negative): cnt1<cnt2 ? cnt1-cnt2 : 0
3254 add_(limit_reg, cnt2_reg, limit_reg); // min(cnt1, cnt2)==0?
3255 beq(CCR0, Ldone); // return cnt difference if one has 0 length
3256
3257 lhz(chr1_reg, 0, str1_reg); // optional: early out if first characters mismatch
3258 lhzx(chr2_reg, str1_reg, addr_diff); // optional: early out if first characters mismatch
3259 addi(tmp_reg, limit_reg, -1); // min(cnt1, cnt2)-1
3260 subf_(result_reg, chr2_reg, chr1_reg); // optional: early out if first characters mismatch
3261 bne(CCR0, Ldone); // optional: early out if first characters mismatch
3262
3263 // Set loop counter by scaling down tmp_reg
3264 srawi_(chr2_reg, tmp_reg, exact_log2(4)); // (min(cnt1, cnt2)-1)/4
3265 ble(CCR0, Lslow_case); // need >4 characters for fast loop
3266 andi(limit_reg, tmp_reg, 4-1); // remaining characters
3267
3268 // Adapt str1_reg str2_reg for the first loop iteration
3269 mtctr(chr2_reg); // (min(cnt1, cnt2)-1)/4
3270 addi(limit_reg, limit_reg, 4+1); // compare last 5-8 characters in slow_case if mismatch found in fast_loop
3271 //16:
3272 // Compare the rest of the characters
3273 bind(Lfast_loop);
3274 ld(chr1_reg, 0, str1_reg);
3275 ldx(chr2_reg, str1_reg, addr_diff);
3276 cmpd(CCR0, chr2_reg, chr1_reg);
3277 bne(CCR0, Lslow_case); // return chr1_reg
3278 addi(str1_reg, str1_reg, 4*2);
3279 bdnz(Lfast_loop);
3280 addi(limit_reg, limit_reg, -4); // no mismatch found in fast_loop, only 1-4 characters missing
3281 //23:
3282 bind(Lslow_case);
3283 mtctr(limit_reg);
3284 //24:
3285 bind(Lslow_loop);
3286 lhz(chr1_reg, 0, str1_reg);
3287 lhzx(chr2_reg, str1_reg, addr_diff);
3288 subf_(result_reg, chr2_reg, chr1_reg);
3289 bne(CCR0, Ldone); // return chr1_reg
3290 addi(str1_reg, str1_reg, 1*2);
3291 bdnz(Lslow_loop);
3292 //30:
3293 // If strings are equal up to min length, return the length difference.
3294 mr(result_reg, cnt_diff);
3295 nop(); // alignment
3296 //32:
3297 // Otherwise, return the difference between the first mismatched chars.
3298 bind(Ldone);
3299 }
3300
3301
3302 // Compare char[] arrays.
3303 // 3304 // str1_reg USE only 3305 // str2_reg USE only 3306 // cnt_reg USE_DEF, due to tmp reg shortage 3307 // result_reg DEF only, might compromise USE only registers 3308 void MacroAssembler::char_arrays_equals(Register str1_reg, Register str2_reg, Register cnt_reg, Register result_reg, 3309 Register tmp1_reg, Register tmp2_reg, Register tmp3_reg, Register tmp4_reg, 3310 Register tmp5_reg) { 3311 3312 // Str1 may be the same register as str2 which can occur e.g. after scalar replacement. 3313 assert_different_registers(result_reg, str1_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg); 3314 assert_different_registers(result_reg, str2_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg); 3315 3316 // Offset 0 should be 32 byte aligned. 3317 Label Linit_cbc, Lcbc, Lloop, Ldone_true, Ldone_false; 3318 Register index_reg = tmp5_reg; 3319 Register cbc_iter = tmp4_reg; 3320 3321 //-1: 3322 dcbtct(str1_reg, 0x00); // Indicate R/O access to str1. 3323 dcbtct(str2_reg, 0x00); // Indicate R/O access to str2. 3324 //1: 3325 andi(cbc_iter, cnt_reg, 4-1); // Remaining iterations after 4 java characters per iteration loop. 3326 li(index_reg, 0); // init 3327 li(result_reg, 0); // assume false 3328 srwi_(tmp2_reg, cnt_reg, exact_log2(4)); // Div: 4 java characters per iteration (main loop). 3329 3330 cmpwi(CCR1, cbc_iter, 0); // CCR1 = (cbc_iter==0) 3331 beq(CCR0, Linit_cbc); // too short 3332 mtctr(tmp2_reg); 3333 //8: 3334 bind(Lloop); 3335 ldx(tmp1_reg, str1_reg, index_reg); 3336 ldx(tmp2_reg, str2_reg, index_reg); 3337 cmpd(CCR0, tmp1_reg, tmp2_reg); 3338 bne(CCR0, Ldone_false); // Unequal char pair found -> done. 3339 addi(index_reg, index_reg, 4*sizeof(jchar)); 3340 bdnz(Lloop); 3341 //14: 3342 bind(Linit_cbc); 3343 beq(CCR1, Ldone_true); 3344 mtctr(cbc_iter); 3345 //16: 3346 bind(Lcbc); 3347 lhzx(tmp1_reg, str1_reg, index_reg); 3348 lhzx(tmp2_reg, str2_reg, index_reg); 3349 cmpw(CCR0, tmp1_reg, tmp2_reg); 3350 bne(CCR0, Ldone_false); // Unequal char pair found -> done. 3351 addi(index_reg, index_reg, 1*sizeof(jchar)); 3352 bdnz(Lcbc); 3353 nop(); 3354 bind(Ldone_true); 3355 li(result_reg, 1); 3356 //24: 3357 bind(Ldone_false); 3358 } 3359 3360 3361 void MacroAssembler::char_arrays_equalsImm(Register str1_reg, Register str2_reg, int cntval, Register result_reg, 3362 Register tmp1_reg, Register tmp2_reg) { 3363 // Str1 may be the same register as str2 which can occur e.g. after scalar replacement. 
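// Note: cntval is a compile-time constant, so short arrays (cntval < 16) are
// compared with fully unrolled straight-line code; only longer arrays use a loop.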
3364 assert_different_registers(result_reg, str1_reg, tmp1_reg, tmp2_reg); 3365 assert_different_registers(result_reg, str2_reg, tmp1_reg, tmp2_reg); 3366 assert(sizeof(jchar) == 2, "must be"); 3367 assert(cntval >= 0 && ((cntval & 0x7fff) == cntval), "wrong immediate"); 3368 3369 Label Ldone_false; 3370 3371 if (cntval < 16) { // short case 3372 if (cntval != 0) li(result_reg, 0); // assume false 3373 3374 const int num_bytes = cntval*sizeof(jchar); 3375 int index = 0; 3376 for (int next_index; (next_index = index + 8) <= num_bytes; index = next_index) { 3377 ld(tmp1_reg, index, str1_reg); 3378 ld(tmp2_reg, index, str2_reg); 3379 cmpd(CCR0, tmp1_reg, tmp2_reg); 3380 bne(CCR0, Ldone_false); 3381 } 3382 if (cntval & 2) { 3383 lwz(tmp1_reg, index, str1_reg); 3384 lwz(tmp2_reg, index, str2_reg); 3385 cmpw(CCR0, tmp1_reg, tmp2_reg); 3386 bne(CCR0, Ldone_false); 3387 index += 4; 3388 } 3389 if (cntval & 1) { 3390 lhz(tmp1_reg, index, str1_reg); 3391 lhz(tmp2_reg, index, str2_reg); 3392 cmpw(CCR0, tmp1_reg, tmp2_reg); 3393 bne(CCR0, Ldone_false); 3394 } 3395 // fallthrough: true 3396 } else { 3397 Label Lloop; 3398 Register index_reg = tmp1_reg; 3399 const int loopcnt = cntval/4; 3400 assert(loopcnt > 0, "must be"); 3401 // Offset 0 should be 32 byte aligned. 3402 //2: 3403 dcbtct(str1_reg, 0x00); // Indicate R/O access to str1. 3404 dcbtct(str2_reg, 0x00); // Indicate R/O access to str2. 3405 li(tmp2_reg, loopcnt); 3406 li(index_reg, 0); // init 3407 li(result_reg, 0); // assume false 3408 mtctr(tmp2_reg); 3409 //8: 3410 bind(Lloop); 3411 ldx(R0, str1_reg, index_reg); 3412 ldx(tmp2_reg, str2_reg, index_reg); 3413 cmpd(CCR0, R0, tmp2_reg); 3414 bne(CCR0, Ldone_false); // Unequal char pair found -> done. 3415 addi(index_reg, index_reg, 4*sizeof(jchar)); 3416 bdnz(Lloop); 3417 //14: 3418 if (cntval & 2) { 3419 lwzx(R0, str1_reg, index_reg); 3420 lwzx(tmp2_reg, str2_reg, index_reg); 3421 cmpw(CCR0, R0, tmp2_reg); 3422 bne(CCR0, Ldone_false); 3423 if (cntval & 1) addi(index_reg, index_reg, 2*sizeof(jchar)); 3424 } 3425 if (cntval & 1) { 3426 lhzx(R0, str1_reg, index_reg); 3427 lhzx(tmp2_reg, str2_reg, index_reg); 3428 cmpw(CCR0, R0, tmp2_reg); 3429 bne(CCR0, Ldone_false); 3430 } 3431 // fallthru: true 3432 } 3433 li(result_reg, 1); 3434 bind(Ldone_false); 3435 } 3436 3437 // Helpers for Intrinsic Emitters 3438 // 3439 // Revert the byte order of a 32bit value in a register 3440 // src: 0x44556677 3441 // dst: 0x77665544 3442 // Three steps to obtain the result: 3443 // 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word 3444 // into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared. 3445 // This value initializes dst. 3446 // 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost 3447 // byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go. 3448 // This value is mask inserted into dst with a [0..23] mask of 1s. 3449 // 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position. 3450 // This value is mask inserted into dst with a [8..15] mask of 1s. 3451 void MacroAssembler::load_reverse_32(Register dst, Register src) { 3452 assert_different_registers(dst, src); 3453 3454 rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left. 3455 rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone. 
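// dst now holds src's bytes 7,4,5,4 in positions 4..7 (byte 7 being the
// rightmost byte of the doubleword); the insert below overwrites position 5
// with byte 6, completing the byte-reversed word 7,6,5,4.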
3456 rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone.
3457 }
3458
3459 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3460 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3461 // body size from 20 to 16 instructions.
3462 // Returns the offset that was used to calculate the address of column tc3.
3463 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3464 // at hand, the original table address can be easily reconstructed.
3465 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3466
3467 #ifdef VM_LITTLE_ENDIAN
3468 // This is what we implement (the DOLIT4 part):
3469 // =========================================================================
3470 // #define DOLIT4 c ^= *buf4++; \
3471 // c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
3472 // crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
3473 // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
3474 // =========================================================================
3475 const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
3476 const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
3477 const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
3478 const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
3479 #else
3480 // This is what we implement (the DOBIG4 part):
3481 // =========================================================================
3482 // #define DOBIG4 c ^= *++buf4; \
3483 // c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
3484 // crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
3485 // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
3486 // =========================================================================
3487 const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
3488 const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
3489 const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
3490 const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
3491 #endif
3492 assert_different_registers(table, tc0, tc1, tc2);
3493 assert(table == tc3, "must be!");
3494
3495 if (ix0 != 0) addi(tc0, table, ix0);
3496 if (ix1 != 0) addi(tc1, table, ix1);
3497 if (ix2 != 0) addi(tc2, table, ix2);
3498 if (ix3 != 0) addi(tc3, table, ix3);
3499
3500 return ix3;
3501 }
3502
3503 /**
3504 * uint32_t crc;
3505 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
3506 */
3507 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3508 assert_different_registers(crc, table, tmp);
3509 assert_different_registers(val, table);
3510
3511 if (crc == val) { // Must rotate first to use the unmodified value.
3512 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3513 // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3514 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits.
3515 } else {
3516 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits.
3517 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
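// In both branches tmp ends up as (val & 0xff) << 2, i.e. the byte offset of
// the matching 32-bit entry in the table indexed below.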
3518 }
3519 lwzx(tmp, table, tmp);
3520 xorr(crc, crc, tmp);
3521 }
3522
3523 /**
3524 * uint32_t crc;
3525 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
3526 */
3527 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
3528 fold_byte_crc32(crc, crc, table, tmp);
3529 }
3530
3531 /**
3532 * Emits code to update CRC-32 with a byte value according to constants in table.
3533 *
3534 * @param [in,out]crc Register containing the crc.
3535 * @param [in]val Register containing the byte to fold into the CRC.
3536 * @param [in]table Register containing the table of crc constants.
3537 *
3538 * uint32_t crc;
3539 * val = crc_table[(val ^ crc) & 0xFF];
3540 * crc = val ^ (crc >> 8);
3541 */
3542 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3543 BLOCK_COMMENT("update_byte_crc32:");
3544 xorr(val, val, crc);
3545 fold_byte_crc32(crc, val, table, val);
3546 }
3547
3548 /**
3549 * @param crc register containing existing CRC (32-bit)
3550 * @param buf register pointing to input byte buffer (byte*)
3551 * @param len register containing number of bytes
3552 * @param table register pointing to CRC table
3553 */
3554 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3555 Register data, bool loopAlignment, bool invertCRC) {
3556 assert_different_registers(crc, buf, len, table, data);
3557
3558 Label L_mainLoop, L_done;
3559 const int mainLoop_stepping = 1;
3560 const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3561
3562 // Process all bytes in a single-byte loop.
3563 cmpdi(CCR0, len, 0); // Anything to do?
3564 mtctr(len);
3565 beq(CCR0, L_done);
3566
3567 if (invertCRC) {
3568 nand(crc, crc, crc); // ~c
3569 }
3570
3571 align(mainLoop_alignment);
3572 BIND(L_mainLoop);
3573 lbz(data, 0, buf); // Byte from buffer, zero-extended.
3574 addi(buf, buf, mainLoop_stepping); // Advance buffer position.
3575 update_byte_crc32(crc, data, table);
3576 bdnz(L_mainLoop); // Iterate.
3577
3578 if (invertCRC) {
3579 nand(crc, crc, crc); // ~c
3580 }
3581
3582 bind(L_done);
3583 }
3584
3585 /**
3586 * Emits code to update CRC-32 with a 4-byte value according to constants in table.
3587 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
3588 */
3589 // A note on the lookup table address(es):
3590 // The lookup table consists of two sets of four columns each.
3591 // The columns {0..3} are used for little-endian machines.
3592 // The columns {4..7} are used for big-endian machines.
3593 // To save the effort of adding the column offset to the table address each time
3594 // a table element is looked up, it is possible to pass the pre-calculated
3595 // column addresses.
3596 // Uses R9..R12 as work registers. Must be saved/restored by caller, if necessary.
3597 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
3598 Register t0, Register t1, Register t2, Register t3,
3599 Register tc0, Register tc1, Register tc2, Register tc3) {
3600 assert_different_registers(crc, t3);
3601
3602 // XOR crc with next four bytes of buffer.
3603 lwz(t3, bufDisp, buf);
3604 if (bufInc != 0) {
3605 addi(buf, buf, bufInc);
3606 }
3607 xorr(t3, t3, crc);
3608
3609 // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
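// In C terms (each index is a byte offset into a table of 32-bit entries):
//   t0 = ((t3 >>  0) & 0xff) << 2;   t1 = ((t3 >>  8) & 0xff) << 2;
//   t2 = ((t3 >> 16) & 0xff) << 2;   t3 = ((t3 >> 24) & 0xff) << 2;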
3610 rlwinm(t0, t3, 2, 24-2, 31-2); // ((t3 >> 0) & 0xff) << 2
3611 rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t3 >> 8) & 0xff) << 2
3612 rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t3 >> 16) & 0xff) << 2
3613 rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t3 >> 24) & 0xff) << 2
3614
3615 // Use the pre-calculated column addresses.
3616 // Load pre-calculated table values.
3617 lwzx(t0, tc0, t0);
3618 lwzx(t1, tc1, t1);
3619 lwzx(t2, tc2, t2);
3620 lwzx(t3, tc3, t3);
3621
3622 // Calculate new crc from table values.
3623 xorr(t0, t0, t1);
3624 xorr(t2, t2, t3);
3625 xorr(crc, t0, t2); // Now crc contains the final checksum value.
3626 }
3627
3628 /**
3629 * @param crc register containing existing CRC (32-bit)
3630 * @param buf register pointing to input byte buffer (byte*)
3631 * @param len register containing number of bytes
3632 * @param table register pointing to CRC table
3633 *
3634 * Uses R9..R12 as work registers. Must be saved/restored by caller!
3635 */
3636 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
3637 Register t0, Register t1, Register t2, Register t3,
3638 Register tc0, Register tc1, Register tc2, Register tc3) {
3639 assert_different_registers(crc, buf, len, table);
3640
3641 Label L_mainLoop, L_tail;
3642 Register tmp = t0;
3643 Register data = t0;
3644 Register tmp2 = t1;
3645 const int mainLoop_stepping = 8;
3646 const int tailLoop_stepping = 1;
3647 const int log_stepping = exact_log2(mainLoop_stepping);
3648 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3649 const int complexThreshold = 2*mainLoop_stepping;
3650
3651 // Don't test for len <= 0 here. This pathological case should not occur anyway.
3652 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
3653 // The situation itself is detected and handled correctly by the conditional branches
3654 // following the length adjustments below.
3655 assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3656
3657 BLOCK_COMMENT("kernel_crc32_2word {");
3658
3659 nand(crc, crc, crc); // ~c
3660
3661 // Check for short (<mainLoop_stepping) buffer.
3662 cmpdi(CCR0, len, complexThreshold);
3663 blt(CCR0, L_tail);
3664
3665 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3666 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3667 {
3668 // Align buf addr to mainLoop_stepping boundary.
3669 neg(tmp2, buf); // Calculate # preLoop iterations for alignment.
3670 rldicl(tmp2, tmp2, 0, 64-log_stepping); // Keep only the low log_stepping bits (bits 61..63 here): #bytes to the next boundary.
3671
3672 if (complexThreshold > mainLoop_stepping) {
3673 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3674 } else {
3675 sub(tmp, len, tmp2); // Remaining bytes for main loop.
3676 cmpdi(CCR0, tmp, mainLoop_stepping);
3677 blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing
3678 mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3679 }
3680 update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
3681 }
3682
3683 srdi(tmp2, len, log_stepping); // #iterations for mainLoop
3684 andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop
3685 mtctr(tmp2);
3686
3687 #ifdef VM_LITTLE_ENDIAN
3688 Register crc_rv = crc;
3689 #else
3690 Register crc_rv = tmp; // Load_reverse needs separate registers to work on.
3691 // Occupies tmp, but frees up crc.
3692 load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data.
3693 tmp = crc;
3694 #endif
3695
3696 int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3697
3698 align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement.
3699 BIND(L_mainLoop);
3700 update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3701 update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3702 bdnz(L_mainLoop);
3703
3704 #ifndef VM_LITTLE_ENDIAN
3705 load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data.
3706 tmp = crc_rv; // Tmp uses its original register again.
3707 #endif
3708
3709 // Restore original table address for tailLoop.
3710 if (reconstructTableOffset != 0) {
3711 addi(table, table, -reconstructTableOffset);
3712 }
3713
3714 // Process last few (<complexThreshold) bytes of buffer.
3715 BIND(L_tail);
3716 update_byteLoop_crc32(crc, buf, len, table, data, false, false);
3717
3718 nand(crc, crc, crc); // ~c
3719 BLOCK_COMMENT("} kernel_crc32_2word");
3720 }
3721
3722 /**
3723 * @param crc register containing existing CRC (32-bit)
3724 * @param buf register pointing to input byte buffer (byte*)
3725 * @param len register containing number of bytes
3726 * @param table register pointing to CRC table
3727 *
3728 * Uses R9..R12 as work registers. Must be saved/restored by caller!
3729 */
3730 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
3731 Register t0, Register t1, Register t2, Register t3,
3732 Register tc0, Register tc1, Register tc2, Register tc3) {
3733 assert_different_registers(crc, buf, len, table);
3734
3735 Label L_mainLoop, L_tail;
3736 Register tmp = t0;
3737 Register data = t0;
3738 Register tmp2 = t1;
3739 const int mainLoop_stepping = 4;
3740 const int tailLoop_stepping = 1;
3741 const int log_stepping = exact_log2(mainLoop_stepping);
3742 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3743 const int complexThreshold = 2*mainLoop_stepping;
3744
3745 // Don't test for len <= 0 here. This pathological case should not occur anyway.
3746 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
3747 // The situation itself is detected and handled correctly by the conditional branches
3748 // following the length adjustments below.
3749 assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3750
3751 BLOCK_COMMENT("kernel_crc32_1word {");
3752
3753 nand(crc, crc, crc); // ~c
3754
3755 // Check for short (<mainLoop_stepping) buffer.
3756 cmpdi(CCR0, len, complexThreshold);
3757 blt(CCR0, L_tail);
3758
3759 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3760 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3761 {
3762 // Align buf addr to mainLoop_stepping boundary.
3763 neg(tmp2, buf); // Calculate # preLoop iterations for alignment.
3764 rldicl(tmp2, tmp2, 0, 64-log_stepping); // Keep only the low log_stepping bits (bits 62..63 here): #bytes to the next boundary.
3765
3766 if (complexThreshold > mainLoop_stepping) {
3767 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3768 } else {
3769 sub(tmp, len, tmp2); // Remaining bytes for main loop.
3770 cmpdi(CCR0, tmp, mainLoop_stepping);
3771 blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing
3772 mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3773 }
3774 update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
3775 }
3776
3777 srdi(tmp2, len, log_stepping); // #iterations for mainLoop
3778 andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop
3779 mtctr(tmp2);
3780
3781 #ifdef VM_LITTLE_ENDIAN
3782 Register crc_rv = crc;
3783 #else
3784 Register crc_rv = tmp; // Load_reverse needs separate registers to work on.
3785 // Occupies tmp, but frees up crc.
3786 load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data.
3787 tmp = crc;
3788 #endif
3789
3790 int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3791
3792 align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement.
3793 BIND(L_mainLoop);
3794 update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3795 bdnz(L_mainLoop);
3796
3797 #ifndef VM_LITTLE_ENDIAN
3798 load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data.
3799 tmp = crc_rv; // Tmp uses its original register again.
3800 #endif
3801
3802 // Restore original table address for tailLoop.
3803 if (reconstructTableOffset != 0) {
3804 addi(table, table, -reconstructTableOffset);
3805 }
3806
3807 // Process last few (<complexThreshold) bytes of buffer.
3808 BIND(L_tail);
3809 update_byteLoop_crc32(crc, buf, len, table, data, false, false);
3810
3811 nand(crc, crc, crc); // ~c
3812 BLOCK_COMMENT("} kernel_crc32_1word");
3813 }
3814
3815 /**
3816 * @param crc register containing existing CRC (32-bit)
3817 * @param buf register pointing to input byte buffer (byte*)
3818 * @param len register containing number of bytes
3819 * @param table register pointing to CRC table
3820 *
3821 * Uses R7_ARG5, R8_ARG6 as work registers.
3822 */
3823 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
3824 Register t0, Register t1, Register t2, Register t3) {
3825 assert_different_registers(crc, buf, len, table);
3826
3827 Register data = t0; // Holds the current byte to be folded into crc.
3828
3829 BLOCK_COMMENT("kernel_crc32_1byte {");
3830
3831 // Process all bytes in a single-byte loop.
3832 update_byteLoop_crc32(crc, buf, len, table, data, true, true);
3833
3834 BLOCK_COMMENT("} kernel_crc32_1byte");
3835 }
3836
3837 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp) {
3838 assert_different_registers(crc, buf, /* len, not used!! */ table, tmp);
3839
3840 BLOCK_COMMENT("kernel_crc32_singleByte:");
3841 nand(crc, crc, crc); // ~c
3842
3843 lbz(tmp, 0, buf); // Byte from buffer, zero-extended.
3844 update_byte_crc32(crc, tmp, table);
3845
3846 nand(crc, crc, crc); // ~c
3847 }
3848
3849 // dest_lo += src1 + src2
3850 // dest_hi += carry1 + carry2
3851 void MacroAssembler::add2_with_carry(Register dest_hi,
3852 Register dest_lo,
3853 Register src1, Register src2) {
3854 li(R0, 0);
3855 addc(dest_lo, dest_lo, src1);
3856 adde(dest_hi, dest_hi, R0);
3857 addc(dest_lo, dest_lo, src2);
3858 adde(dest_hi, dest_hi, R0);
3859 }
3860
3861 // Multiply 64 bit by 64 bit first loop.
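// For reference, multiply64() used below yields the full 128-bit product of
// two 64-bit values, low and high halves in separate registers. A C sketch of
// one loop step (assuming a compiler-provided unsigned __int128 type):
//
//   unsigned __int128 p = (unsigned __int128)x_xstart * y_idx + carry;
//   uint64_t z_kdx  = (uint64_t)p;         // stored to z[kdx]
//   uint64_t carry2 = (uint64_t)(p >> 64); // becomes the next carry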
3862 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, 3863 Register x_xstart, 3864 Register y, Register y_idx, 3865 Register z, 3866 Register carry, 3867 Register product_high, Register product, 3868 Register idx, Register kdx, 3869 Register tmp) { 3870 // jlong carry, x[], y[], z[]; 3871 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) { 3872 // huge_128 product = y[idx] * x[xstart] + carry; 3873 // z[kdx] = (jlong)product; 3874 // carry = (jlong)(product >>> 64); 3875 // } 3876 // z[xstart] = carry; 3877 3878 Label L_first_loop, L_first_loop_exit; 3879 Label L_one_x, L_one_y, L_multiply; 3880 3881 addic_(xstart, xstart, -1); 3882 blt(CCR0, L_one_x); // Special case: length of x is 1. 3883 3884 // Load next two integers of x. 3885 sldi(tmp, xstart, LogBytesPerInt); 3886 ldx(x_xstart, x, tmp); 3887 #ifdef VM_LITTLE_ENDIAN 3888 rldicl(x_xstart, x_xstart, 32, 0); 3889 #endif 3890 3891 align(32, 16); 3892 bind(L_first_loop); 3893 3894 cmpdi(CCR0, idx, 1); 3895 blt(CCR0, L_first_loop_exit); 3896 addi(idx, idx, -2); 3897 beq(CCR0, L_one_y); 3898 3899 // Load next two integers of y. 3900 sldi(tmp, idx, LogBytesPerInt); 3901 ldx(y_idx, y, tmp); 3902 #ifdef VM_LITTLE_ENDIAN 3903 rldicl(y_idx, y_idx, 32, 0); 3904 #endif 3905 3906 3907 bind(L_multiply); 3908 multiply64(product_high, product, x_xstart, y_idx); 3909 3910 li(tmp, 0); 3911 addc(product, product, carry); // Add carry to result. 3912 adde(product_high, product_high, tmp); // Add carry of the last addition. 3913 addi(kdx, kdx, -2); 3914 3915 // Store result. 3916 #ifdef VM_LITTLE_ENDIAN 3917 rldicl(product, product, 32, 0); 3918 #endif 3919 sldi(tmp, kdx, LogBytesPerInt); 3920 stdx(product, z, tmp); 3921 mr_if_needed(carry, product_high); 3922 b(L_first_loop); 3923 3924 3925 bind(L_one_y); // Load one 32 bit portion of y as (0,value). 3926 3927 lwz(y_idx, 0, y); 3928 b(L_multiply); 3929 3930 3931 bind(L_one_x); // Load one 32 bit portion of x as (0,value). 3932 3933 lwz(x_xstart, 0, x); 3934 b(L_first_loop); 3935 3936 bind(L_first_loop_exit); 3937 } 3938 3939 // Multiply 64 bit by 64 bit and add 128 bit. 3940 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, 3941 Register z, Register yz_idx, 3942 Register idx, Register carry, 3943 Register product_high, Register product, 3944 Register tmp, int offset) { 3945 3946 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry; 3947 // z[kdx] = (jlong)product; 3948 3949 sldi(tmp, idx, LogBytesPerInt); 3950 if (offset) { 3951 addi(tmp, tmp, offset); 3952 } 3953 ldx(yz_idx, y, tmp); 3954 #ifdef VM_LITTLE_ENDIAN 3955 rldicl(yz_idx, yz_idx, 32, 0); 3956 #endif 3957 3958 multiply64(product_high, product, x_xstart, yz_idx); 3959 ldx(yz_idx, z, tmp); 3960 #ifdef VM_LITTLE_ENDIAN 3961 rldicl(yz_idx, yz_idx, 32, 0); 3962 #endif 3963 3964 add2_with_carry(product_high, product, carry, yz_idx); 3965 3966 sldi(tmp, idx, LogBytesPerInt); 3967 if (offset) { 3968 addi(tmp, tmp, offset); 3969 } 3970 #ifdef VM_LITTLE_ENDIAN 3971 rldicl(product, product, 32, 0); 3972 #endif 3973 stdx(product, z, tmp); 3974 } 3975 3976 // Multiply 128 bit by 128 bit. Unrolled inner loop. 
3977 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, 3978 Register y, Register z, 3979 Register yz_idx, Register idx, Register carry, 3980 Register product_high, Register product, 3981 Register carry2, Register tmp) { 3982 3983 // jlong carry, x[], y[], z[]; 3984 // int kdx = ystart+1; 3985 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 3986 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry; 3987 // z[kdx+idx+1] = (jlong)product; 3988 // jlong carry2 = (jlong)(product >>> 64); 3989 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2; 3990 // z[kdx+idx] = (jlong)product; 3991 // carry = (jlong)(product >>> 64); 3992 // } 3993 // idx += 2; 3994 // if (idx > 0) { 3995 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry; 3996 // z[kdx+idx] = (jlong)product; 3997 // carry = (jlong)(product >>> 64); 3998 // } 3999 4000 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 4001 const Register jdx = R0; 4002 4003 // Scale the index. 4004 srdi_(jdx, idx, 2); 4005 beq(CCR0, L_third_loop_exit); 4006 mtctr(jdx); 4007 4008 align(32, 16); 4009 bind(L_third_loop); 4010 4011 addi(idx, idx, -4); 4012 4013 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8); 4014 mr_if_needed(carry2, product_high); 4015 4016 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0); 4017 mr_if_needed(carry, product_high); 4018 bdnz(L_third_loop); 4019 4020 bind(L_third_loop_exit); // Handle any left-over operand parts. 4021 4022 andi_(idx, idx, 0x3); 4023 beq(CCR0, L_post_third_loop_done); 4024 4025 Label L_check_1; 4026 4027 addic_(idx, idx, -2); 4028 blt(CCR0, L_check_1); 4029 4030 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0); 4031 mr_if_needed(carry, product_high); 4032 4033 bind(L_check_1); 4034 4035 addi(idx, idx, 0x2); 4036 andi_(idx, idx, 0x1) ; 4037 addic_(idx, idx, -1); 4038 blt(CCR0, L_post_third_loop_done); 4039 4040 sldi(tmp, idx, LogBytesPerInt); 4041 lwzx(yz_idx, y, tmp); 4042 multiply64(product_high, product, x_xstart, yz_idx); 4043 lwzx(yz_idx, z, tmp); 4044 4045 add2_with_carry(product_high, product, yz_idx, carry); 4046 4047 sldi(tmp, idx, LogBytesPerInt); 4048 stwx(product, z, tmp); 4049 srdi(product, product, 32); 4050 4051 sldi(product_high, product_high, 32); 4052 orr(product, product, product_high); 4053 mr_if_needed(carry, product); 4054 4055 bind(L_post_third_loop_done); 4056 } // multiply_128_x_128_loop 4057 4058 void MacroAssembler::multiply_to_len(Register x, Register xlen, 4059 Register y, Register ylen, 4060 Register z, Register zlen, 4061 Register tmp1, Register tmp2, 4062 Register tmp3, Register tmp4, 4063 Register tmp5, Register tmp6, 4064 Register tmp7, Register tmp8, 4065 Register tmp9, Register tmp10, 4066 Register tmp11, Register tmp12, 4067 Register tmp13) { 4068 4069 ShortBranchVerifier sbv(this); 4070 4071 assert_different_registers(x, xlen, y, ylen, z, zlen, 4072 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6); 4073 assert_different_registers(x, xlen, y, ylen, z, zlen, 4074 tmp1, tmp2, tmp3, tmp4, tmp5, tmp7); 4075 assert_different_registers(x, xlen, y, ylen, z, zlen, 4076 tmp1, tmp2, tmp3, tmp4, tmp5, tmp8); 4077 4078 const Register idx = tmp1; 4079 const Register kdx = tmp2; 4080 const Register xstart = tmp3; 4081 4082 const Register y_idx = tmp4; 4083 const Register carry = tmp5; 4084 const Register product = tmp6; 4085 const Register product_high = tmp7; 4086 const Register x_xstart = tmp8; 4087 const Register tmp = 
tmp9;
4088
4089 // First Loop.
4090 //
4091 // final static long LONG_MASK = 0xffffffffL;
4092 // int xstart = xlen - 1;
4093 // int ystart = ylen - 1;
4094 // long carry = 0;
4095 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4096 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
4097 // z[kdx] = (int)product;
4098 // carry = product >>> 32;
4099 // }
4100 // z[xstart] = (int)carry;
4101
4102 mr_if_needed(idx, ylen); // idx = ylen
4103 mr_if_needed(kdx, zlen); // kdx = xlen + ylen
4104 li(carry, 0); // carry = 0
4105
4106 Label L_done;
4107
4108 addic_(xstart, xlen, -1);
4109 blt(CCR0, L_done);
4110
4111 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
4112 carry, product_high, product, idx, kdx, tmp);
4113
4114 Label L_second_loop;
4115
4116 cmpdi(CCR0, kdx, 0);
4117 beq(CCR0, L_second_loop);
4118
4119 Label L_carry;
4120
4121 addic_(kdx, kdx, -1);
4122 beq(CCR0, L_carry);
4123
4124 // Store lower 32 bits of carry.
4125 sldi(tmp, kdx, LogBytesPerInt);
4126 stwx(carry, z, tmp);
4127 srdi(carry, carry, 32);
4128 addi(kdx, kdx, -1);
4129
4130
4131 bind(L_carry);
4132
4133 // Store upper 32 bits of carry.
4134 sldi(tmp, kdx, LogBytesPerInt);
4135 stwx(carry, z, tmp);
4136
4137 // Second and third (nested) loops.
4138 //
4139 // for (int i = xstart-1; i >= 0; i--) { // Second loop
4140 // carry = 0;
4141 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
4142 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
4143 // (z[k] & LONG_MASK) + carry;
4144 // z[k] = (int)product;
4145 // carry = product >>> 32;
4146 // }
4147 // z[i] = (int)carry;
4148 // }
4149 //
4150 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = x_xstart
4151
4152 bind(L_second_loop);
4153
4154 li(carry, 0); // carry = 0;
4155
4156 addic_(xstart, xstart, -1); // i = xstart-1;
4157 blt(CCR0, L_done);
4158
4159 Register zsave = tmp10;
4160
4161 mr(zsave, z);
4162
4163
4164 Label L_last_x;
4165
4166 sldi(tmp, xstart, LogBytesPerInt);
4167 add(z, z, tmp); // z = z + k - j
4168 addi(z, z, 4);
4169 addic_(xstart, xstart, -1); // i = xstart-1;
4170 blt(CCR0, L_last_x);
4171
4172 sldi(tmp, xstart, LogBytesPerInt);
4173 ldx(x_xstart, x, tmp);
4174 #ifdef VM_LITTLE_ENDIAN
4175 rldicl(x_xstart, x_xstart, 32, 0);
4176 #endif
4177
4178
4179 Label L_third_loop_prologue;
4180
4181 bind(L_third_loop_prologue);
4182
4183 Register xsave = tmp11;
4184 Register xlensave = tmp12;
4185 Register ylensave = tmp13;
4186
4187 mr(xsave, x);
4188 mr(xlensave, xstart);
4189 mr(ylensave, ylen);
4190
4191
4192 multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
4193 carry, product_high, product, x, tmp);
4194
4195 mr(z, zsave);
4196 mr(x, xsave);
4197 mr(xlen, xlensave); // This is the decrement of the loop counter!
4198 mr(ylen, ylensave);
4199
4200 addi(tmp3, xlen, 1);
4201 sldi(tmp, tmp3, LogBytesPerInt);
4202 stwx(carry, z, tmp);
4203 addic_(tmp3, tmp3, -1);
4204 blt(CCR0, L_done);
4205
4206 srdi(carry, carry, 32);
4207 sldi(tmp, tmp3, LogBytesPerInt);
4208 stwx(carry, z, tmp);
4209 b(L_second_loop);
4210
4211 // Next infrequent code is moved outside loops.
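// L_last_x handles the case where only a single 32-bit digit of x remains:
// the lwz below zero-extends it, so x_xstart becomes (0, value).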
4212 bind(L_last_x); 4213 4214 lwz(x_xstart, 0, x); 4215 b(L_third_loop_prologue); 4216 4217 bind(L_done); 4218 } // multiply_to_len 4219 4220 void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) { 4221 #ifdef ASSERT 4222 Label ok; 4223 if (check_equal) { 4224 beq(CCR0, ok); 4225 } else { 4226 bne(CCR0, ok); 4227 } 4228 stop(msg, id); 4229 bind(ok); 4230 #endif 4231 } 4232 4233 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset, 4234 Register mem_base, const char* msg, int id) { 4235 #ifdef ASSERT 4236 switch (size) { 4237 case 4: 4238 lwz(R0, mem_offset, mem_base); 4239 cmpwi(CCR0, R0, 0); 4240 break; 4241 case 8: 4242 ld(R0, mem_offset, mem_base); 4243 cmpdi(CCR0, R0, 0); 4244 break; 4245 default: 4246 ShouldNotReachHere(); 4247 } 4248 asm_assert(check_equal, msg, id); 4249 #endif // ASSERT 4250 } 4251 4252 void MacroAssembler::verify_thread() { 4253 if (VerifyThread) { 4254 unimplemented("'VerifyThread' currently not implemented on PPC"); 4255 } 4256 } 4257 4258 // READ: oop. KILL: R0. Volatile floats perhaps. 4259 void MacroAssembler::verify_oop(Register oop, const char* msg) { 4260 if (!VerifyOops) { 4261 return; 4262 } 4263 4264 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); 4265 const Register tmp = R11; // Will be preserved. 4266 const int nbytes_save = 11*8; // Volatile gprs except R0. 4267 save_volatile_gprs(R1_SP, -nbytes_save); // except R0 4268 4269 if (oop == tmp) mr(R4_ARG2, oop); 4270 save_LR_CR(tmp); // save in old frame 4271 push_frame_reg_args(nbytes_save, tmp); 4272 // load FunctionDescriptor** / entry_address * 4273 load_const_optimized(tmp, fd, R0); 4274 // load FunctionDescriptor* / entry_address 4275 ld(tmp, 0, tmp); 4276 if (oop != tmp) mr_if_needed(R4_ARG2, oop); 4277 load_const_optimized(R3_ARG1, (address)msg, R0); 4278 // Call destination for its side effect. 4279 call_c(tmp); 4280 4281 pop_frame(); 4282 restore_LR_CR(tmp); 4283 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 4284 } 4285 4286 const char* stop_types[] = { 4287 "stop", 4288 "untested", 4289 "unimplemented", 4290 "shouldnotreachhere" 4291 }; 4292 4293 static void stop_on_request(int tp, const char* msg) { 4294 tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg); 4295 guarantee(false, "PPC assembly code requires stop: %s", msg); 4296 } 4297 4298 // Call a C-function that prints output. 4299 void MacroAssembler::stop(int type, const char* msg, int id) { 4300 #ifndef PRODUCT 4301 block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg)); 4302 #else 4303 block_comment("stop {"); 4304 #endif 4305 4306 // setup arguments 4307 load_const_optimized(R3_ARG1, type); 4308 load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0); 4309 call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2); 4310 illtrap(); 4311 emit_int32(id); 4312 block_comment("} stop;"); 4313 } 4314 4315 #ifndef PRODUCT 4316 // Write pattern 0x0101010101010101 in memory region [low-before, high+after]. 4317 // Val, addr are temp registers. 4318 // If low == addr, addr is killed. 4319 // High is preserved. 
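// Only active when ZapMemory is set; gives the zapped storage a recognizable
// 0x01 byte pattern when debugging.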
4320 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) { 4321 if (!ZapMemory) return; 4322 4323 assert_different_registers(low, val); 4324 4325 BLOCK_COMMENT("zap memory region {"); 4326 load_const_optimized(val, 0x0101010101010101); 4327 int size = before + after; 4328 if (low == high && size < 5 && size > 0) { 4329 int offset = -before*BytesPerWord; 4330 for (int i = 0; i < size; ++i) { 4331 std(val, offset, low); 4332 offset += (1*BytesPerWord); 4333 } 4334 } else { 4335 addi(addr, low, -before*BytesPerWord); 4336 assert_different_registers(high, val); 4337 if (after) addi(high, high, after * BytesPerWord); 4338 Label loop; 4339 bind(loop); 4340 std(val, 0, addr); 4341 addi(addr, addr, 8); 4342 cmpd(CCR6, addr, high); 4343 ble(CCR6, loop); 4344 if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value. 4345 } 4346 BLOCK_COMMENT("} zap memory region"); 4347 } 4348 4349 #endif // !PRODUCT 4350 4351 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() { 4352 int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true); 4353 assert(sizeof(bool) == 1, "PowerPC ABI"); 4354 masm->lbz(temp, simm16_offset, temp); 4355 masm->cmpwi(CCR0, temp, 0); 4356 masm->beq(CCR0, _label); 4357 } 4358 4359 SkipIfEqualZero::~SkipIfEqualZero() { 4360 _masm->bind(_label); 4361 }
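// Typical use of SkipIfEqualZero (a sketch; the flag name is illustrative):
//
//   {
//     SkipIfEqualZero skip(masm, R11_scratch1, &SomeBoolFlag);
//     // ... code emitted here is executed only if SomeBoolFlag is true ...
//   } // The destructor binds the label the constructor branched to.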