/*
 * Copyright (c) 1997, 2017, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2017, SAP SE. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "compiler/disassembler.hpp"
#include "gc/shared/cardTableModRefBS.hpp"
#include "gc/shared/collectedHeap.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_ppc.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/os.hpp"
#include "runtime/safepoint.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/macros.hpp"
#if INCLUDE_ALL_GCS
#include "gc/g1/g1CollectedHeap.inline.hpp"
#include "gc/g1/g1SATBCardTableModRefBS.hpp"
#include "gc/g1/heapRegion.hpp"
#endif // INCLUDE_ALL_GCS
#ifdef COMPILER2
#include "opto/intrinsicnode.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) // nothing
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef ASSERT
// On RISC, there's no benefit to verifying instruction boundaries.
bool AbstractAssembler::pd_check_instruction_mark() { return false; }
#endif

void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  if (Assembler::is_simm(si31, 16)) {
    ld(d, si31, a);
    if (emit_filler_nop) nop();
  } else {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
    addis(d, a, hi);
    ld(d, lo, d);
  }
}

void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  assert_different_registers(d, a);
  ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
}

void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                      size_t size_in_bytes, bool is_signed) {
  switch (size_in_bytes) {
  case  8: ld(dst, offs, base);                                     break;
  case  4: is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  case  2: is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  case  1: lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  default: ShouldNotReachHere();
  }
}
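
// Example (illustrative): load_sized_value(dst, 12, base, 2, /*is_signed=*/true)
// emits a single sign-extending halfword load,
//   lha dst, 12(base)
// while a signed byte needs two instructions, because no sign-extending
// byte load exists on PPC64:
//   lbz   dst, 12(base)
//   extsb dst, dst
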
void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                       size_t size_in_bytes) {
  switch (size_in_bytes) {
  case  8: std(dst, offs, base); break;
  case  4: stw(dst, offs, base); break;
  case  2: sth(dst, offs, base); break;
  case  1: stb(dst, offs, base); break;
  default: ShouldNotReachHere();
  }
}

void MacroAssembler::align(int modulus, int max, int rem) {
  int padding = (rem + modulus - (offset() % modulus)) % modulus;
  if (padding > max) return;
  for (int c = (padding >> 2); c > 0; --c) { nop(); }
}

// Issue instructions that calculate given TOC from global TOC.
void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
                                                       bool add_relocation, bool emit_dummy_addr) {
  int offset = -1;
  if (emit_dummy_addr) {
    offset = -128; // dummy address
  } else if (addr != (address)(intptr_t)-1) {
    offset = MacroAssembler::offset_to_global_toc(addr);
  }

  if (hi16) {
    addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
  }
  if (lo16) {
    if (add_relocation) {
      // Relocate at the addi to avoid confusion with a load from the method's TOC.
      relocate(internal_word_Relocation::spec(addr));
    }
    addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
  }
}

address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
  const int offset = MacroAssembler::offset_to_global_toc(addr);

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
  set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
  set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
  return inst1_addr;
}

address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");

  int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
  // -1 is a special case.
  if (offset == -1) {
    return (address)(intptr_t)-1;
  } else {
    return global_toc() + offset;
  }
}
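
// Example (illustrative): largeoffset_si16_si16_hi/_lo split an offset so
// that the sign-extending addi immediate is compensated by rounding the
// high half up. For offset 0x12A987:
//   hi16 = (0x12A987 + 0x8000) >> 16 = 0x13
//   lo16 = 0x12A987 & 0xffff         = 0xA987 (= -0x5679 as simm16)
// giving
//   addis dst, R29_TOC, 0x13
//   addi  dst, dst, -0x5679    // 0x130000 - 0x5679 = 0x12A987
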
#ifdef _LP64
// Patch compressed oops or klass constants.
// Assembler sequence is
// 1) compressed oops:
//    lis  rx = const.hi
//    ori  rx = rx | const.lo
// 2) compressed klass:
//    lis    rx = const.hi
//    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
//    ori    rx = rx | const.lo
// Clrldi will be passed by.
address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;
  while (inst1_addr >= bound) {
    inst1 = *(int *)inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  int xc = (data >> 16) & 0xffff;
  int xd = (data >>  0) & 0xffff;

  set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
  set_imm((int *)inst2_addr, (xd));        // unsigned int
  return inst1_addr;
}

// Get compressed oop or klass constant.
narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;

  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
  uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);

  return (int) (xl | xh);
}
#endif // _LP64
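
// Example (illustrative): patch_set_narrow_oop with data == 0x0004a7b8
// rewrites the two immediates of the sequence above in place:
//   lis rx, 0x0004        // set_imm(inst1_addr, data >> 16)
//   ori rx, rx, 0xa7b8    // set_imm(inst2_addr, data & 0xffff)
// Note that this routine itself does not flush the instruction cache.
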
// Returns true if successful.
bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
                                                Register toc, bool fixed_size) {
  int toc_offset = 0;
  // Use RelocationHolder::none for the constant pool entry, otherwise
  // we will end up with a failing NativeCall::verify(x) where x is
  // the address of the constant pool entry.
  // FIXME: We should insert relocation information for oops at the constant
  // pool entries instead of inserting it at the loads; patching of a constant
  // pool entry should be less expensive.
  address const_address = address_constant((address)a.value(), RelocationHolder::none);
  if (const_address == NULL) { return false; } // allocation failure
  // Relocate at the pc of the load.
  relocate(a.rspec());
  toc_offset = (int)(const_address - code()->consts()->start());
  ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
  return true;
}

bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  // The relocation points to the ld or the addis.
  return (is_ld(inst1)) ||
         (is_addis(inst1) && inv_ra_field(inst1) != 0);
}

int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
  assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");

  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  if (is_ld(inst1)) {
    return inv_d1_field(inst1);
  } else if (is_addis(inst1)) {
    const int dst = inv_rt_field(inst1);

    // Now, find the succeeding ld which reads and writes to dst.
    address inst2_addr = inst1_addr + BytesPerInstWord;
    int inst2 = 0;
    while (true) {
      inst2 = *(int *) inst2_addr;
      if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
        // Stop, found the ld which reads and writes dst.
        break;
      }
      inst2_addr += BytesPerInstWord;
    }
    return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
  }
  ShouldNotReachHere();
  return 0;
}

// Get the constant from a `load_const' sequence.
long MacroAssembler::get_const(address a) {
  assert(is_load_const_at(a), "not a load of a constant");
  const int *p = (const int*) a;
  unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
  if (is_ori(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
  } else if (is_lis(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
  } else {
    ShouldNotReachHere();
    return (long) 0;
  }
  return (long) x;
}
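
// Example (illustrative): for the ori-variant of a `load_const' sequence,
//   lis  d, A        // slot 0: bits 63..48
//   ori  d, d, B     // slot 1: bits 47..32
//   sldi d, d, 32    // slot 2: no immediate
//   oris d, d, C     // slot 3: bits 31..16
//   ori  d, d, D     // slot 4: bits 15..0
// get_const reassembles (A << 48) | (B << 32) | (C << 16) | D, which is why
// it reads the immediates at instruction slots 0, 1, 3 and 4.
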
// Patch the 64 bit constant of a `load_const' sequence. This is a low-level
// procedure. It neither flushes the instruction cache nor is it MT-safe.
void MacroAssembler::patch_const(address a, long x) {
  assert(is_load_const_at(a), "not a load of a constant");
  int *p = (int*) a;
  if (is_ori(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(1 + p, (x >> 32) & 0xffff);
    set_imm(3 + p, (x >> 16) & 0xffff);
    set_imm(4 + p, x & 0xffff);
  } else if (is_lis(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(2 + p, (x >> 32) & 0xffff);
    set_imm(1 + p, (x >> 16) & 0xffff);
    set_imm(3 + p, x & 0xffff);
  } else {
    ShouldNotReachHere();
  }
}

AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->allocate_metadata_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->allocate_oop_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp, int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0) {
    return RegisterOrConstant(value + offset);
  }

  // Load indirectly to solve generation ordering problem.
  // static address, no relocation
  int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
  ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)

  if (offset != 0) {
    addi(tmp, tmp, offset);
  }

  return RegisterOrConstant(tmp);
}

#ifndef PRODUCT
void MacroAssembler::pd_print_patched_instruction(address branch) {
  Unimplemented(); // TODO: PPC port
}
#endif // ndef PRODUCT

// Conditional far branch for destinations encodable in 24+2 bits.
void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {

  // If requested by flag optimize, relocate the bc_far as a
  // runtime_call and prepare for optimizing it when the code gets
  // relocated.
  if (optimize == bc_far_optimize_on_relocate) {
    relocate(relocInfo::runtime_call_type);
  }

  // variant 2:
  //
  //    b!cxx SKIP
  //    bxx   DEST
  //  SKIP:
  //

  const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                opposite_bcond(inv_boint_bcond(boint)));

  // We emit two branches.
  // First, a conditional branch which jumps around the far branch.
  const address not_taken_pc = pc() + 2 * BytesPerInstWord;
  const address bc_pc        = pc();
  bc(opposite_boint, biint, not_taken_pc);

  const int bc_instr = *(int*)bc_pc;
  assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
  assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
  assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
                                     opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
         "postcondition");
  assert(biint == inv_bi_field(bc_instr), "postcondition");

  // Second, an unconditional far branch which jumps to dest.
  // Note: target(dest) remembers the current pc (see CodeSection::target)
  // and returns the current pc if the label is not bound yet; when
  // the label gets bound, the unconditional far branch will be patched.
  const address target_pc = target(dest);
  const address b_pc      = pc();
  b(target_pc);

  assert(not_taken_pc == pc(),                 "postcondition");
  assert(dest.is_bound() || target_pc == b_pc, "postcondition");
}

// 1 or 2 instructions
void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
  if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
    bc(boint, biint, dest);
  } else {
    bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
  }
}

bool MacroAssembler::is_bc_far_at(address instruction_addr) {
  return is_bc_far_variant1_at(instruction_addr) ||
         is_bc_far_variant2_at(instruction_addr) ||
         is_bc_far_variant3_at(instruction_addr);
}

address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
  if (is_bc_far_variant1_at(instruction_addr)) {
    const address instruction_1_addr = instruction_addr;
    const int instruction_1 = *(int*)instruction_1_addr;
    return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
  } else if (is_bc_far_variant2_at(instruction_addr)) {
    const address instruction_2_addr = instruction_addr + 4;
    return bxx_destination(instruction_2_addr);
  } else if (is_bc_far_variant3_at(instruction_addr)) {
    return instruction_addr + 8;
  }
  // variant 4 ???
  ShouldNotReachHere();
  return NULL;
}
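
// Illustrative summary of the two-instruction layouts a bc_far site can
// take after patching (see set_dest_of_bc_far_at below):
//   variant 1:  bcxx  DEST / nop       - DEST within bcxx range
//   variant 2:  b!cxx SKIP / bxx DEST  - far DEST
//   variant 3:  nop / endgroup         - branch to the next instruction
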
void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {

  if (is_bc_far_variant3_at(instruction_addr)) {
    // variant 3, far cond branch to the next instruction, already patched to nops:
    //
    //    nop
    //    endgroup
    //  SKIP/DEST:
    //
    return;
  }

  // first, extract boint and biint from the current branch
  int boint = 0;
  int biint = 0;

  ResourceMark rm;
  const int code_size = 2 * BytesPerInstWord;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
    // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
    masm.nop();
    masm.endgroup();
  } else {
    if (is_bc_far_variant1_at(instruction_addr)) {
      // variant 1, the 1st instruction contains the destination address:
      //
      //    bcxx  DEST
      //    nop
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = inv_bo_field(instruction_1);
      biint = inv_bi_field(instruction_1);
    } else if (is_bc_far_variant2_at(instruction_addr)) {
      // variant 2, the 2nd instruction contains the destination address:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
                                 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
      biint = inv_bi_field(instruction_1);
    } else {
      // variant 4???
      ShouldNotReachHere();
    }

    // second, set the new branch destination and optimize the code
    if (dest != instruction_addr + 4 && // the bc_far is still unbound!
        masm.is_within_range_of_bcxx(dest, instruction_addr)) {
      // variant 1:
      //
      //    bcxx  DEST
      //    nop
      //
      masm.bc(boint, biint, dest);
      masm.nop();
    } else {
      // variant 2:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                    opposite_bcond(inv_boint_bcond(boint)));
      const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
      masm.bc(opposite_boint, biint, not_taken_pc);
      masm.b(dest);
    }
  }
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Emit a NOT MT-safe patchable 64 bit absolute call/jump.
void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
  // get current pc
  uint64_t start_pc = (uint64_t) pc();

  const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
  const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first

  // relocate here
  if (rt != relocInfo::none) {
    relocate(rt);
  }

  if ( ReoptimizeCallSequences &&
       (( link && is_within_range_of_b(dest, pc_of_bl)) ||
        (!link && is_within_range_of_b(dest, pc_of_b)))) {
    // variant 2:
    // Emit an optimized, pc-relative call/jump.

    if (link) {
      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();

      // do the call
      assert(pc() == pc_of_bl, "just checking");
      bl(dest, relocInfo::none);
    } else {
      // do the jump
      assert(pc() == pc_of_b, "just checking");
      b(dest, relocInfo::none);

      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();
    }

    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
           "can't identify emitted call");
  } else {
    // variant 1:
    mr(R0, R11); // spill R11 -> R0.

    // Load the destination address into CTR,
    // calculate destination relative to global toc.
    calculate_address_from_global_toc(R11, dest, true, true, false);

    mtctr(R11);
    mr(R11, R0); // spill R11 <- R0.
    nop();

    // do the call/jump
    if (link) {
      bctrl();
    } else {
      bctr();
    }
    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
           "can't identify emitted call");
  }

  // Assert that we can identify the emitted call/jump.
  assert(is_bxx64_patchable_at((address)start_pc, link),
         "can't identify emitted call");
  assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
         "wrong encoding of dest address");
}
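
// Illustrative layout of the two bxx64_patchable encodings; both fill the
// same seven instruction words, so a call site can be repatched between them:
//   variant 1b (toc-relative):
//     mr R0,R11; addis R11,R29_TOC,hi; addi R11,R11,lo; mtctr R11;
//     mr R11,R0; nop; bctr[l]
//   variant 2 (pc-relative):
//     link:  nop; nop; nop; nop; nop; nop; bl dest
//     !link: b dest; nop; nop; nop; nop; nop; nop
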
// Identify a bxx64_patchable instruction.
bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
  return is_bxx64_patchable_variant1b_at(instruction_addr, link)
    //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
      || is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Does the call64_patchable instruction use a pc-relative encoding of
// the call destination?
bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
  // variant 2 is pc-relative
  return is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Identify variant 1.
bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[5])                              // mtctr
      && is_load_const_at(instruction_addr);
}

// Identify variant 1b: load destination relative to global toc.
bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[3])                              // mtctr
      && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
}

// Identify variant 2.
bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  if (link) {
    return is_bl (instr[6])  // bl dest is last
        && is_nop(instr[0])  // nop
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5]); // nop
  } else {
    return is_b  (instr[0])  // b dest is first
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5])  // nop
        && is_nop(instr[6]); // nop
  }
}

// Set dest address of a bxx64_patchable instruction.
void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
  ResourceMark rm;
  int code_size = MacroAssembler::bxx64_patchable_size;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  masm.bxx64_patchable(dest, relocInfo::none, link);
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}
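
// Example (illustrative) of repointing a patched call site, assuming
// `call_site' addresses the first of the seven instruction words:
//   MacroAssembler::set_dest_of_bxx64_patchable_at(call_site, new_dest, /*link=*/true);
// The CodeBuffer is laid directly over the old instructions, bxx64_patchable
// re-emits the whole sequence, and the icache flush publishes it. As noted
// above, the emitter is not MT-safe, so concurrent executors must be
// excluded by other means.
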
// Get dest address of a bxx64_patchable instruction.
address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
  if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
    return (address) (unsigned long) get_const(instruction_addr);
  } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
    unsigned int* instr = (unsigned int*) instruction_addr;
    if (link) {
      const int instr_idx = 6; // bl is last
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    } else {
      const int instr_idx = 0; // b is first
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    }
  // Load dest relative to global toc.
  } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
    return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
                                                               instruction_addr);
  } else {
    ShouldNotReachHere();
    return NULL;
  }
}

// Uses ordering which corresponds to ABI:
//    _savegpr0_14:  std  r14,-144(r1)
//    _savegpr0_15:  std  r15,-136(r1)
//    _savegpr0_16:  std  r16,-128(r1)
void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
  std(R14, offset, dst);   offset += 8;
  std(R15, offset, dst);   offset += 8;
  std(R16, offset, dst);   offset += 8;
  std(R17, offset, dst);   offset += 8;
  std(R18, offset, dst);   offset += 8;
  std(R19, offset, dst);   offset += 8;
  std(R20, offset, dst);   offset += 8;
  std(R21, offset, dst);   offset += 8;
  std(R22, offset, dst);   offset += 8;
  std(R23, offset, dst);   offset += 8;
  std(R24, offset, dst);   offset += 8;
  std(R25, offset, dst);   offset += 8;
  std(R26, offset, dst);   offset += 8;
  std(R27, offset, dst);   offset += 8;
  std(R28, offset, dst);   offset += 8;
  std(R29, offset, dst);   offset += 8;
  std(R30, offset, dst);   offset += 8;
  std(R31, offset, dst);   offset += 8;

  stfd(F14, offset, dst);   offset += 8;
  stfd(F15, offset, dst);   offset += 8;
  stfd(F16, offset, dst);   offset += 8;
  stfd(F17, offset, dst);   offset += 8;
  stfd(F18, offset, dst);   offset += 8;
  stfd(F19, offset, dst);   offset += 8;
  stfd(F20, offset, dst);   offset += 8;
  stfd(F21, offset, dst);   offset += 8;
  stfd(F22, offset, dst);   offset += 8;
  stfd(F23, offset, dst);   offset += 8;
  stfd(F24, offset, dst);   offset += 8;
  stfd(F25, offset, dst);   offset += 8;
  stfd(F26, offset, dst);   offset += 8;
  stfd(F27, offset, dst);   offset += 8;
  stfd(F28, offset, dst);   offset += 8;
  stfd(F29, offset, dst);   offset += 8;
  stfd(F30, offset, dst);   offset += 8;
  stfd(F31, offset, dst);
}

// Uses ordering which corresponds to ABI:
//    _restgpr0_14:  ld   r14,-144(r1)
//    _restgpr0_15:  ld   r15,-136(r1)
//    _restgpr0_16:  ld   r16,-128(r1)
void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
  ld(R14, offset, src);   offset += 8;
  ld(R15, offset, src);   offset += 8;
  ld(R16, offset, src);   offset += 8;
  ld(R17, offset, src);   offset += 8;
  ld(R18, offset, src);   offset += 8;
  ld(R19, offset, src);   offset += 8;
  ld(R20, offset, src);   offset += 8;
  ld(R21, offset, src);   offset += 8;
  ld(R22, offset, src);   offset += 8;
  ld(R23, offset, src);   offset += 8;
  ld(R24, offset, src);   offset += 8;
  ld(R25, offset, src);   offset += 8;
  ld(R26, offset, src);   offset += 8;
  ld(R27, offset, src);   offset += 8;
  ld(R28, offset, src);   offset += 8;
  ld(R29, offset, src);   offset += 8;
  ld(R30, offset, src);   offset += 8;
  ld(R31, offset, src);   offset += 8;

  // FP registers
  lfd(F14, offset, src);   offset += 8;
  lfd(F15, offset, src);   offset += 8;
  lfd(F16, offset, src);   offset += 8;
  lfd(F17, offset, src);   offset += 8;
  lfd(F18, offset, src);   offset += 8;
  lfd(F19, offset, src);   offset += 8;
  lfd(F20, offset, src);   offset += 8;
  lfd(F21, offset, src);   offset += 8;
  lfd(F22, offset, src);   offset += 8;
  lfd(F23, offset, src);   offset += 8;
  lfd(F24, offset, src);   offset += 8;
  lfd(F25, offset, src);   offset += 8;
  lfd(F26, offset, src);   offset += 8;
  lfd(F27, offset, src);   offset += 8;
  lfd(F28, offset, src);   offset += 8;
  lfd(F29, offset, src);   offset += 8;
  lfd(F30, offset, src);   offset += 8;
  lfd(F31, offset, src);
}

// For verify_oops.
void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
  std(R2,  offset, dst);   offset += 8;
  std(R3,  offset, dst);   offset += 8;
  std(R4,  offset, dst);   offset += 8;
  std(R5,  offset, dst);   offset += 8;
  std(R6,  offset, dst);   offset += 8;
  std(R7,  offset, dst);   offset += 8;
  std(R8,  offset, dst);   offset += 8;
  std(R9,  offset, dst);   offset += 8;
  std(R10, offset, dst);   offset += 8;
  std(R11, offset, dst);   offset += 8;
  std(R12, offset, dst);   offset += 8;

  stfd(F0,  offset, dst);   offset += 8;
  stfd(F1,  offset, dst);   offset += 8;
  stfd(F2,  offset, dst);   offset += 8;
  stfd(F3,  offset, dst);   offset += 8;
  stfd(F4,  offset, dst);   offset += 8;
  stfd(F5,  offset, dst);   offset += 8;
  stfd(F6,  offset, dst);   offset += 8;
  stfd(F7,  offset, dst);   offset += 8;
  stfd(F8,  offset, dst);   offset += 8;
  stfd(F9,  offset, dst);   offset += 8;
  stfd(F10, offset, dst);   offset += 8;
  stfd(F11, offset, dst);   offset += 8;
  stfd(F12, offset, dst);   offset += 8;
  stfd(F13, offset, dst);
}

// For verify_oops.
void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
  ld(R2,  offset, src);   offset += 8;
  ld(R3,  offset, src);   offset += 8;
  ld(R4,  offset, src);   offset += 8;
  ld(R5,  offset, src);   offset += 8;
  ld(R6,  offset, src);   offset += 8;
  ld(R7,  offset, src);   offset += 8;
  ld(R8,  offset, src);   offset += 8;
  ld(R9,  offset, src);   offset += 8;
  ld(R10, offset, src);   offset += 8;
  ld(R11, offset, src);   offset += 8;
  ld(R12, offset, src);   offset += 8;

  lfd(F0,  offset, src);   offset += 8;
  lfd(F1,  offset, src);   offset += 8;
  lfd(F2,  offset, src);   offset += 8;
  lfd(F3,  offset, src);   offset += 8;
  lfd(F4,  offset, src);   offset += 8;
  lfd(F5,  offset, src);   offset += 8;
  lfd(F6,  offset, src);   offset += 8;
  lfd(F7,  offset, src);   offset += 8;
  lfd(F8,  offset, src);   offset += 8;
  lfd(F9,  offset, src);   offset += 8;
  lfd(F10, offset, src);   offset += 8;
  lfd(F11, offset, src);   offset += 8;
  lfd(F12, offset, src);   offset += 8;
  lfd(F13, offset, src);
}
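
// Illustrative sizing for the save/restore pairs above: save_nonvolatile_gprs
// covers R14..R31 and F14..F31, i.e. (18 + 18) * 8 = 288 contiguous bytes,
// while save_volatile_gprs covers R2..R12 and F0..F13, i.e.
// (11 + 14) * 8 = 200 bytes, starting at `offset' in both cases.
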
void MacroAssembler::save_LR_CR(Register tmp) {
  mfcr(tmp);
  std(tmp, _abi(cr), R1_SP);
  mflr(tmp);
  std(tmp, _abi(lr), R1_SP);
  // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
}

void MacroAssembler::restore_LR_CR(Register tmp) {
  assert(tmp != R1_SP, "must be distinct");
  ld(tmp, _abi(lr), R1_SP);
  mtlr(tmp);
  ld(tmp, _abi(cr), R1_SP);
  mtcr(tmp);
}

address MacroAssembler::get_PC_trash_LR(Register result) {
  Label L;
  bl(L);
  bind(L);
  address lr_pc = pc();
  mflr(result);
  return lr_pc;
}

void MacroAssembler::resize_frame(Register offset, Register tmp) {
#ifdef ASSERT
  assert_different_registers(offset, tmp, R1_SP);
  andi_(tmp, offset, frame::alignment_in_bytes-1);
  asm_assert_eq("resize_frame: unaligned", 0x204);
#endif

  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdux(tmp, R1_SP, offset);
}

void MacroAssembler::resize_frame(int offset, Register tmp) {
  assert(is_simm(offset, 16), "too big an offset");
  assert_different_registers(tmp, R1_SP);
  assert((offset & (frame::alignment_in_bytes-1)) == 0, "resize_frame: unaligned");
  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdu(tmp, offset, R1_SP);
}

void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
  // (addr == tmp1) || (addr == tmp2) is allowed here!
  assert(tmp1 != tmp2, "must be distinct");

  // compute offset w.r.t. current stack pointer
  // tmp_1 <- addr - SP (!)
  subf(tmp1, R1_SP, addr);

  // atomically update SP keeping back link.
  resize_frame(tmp1 /* offset */, tmp2 /* tmp */);
}

void MacroAssembler::push_frame(Register bytes, Register tmp) {
#ifdef ASSERT
  assert(bytes != R0, "r0 not allowed here");
  andi_(R0, bytes, frame::alignment_in_bytes-1);
  asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
#endif
  neg(tmp, bytes);
  stdux(R1_SP, R1_SP, tmp);
}

// Push a frame of size `bytes'.
void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
  long offset = align_addr(bytes, frame::alignment_in_bytes);
  if (is_simm(-offset, 16)) {
    stdu(R1_SP, -offset, R1_SP);
  } else {
    load_const_optimized(tmp, -offset);
    stdux(R1_SP, R1_SP, tmp);
  }
}

// Push a frame of size `bytes' plus abi_reg_args on top.
void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size, tmp);
}

// Set up a new C frame with a spill area for non-volatile GPRs and
// additional space for local variables.
void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
                                                      Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
}

// Pop current C frame.
void MacroAssembler::pop_frame() {
  ld(R1_SP, _abi(callers_sp), R1_SP);
}
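
// Illustrative view of resize_frame: the stdu/stdux keeps the ABI back link
// intact by storing the caller's SP and updating R1_SP in one instruction,
// conceptually
//   tmp = *SP;            // old back link
//   SP  = SP + offset;    // stdux: update SP and ...
//   *SP = tmp;            // ... store the back link at the new top of stack
// so an asynchronous stack walker never sees a frame without a back link.
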
#if defined(ABI_ELFv2)
address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the times.
  if (R12 != r_function_entry) {
    mr(R12, r_function_entry);
  }
  mtctr(R12);
  // Do a call or a branch.
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C
// calling conventions. Updates and returns _last_calls_return_pc.
address MacroAssembler::call_c(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/true);
}

// For tail calls: only branch, don't link, so callee returns to caller of this function.
address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/false);
}

address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
  load_const(R12, function_entry, R0);
  return branch_to(R12, /*and_link=*/true);
}

#else
// Generic version of a call to C function via a function descriptor
// with variable support for C calling conventions (TOC, ENV, etc.).
// Updates and returns _last_calls_return_pc.
address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
                                  bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
  // we emit standard ptrgl glue code here
  assert((function_descriptor != R0), "function_descriptor cannot be R0");

  // retrieve necessary entries from the function descriptor
  ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
  mtctr(R0);

  if (load_toc_of_callee) {
    ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
  }
  if (load_env_of_callee) {
    ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
  } else if (load_toc_of_callee) {
    li(R11, 0);
  }

  // do a call or a branch
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C calling
// conventions.
// We don't use the TOC in generated code, so there is no need to save
// and restore its value.
address MacroAssembler::call_c(Register fd) {
  return branch_to(fd, /*and_link=*/true,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c_and_return_to_caller(Register fd) {
  return branch_to(fd, /*and_link=*/false,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
  if (rt != relocInfo::none) {
    // this call needs to be relocatable
    if (!ReoptimizeCallSequences
        || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
        || fd == NULL   // support code-size estimation
        || !fd->is_friend_function()
        || fd->entry() == NULL) {
      // It's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);

      bool has_env = (fd != NULL && fd->env() != NULL);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/has_env);
    } else {
      // It's a friend function. Load the entry point and don't care about
      // toc and env.
      // Use an optimizable call instruction, but ensure the same code-size
      // as in the case of a non-friend function.
      nop();
      nop();
      nop();
      bl64_patchable(fd->entry(), rt);
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  } else {
    // This call does not need to be relocatable, do more aggressive
    // optimizations.
    if (!ReoptimizeCallSequences
        || !fd->is_friend_function()) {
      // It's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/true);
    } else {
      // It's a friend function, load the entry point and don't care about
      // toc and env.
      address dest = fd->entry();
      if (is_within_range_of_b(dest, pc())) {
        bl(dest);
      } else {
        bl64_patchable(dest, rt);
      }
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  }
}

// Call a C function. All constants needed reside in TOC.
//
// Read the address to call from the TOC.
// Read env from TOC, if fd specifies an env.
// Read new TOC from TOC.
address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
                                         relocInfo::relocType rt, Register toc) {
  if (!ReoptimizeCallSequences
      || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
      || !fd->is_friend_function()) {
    // It's not a friend function as defined by class FunctionDescriptor,
    // so do a full call-c here.
    assert(fd->entry() != NULL, "function must be linked");

    AddressLiteral fd_entry(fd->entry());
    bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
    mtctr(R11);
    if (fd->env() == NULL) {
      li(R11, 0);
      nop();
    } else {
      AddressLiteral fd_env(fd->env());
      success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
    }
    AddressLiteral fd_toc(fd->toc());
    // Set R2_TOC (load from toc)
    success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
    bctrl();
    _last_calls_return_pc = pc();
    if (!success) { return NULL; }
  } else {
    // It's a friend function, load the entry point and don't care about
    // toc and env. Use an optimizable call instruction, but ensure the
    // same code-size as in the case of a non-friend function.
    nop();
    bl64_patchable(fd->entry(), rt);
    _last_calls_return_pc = pc();
  }
  return _last_calls_return_pc;
}
#endif // ABI_ELFv2
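
// Illustrative sketch (a hypothetical mirror, not a type used by this file)
// of the ELFv1 function descriptor that branch_to above reads via
// FunctionDescriptor::entry/toc/env_offset():
//   struct Elf1FunctionDescriptorSketch { // three 8-byte slots
//     address entry; // code address, moved to CTR
//     address toc;   // callee's TOC, loaded into R2_TOC
//     address env;   // environment pointer, loaded into R11
//   };
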
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register last_java_sp,
                                  address  entry_point,
                                  bool     check_exceptions) {
  BLOCK_COMMENT("call_VM {");
  // Determine last_java_sp register.
  if (!last_java_sp->is_valid()) {
    last_java_sp = R1_SP;
  }
  set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);

  // ARG1 must hold thread address.
  mr(R3_ARG1, R16_thread);
#if defined(ABI_ELFv2)
  address return_pc = call_c(entry_point, relocInfo::none);
#else
  address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
#endif

  reset_last_Java_frame();

  // Check for pending exceptions.
  if (check_exceptions) {
    // We don't check for exceptions here.
    ShouldNotReachHere();
  }

  // Get oop result if there is one and reset the value in the thread.
  if (oop_result->is_valid()) {
    get_vm_result(oop_result);
  }

  _last_calls_return_pc = return_pc;
  BLOCK_COMMENT("} call_VM");
}

void MacroAssembler::call_VM_leaf_base(address entry_point) {
  BLOCK_COMMENT("call_VM_leaf {");
#if defined(ABI_ELFv2)
  call_c(entry_point, relocInfo::none);
#else
  call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
#endif
  BLOCK_COMMENT("} call_VM_leaf");
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
  call_VM_base(oop_result, noreg, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread.
  mr_if_needed(R4_ARG2, arg_1);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread.
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread.
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  mr_if_needed(R6_ARG4, arg_3);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM_leaf(address entry_point) {
  call_VM_leaf_base(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
  mr_if_needed(R3_ARG1, arg_1);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_3);
  call_VM_leaf(entry_point);
}

// Check whether instruction is a read access to the polling page
// which was emitted by load_from_polling_page(..).
bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
                                               address* polling_address_ptr) {
  if (!is_ld(instruction))
    return false; // It's not a ld. Fail.

  int rt = inv_rt_field(instruction);
  int ra = inv_ra_field(instruction);
  int ds = inv_ds_field(instruction);
  if (!(ds == 0 && ra != 0 && rt == 0)) {
    return false; // It's not a ld(r0, X, ra). Fail.
  }

  if (!ucontext) {
    // Set polling address.
    if (polling_address_ptr != NULL) {
      *polling_address_ptr = NULL;
    }
    return true; // No ucontext given. Can't check value of ra. Assume true.
  }

#ifdef LINUX
  // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
  ucontext_t* uc = (ucontext_t*) ucontext;
  // Set polling address.
  address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
  if (polling_address_ptr != NULL) {
    *polling_address_ptr = addr;
  }
  return os::is_poll_address(addr);
#else
  // Not on Linux, ucontext must be NULL.
  ShouldNotReachHere();
  return false;
#endif
}
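
// Example (illustrative): the safepoint poll recognized above is a load of
// the form
//   ld R0, 0(Rpoll)   // rt == 0, ds == 0, ra != 0; Rpoll holds the page address
// When a safepoint is pending, the polling page is protected and the load
// faults; the signal handler then uses polling_address_ptr to verify that
// the faulting access really targeted the polling page.
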
bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;

  if (is_stwx(instruction) || is_stwux(instruction)) {
    int ra = inv_ra_field(instruction);
    int rb = inv_rb_field(instruction);

    // look up content of ra and rb in ucontext
    address ra_val = (address)uc->uc_mcontext.regs->gpr[ra];
    long    rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    return os::is_memory_serialize_page(thread, ra_val + rb_val);
  } else if (is_stw(instruction) || is_stwu(instruction)) {
    int ra = inv_ra_field(instruction);
    int d1 = inv_d1_field(instruction);

    // look up content of ra in ucontext
    address ra_val = (address)uc->uc_mcontext.regs->gpr[ra];
    return os::is_memory_serialize_page(thread, ra_val + d1);
  } else {
    return false;
  }
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return false;
#endif
}

void MacroAssembler::bang_stack_with_offset(int offset) {
  // When increasing the stack, the old stack pointer will be written
  // to the new top of stack according to the PPC64 ABI.
  // Therefore, stack banging is not necessary when increasing
  // the stack by <= os::vm_page_size() bytes.
  // When increasing the stack by a larger amount, this method is
  // called repeatedly to bang the intermediate pages.

  // Stack grows down, caller passes positive offset.
  assert(offset > 0, "must bang with positive offset");

  long stdoffset = -offset;

  if (is_simm(stdoffset, 16)) {
    // Signed 16 bit offset, a simple std is ok.
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, (int)(signed short)stdoffset, R1_SP);
    } else {
      std(R0, (int)(signed short)stdoffset, R1_SP);
    }
  } else if (is_simm(stdoffset, 31)) {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);

    Register tmp = R11;
    addis(tmp, R1_SP, hi);
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, lo, tmp);
    } else {
      std(R0, lo, tmp);
    }
  } else {
    ShouldNotReachHere();
  }
}

// If instruction is a stack bang of the form
//    std    R0,    x(Ry)        (see bang_stack_with_offset())
//    stdu   R1_SP, x(R1_SP)     (see push_frame(), resize_frame())
// or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
// return the banged address. Otherwise, return 0.
address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;
  int rs = inv_rs_field(instruction);
  int ra = inv_ra_field(instruction);
  if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
      || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
      || (is_stdu(instruction) && rs == 1)) {
    int ds = inv_ds_field(instruction);
    // return banged address
    return ds + (address)uc->uc_mcontext.regs->gpr[ra];
  } else if (is_stdux(instruction) && rs == 1) {
    int rb = inv_rb_field(instruction);
    address sp = (address)uc->uc_mcontext.regs->gpr[1];
    long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
                                  : sp + rb_val; // banged address
  }
  return NULL; // not a stack bang
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return NULL;
#endif
}

void MacroAssembler::reserved_stack_check(Register return_pc) {
  // Test if reserved zone needs to be enabled.
  Label no_reserved_zone_enabling;

  ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
  cmpld(CCR0, R1_SP, R0);
  blt_predict_taken(CCR0, no_reserved_zone_enabling);

  // Enable reserved zone again, throw stack overflow exception.
  push_frame_reg_args(0, R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
  pop_frame();
  mtlr(return_pc);
  load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
  mtctr(R0);
  bctr();

  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
                                bool cmpxchgx_hint) {
  Label retry;
  bind(retry);
  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  stdcx_(exchange_value, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
}

void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
                                Register tmp, bool cmpxchgx_hint) {
  Label retry;
  bind(retry);
  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  add(tmp, dest_current_value, inc_value);
  stdcx_(tmp, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
}
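
// Illustrative semantics of the ldarx/stdcx_ loops above (getandaddd shown):
//   do {
//     dest_current_value = *addr_base;            // ldarx: load and reserve
//     tmp = dest_current_value + inc_value;
//   } while (!store_conditional(addr_base, tmp)); // stdcx_: fails and clears
//                                                 // CCR0.eq if the reservation
//                                                 // was lost in the meantime
// Any intervening store to the reservation granule makes stdcx_ fail, which
// is what the bne back to `retry' tests.
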
// Word/sub-word atomic helper functions

// Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
// Only signed types are supported with size < 4.
// Atomic add always kills tmp1.
void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
                                                   Register addr_base, Register tmp1, Register tmp2, Register tmp3,
                                                   bool cmpxchgx_hint, bool is_add, int size) {
  // Sub-word instructions are available since Power 8.
  // For older processors, instruction_type != size holds, and we
  // emulate the sub-word instructions by constructing a 4-byte value
  // that leaves the other bytes unchanged.
  const int instruction_type = VM_Version::has_lqarx() ? size : 4;

  Label retry;
  Register shift_amount = noreg,
           val32        = dest_current_value,
           modval       = is_add ? tmp1 : exchange_value;

  if (instruction_type != size) {
    assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
    modval = tmp1;
    shift_amount = tmp2;
    val32 = tmp3;
    // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
#ifdef VM_LITTLE_ENDIAN
    rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
    clrrdi(addr_base, addr_base, 2);
#else
    xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
    clrrdi(addr_base, addr_base, 2);
    rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
#endif
  }

  // atomic emulation loop
  bind(retry);

  switch (instruction_type) {
    case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
    case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
    case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
    default: ShouldNotReachHere();
  }

  if (instruction_type != size) {
    srw(dest_current_value, val32, shift_amount);
  }

  if (is_add) { add(modval, dest_current_value, exchange_value); }

  if (instruction_type != size) {
    // Transform exchange value such that the replacement can be done by one xor instruction.
    xorr(modval, dest_current_value, is_add ? modval : exchange_value);
    clrldi(modval, modval, (size == 1) ? 56 : 48);
    slw(modval, modval, shift_amount);
    xorr(modval, val32, modval);
  }

  switch (instruction_type) {
    case 4: stwcx_(modval, addr_base); break;
    case 2: sthcx_(modval, addr_base); break;
    case 1: stbcx_(modval, addr_base); break;
    default: ShouldNotReachHere();
  }

  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }

  // l?arx zero-extends, but Java wants byte/short values sign-extended.
  if (size == 1) {
    extsb(dest_current_value, dest_current_value);
  } else if (size == 2) {
    extsh(dest_current_value, dest_current_value);
  }
}

// Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
// Only signed types are supported with size < 4.
void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
                                       Register compare_value, Register exchange_value,
                                       Register addr_base, Register tmp1, Register tmp2,
                                       Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
  // Sub-word instructions are available since Power 8.
  // For older processors, instruction_type != size holds, and we
  // emulate the sub-word instructions by constructing a 4-byte value
  // that leaves the other bytes unchanged.
  const int instruction_type = VM_Version::has_lqarx() ? size : 4;

  Register shift_amount = noreg,
           val32        = dest_current_value,
           modval       = exchange_value;

  if (instruction_type != size) {
    assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
    shift_amount = tmp1;
    val32 = tmp2;
    modval = tmp2;
    // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
#ifdef VM_LITTLE_ENDIAN
    rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
    clrrdi(addr_base, addr_base, 2);
#else
    xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
    clrrdi(addr_base, addr_base, 2);
    rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
#endif
    // Transform exchange value such that the replacement can be done by one xor instruction.
    xorr(exchange_value, compare_value, exchange_value);
    clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
    slw(exchange_value, exchange_value, shift_amount);
  }

  // atomic emulation loop
  bind(retry);

  switch (instruction_type) {
    case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
    case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
    case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
    default: ShouldNotReachHere();
  }

  if (instruction_type != size) {
    srw(dest_current_value, val32, shift_amount);
  }
  if (size == 1) {
    extsb(dest_current_value, dest_current_value);
  } else if (size == 2) {
    extsh(dest_current_value, dest_current_value);
  }

  cmpw(flag, dest_current_value, compare_value);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(flag, failed);
  } else {
    bne(                  flag, failed);
  }
  // branch to done  => (flag == ne), (dest_current_value != compare_value)
  // fall through    => (flag == eq), (dest_current_value == compare_value)

  if (instruction_type != size) {
    xorr(modval, val32, exchange_value);
  }

  switch (instruction_type) {
    case 4: stwcx_(modval, addr_base); break;
    case 2: sthcx_(modval, addr_base); break;
    case 1: stbcx_(modval, addr_base); break;
    default: ShouldNotReachHere();
  }
}

// CmpxchgX sets condition register to cmpX(current, compare).
void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
                                     Register compare_value, Register exchange_value,
                                     Register addr_base, Register tmp1, Register tmp2,
                                     int semantics, bool cmpxchgx_hint,
                                     Register int_flag_success, bool contention_hint, bool weak, int size) {
  Label retry;
  Label failed;
  Label done;

  // Save one branch if result is returned via register and
  // result register is different from the other ones.
  bool use_result_reg    = (int_flag_success != noreg);
  bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
                            int_flag_success != exchange_value && int_flag_success != addr_base &&
                            int_flag_success != tmp1 && int_flag_success != tmp2);
  assert(!weak || flag == CCR0, "weak only supported with CCR0");
  assert(size == 1 || size == 2 || size == 4, "unsupported");

  if (use_result_reg && preset_result_reg) {
    li(int_flag_success, 0); // preset (assume cas failed)
  }

  // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
  if (contention_hint) { // Don't try to reserve if cmp fails.
1635     switch (size) {
1636       case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
1637       case 2: lha(dest_current_value, 0, addr_base); break;
1638       case 4: lwz(dest_current_value, 0, addr_base); break;
1639       default: ShouldNotReachHere();
1640     }
1641     cmpw(flag, dest_current_value, compare_value);
1642     bne(flag, failed);
1643   }
1644
1645   // release/fence semantics
1646   if (semantics & MemBarRel) {
1647     release();
1648   }
1649
1650   cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
1651                     retry, failed, cmpxchgx_hint, size);
1652   if (!weak || use_result_reg) {
1653     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1654       bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1655     } else {
1656       bne( CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1657     }
1658   }
1659   // fall through => (flag == eq), (dest_current_value == compare_value), (swapped)
1660
1661   // Result in register (must do this at the end because int_flag_success can be the
1662   // same register as one above).
1663   if (use_result_reg) {
1664     li(int_flag_success, 1);
1665   }
1666
1667   if (semantics & MemBarFenceAfter) {
1668     fence();
1669   } else if (semantics & MemBarAcq) {
1670     isync();
1671   }
1672
1673   if (use_result_reg && !preset_result_reg) {
1674     b(done);
1675   }
1676
1677   bind(failed);
1678   if (use_result_reg && !preset_result_reg) {
1679     li(int_flag_success, 0);
1680   }
1681
1682   bind(done);
1683   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1684   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1685 }
1686
1687 // Performs an atomic compare exchange:
1688 //   if (compare_value == *addr_base)
1689 //     *addr_base = exchange_value
1690 //     int_flag_success = 1;
1691 //   else
1692 //     int_flag_success = 0;
1693 //
1694 // ConditionRegister flag      = cmp(compare_value, *addr_base)
1695 // Register dest_current_value = *addr_base
1696 // Register compare_value      Used to compare with value in memory
1697 // Register exchange_value     Written to memory if compare_value == *addr_base
1698 // Register addr_base          The memory location to compareXChange
1699 // Register int_flag_success   Set to 1 if exchange_value was written to *addr_base
1700 //
1701 // To avoid the costly compare-and-exchange, the value is tested beforehand.
1702 // Several special cases exist to avoid generating unnecessary code.
1703 //
1704 void MacroAssembler::cmpxchgd(ConditionRegister flag,
1705                               Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1706                               Register addr_base, int semantics, bool cmpxchgx_hint,
1707                               Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
1708   Label retry;
1709   Label failed_int;
1710   Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
1711   Label done;
1712
1713   // Save one branch if result is returned via register and result register is different from the other ones.
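  // (Assumed rationale, stated as a sketch: if int_flag_success aliases one of
  //  the input registers, presetting it to 0 up front would clobber that input,
  //  so in that case the result is materialized only at the end, at the cost of
  //  one extra branch.)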
1714 bool use_result_reg = (int_flag_success!=noreg); 1715 bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() && 1716 int_flag_success!=exchange_value && int_flag_success!=addr_base); 1717 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1718 assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both"); 1719 1720 if (use_result_reg && preset_result_reg) { 1721 li(int_flag_success, 0); // preset (assume cas failed) 1722 } 1723 1724 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1725 if (contention_hint) { // Don't try to reserve if cmp fails. 1726 ld(dest_current_value, 0, addr_base); 1727 cmpd(flag, compare_value, dest_current_value); 1728 bne(flag, failed); 1729 } 1730 1731 // release/fence semantics 1732 if (semantics & MemBarRel) { 1733 release(); 1734 } 1735 1736 // atomic emulation loop 1737 bind(retry); 1738 1739 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1740 cmpd(flag, compare_value, dest_current_value); 1741 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1742 bne_predict_not_taken(flag, failed); 1743 } else { 1744 bne( flag, failed); 1745 } 1746 1747 stdcx_(exchange_value, addr_base); 1748 if (!weak || use_result_reg || failed_ext) { 1749 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1750 bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1751 } else { 1752 bne( CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1753 } 1754 } 1755 1756 // result in register (must do this at the end because int_flag_success can be the same register as one above) 1757 if (use_result_reg) { 1758 li(int_flag_success, 1); 1759 } 1760 1761 if (semantics & MemBarFenceAfter) { 1762 fence(); 1763 } else if (semantics & MemBarAcq) { 1764 isync(); 1765 } 1766 1767 if (use_result_reg && !preset_result_reg) { 1768 b(done); 1769 } 1770 1771 bind(failed_int); 1772 if (use_result_reg && !preset_result_reg) { 1773 li(int_flag_success, 0); 1774 } 1775 1776 bind(done); 1777 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1778 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1779 } 1780 1781 // Look up the method for a megamorphic invokeinterface call. 1782 // The target method is determined by <intf_klass, itable_index>. 1783 // The receiver klass is in recv_klass. 1784 // On success, the result will be in method_result, and execution falls through. 1785 // On failure, execution transfers to the given label. 1786 void MacroAssembler::lookup_interface_method(Register recv_klass, 1787 Register intf_klass, 1788 RegisterOrConstant itable_index, 1789 Register method_result, 1790 Register scan_temp, 1791 Register sethi_temp, 1792 Label& L_no_such_interface) { 1793 assert_different_registers(recv_klass, intf_klass, method_result, scan_temp); 1794 assert(itable_index.is_constant() || itable_index.as_register() == method_result, 1795 "caller must use same register for non-constant itable index as for method"); 1796 1797 // Compute start of first itableOffsetEntry (which is at the end of the vtable). 
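  // In pseudocode, the next few instructions compute (sketch, hypothetical names):
  //   scan_temp = recv_klass + vtable_base + (vtable_length << log_vte_size);
  // i.e. a pointer just past the vtable, where the itableOffsetEntry array begins.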
1798 int vtable_base = in_bytes(Klass::vtable_start_offset()); 1799 int itentry_off = itableMethodEntry::method_offset_in_bytes(); 1800 int logMEsize = exact_log2(itableMethodEntry::size() * wordSize); 1801 int scan_step = itableOffsetEntry::size() * wordSize; 1802 int log_vte_size= exact_log2(vtableEntry::size_in_bytes()); 1803 1804 lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass); 1805 // %%% We should store the aligned, prescaled offset in the klassoop. 1806 // Then the next several instructions would fold away. 1807 1808 sldi(scan_temp, scan_temp, log_vte_size); 1809 addi(scan_temp, scan_temp, vtable_base); 1810 add(scan_temp, recv_klass, scan_temp); 1811 1812 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 1813 if (itable_index.is_register()) { 1814 Register itable_offset = itable_index.as_register(); 1815 sldi(itable_offset, itable_offset, logMEsize); 1816 if (itentry_off) addi(itable_offset, itable_offset, itentry_off); 1817 add(recv_klass, itable_offset, recv_klass); 1818 } else { 1819 long itable_offset = (long)itable_index.as_constant(); 1820 load_const_optimized(sethi_temp, (itable_offset<<logMEsize)+itentry_off); // static address, no relocation 1821 add(recv_klass, sethi_temp, recv_klass); 1822 } 1823 1824 // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) { 1825 // if (scan->interface() == intf) { 1826 // result = (klass + scan->offset() + itable_index); 1827 // } 1828 // } 1829 Label search, found_method; 1830 1831 for (int peel = 1; peel >= 0; peel--) { 1832 // %%%% Could load both offset and interface in one ldx, if they were 1833 // in the opposite order. This would save a load. 1834 ld(method_result, itableOffsetEntry::interface_offset_in_bytes(), scan_temp); 1835 1836 // Check that this entry is non-null. A null entry means that 1837 // the receiver class doesn't implement the interface, and wasn't the 1838 // same as when the caller was compiled. 1839 cmpd(CCR0, method_result, intf_klass); 1840 1841 if (peel) { 1842 beq(CCR0, found_method); 1843 } else { 1844 bne(CCR0, search); 1845 // (invert the test to fall through to found_method...) 1846 } 1847 1848 if (!peel) break; 1849 1850 bind(search); 1851 1852 cmpdi(CCR0, method_result, 0); 1853 beq(CCR0, L_no_such_interface); 1854 addi(scan_temp, scan_temp, scan_step); 1855 } 1856 1857 bind(found_method); 1858 1859 // Got a hit. 
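  // (Sketch of the epilogue that follows:
  //    method_result = *(recv_klass + *(scan_temp + offset_offset));
  //  recv_klass was pre-biased by the scaled itable index above, so one lwz
  //  plus one ldx suffice to fetch the Method*.)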
1860 int ito_offset = itableOffsetEntry::offset_offset_in_bytes(); 1861 lwz(scan_temp, ito_offset, scan_temp); 1862 ldx(method_result, scan_temp, recv_klass); 1863 } 1864 1865 // virtual method calling 1866 void MacroAssembler::lookup_virtual_method(Register recv_klass, 1867 RegisterOrConstant vtable_index, 1868 Register method_result) { 1869 1870 assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg()); 1871 1872 const int base = in_bytes(Klass::vtable_start_offset()); 1873 assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 1874 1875 if (vtable_index.is_register()) { 1876 sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord); 1877 add(recv_klass, vtable_index.as_register(), recv_klass); 1878 } else { 1879 addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord); 1880 } 1881 ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass); 1882 } 1883 1884 /////////////////////////////////////////// subtype checking //////////////////////////////////////////// 1885 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 1886 Register super_klass, 1887 Register temp1_reg, 1888 Register temp2_reg, 1889 Label* L_success, 1890 Label* L_failure, 1891 Label* L_slow_path, 1892 RegisterOrConstant super_check_offset) { 1893 1894 const Register check_cache_offset = temp1_reg; 1895 const Register cached_super = temp2_reg; 1896 1897 assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super); 1898 1899 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1900 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1901 1902 bool must_load_sco = (super_check_offset.constant_or_zero() == -1); 1903 bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset); 1904 1905 Label L_fallthrough; 1906 int label_nulls = 0; 1907 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 1908 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 1909 if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; } 1910 assert(label_nulls <= 1 || 1911 (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path), 1912 "at most one NULL in the batch, usually"); 1913 1914 // If the pointers are equal, we are done (e.g., String[] elements). 1915 // This self-check enables sharing of secondary supertype arrays among 1916 // non-primary types such as array-of-interface. Otherwise, each such 1917 // type would need its own customized SSA. 1918 // We move this check to the front of the fast path because many 1919 // type checks are in fact trivially successful in this manner, 1920 // so we get a nicely predicted branch right at the start of the check. 1921 cmpd(CCR0, sub_klass, super_klass); 1922 beq(CCR0, *L_success); 1923 1924 // Check the supertype display: 1925 if (must_load_sco) { 1926 // The super check offset is always positive... 1927 lwz(check_cache_offset, sco_offset, super_klass); 1928 super_check_offset = RegisterOrConstant(check_cache_offset); 1929 // super_check_offset is register. 1930 assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register()); 1931 } 1932 // The loaded value is the offset from KlassOopDesc. 1933 1934 ld(cached_super, super_check_offset, sub_klass); 1935 cmpd(CCR0, cached_super, super_klass); 1936 1937 // This check has worked decisively for primary supers. 
1938 // Secondary supers are sought in the super_cache ('super_cache_addr'). 1939 // (Secondary supers are interfaces and very deeply nested subtypes.) 1940 // This works in the same check above because of a tricky aliasing 1941 // between the super_cache and the primary super display elements. 1942 // (The 'super_check_addr' can address either, as the case requires.) 1943 // Note that the cache is updated below if it does not help us find 1944 // what we need immediately. 1945 // So if it was a primary super, we can just fail immediately. 1946 // Otherwise, it's the slow path for us (no success at this point). 1947 1948 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); } 1949 1950 if (super_check_offset.is_register()) { 1951 beq(CCR0, *L_success); 1952 cmpwi(CCR0, super_check_offset.as_register(), sc_offset); 1953 if (L_failure == &L_fallthrough) { 1954 beq(CCR0, *L_slow_path); 1955 } else { 1956 bne(CCR0, *L_failure); 1957 FINAL_JUMP(*L_slow_path); 1958 } 1959 } else { 1960 if (super_check_offset.as_constant() == sc_offset) { 1961 // Need a slow path; fast failure is impossible. 1962 if (L_slow_path == &L_fallthrough) { 1963 beq(CCR0, *L_success); 1964 } else { 1965 bne(CCR0, *L_slow_path); 1966 FINAL_JUMP(*L_success); 1967 } 1968 } else { 1969 // No slow path; it's a fast decision. 1970 if (L_failure == &L_fallthrough) { 1971 beq(CCR0, *L_success); 1972 } else { 1973 bne(CCR0, *L_failure); 1974 FINAL_JUMP(*L_success); 1975 } 1976 } 1977 } 1978 1979 bind(L_fallthrough); 1980 #undef FINAL_JUMP 1981 } 1982 1983 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 1984 Register super_klass, 1985 Register temp1_reg, 1986 Register temp2_reg, 1987 Label* L_success, 1988 Register result_reg) { 1989 const Register array_ptr = temp1_reg; // current value from cache array 1990 const Register temp = temp2_reg; 1991 1992 assert_different_registers(sub_klass, super_klass, array_ptr, temp); 1993 1994 int source_offset = in_bytes(Klass::secondary_supers_offset()); 1995 int target_offset = in_bytes(Klass::secondary_super_cache_offset()); 1996 1997 int length_offset = Array<Klass*>::length_offset_in_bytes(); 1998 int base_offset = Array<Klass*>::base_offset_in_bytes(); 1999 2000 Label hit, loop, failure, fallthru; 2001 2002 ld(array_ptr, source_offset, sub_klass); 2003 2004 // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated."); 2005 lwz(temp, length_offset, array_ptr); 2006 cmpwi(CCR0, temp, 0); 2007 beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0 2008 2009 mtctr(temp); // load ctr 2010 2011 bind(loop); 2012 // Oops in table are NO MORE compressed. 
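  // (Loop sketch, illustration only:
  //    do { if (*array_ptr == super_klass) goto hit;
  //         array_ptr += BytesPerWord; } while (--CTR != 0);
  //  When the counter runs out we fall through to 'failure'.)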
2013 ld(temp, base_offset, array_ptr); 2014 cmpd(CCR0, temp, super_klass); 2015 beq(CCR0, hit); 2016 addi(array_ptr, array_ptr, BytesPerWord); 2017 bdnz(loop); 2018 2019 bind(failure); 2020 if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss) 2021 b(fallthru); 2022 2023 bind(hit); 2024 std(super_klass, target_offset, sub_klass); // save result to cache 2025 if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit) 2026 if (L_success != NULL) { b(*L_success); } 2027 else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided 2028 2029 bind(fallthru); 2030 } 2031 2032 // Try fast path, then go to slow one if not successful 2033 void MacroAssembler::check_klass_subtype(Register sub_klass, 2034 Register super_klass, 2035 Register temp1_reg, 2036 Register temp2_reg, 2037 Label& L_success) { 2038 Label L_failure; 2039 check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure); 2040 check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success); 2041 bind(L_failure); // Fallthru if not successful. 2042 } 2043 2044 void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg, 2045 Register temp_reg, 2046 Label& wrong_method_type) { 2047 assert_different_registers(mtype_reg, mh_reg, temp_reg); 2048 // Compare method type against that of the receiver. 2049 load_heap_oop_not_null(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg); 2050 cmpd(CCR0, temp_reg, mtype_reg); 2051 bne(CCR0, wrong_method_type); 2052 } 2053 2054 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot, 2055 Register temp_reg, 2056 int extra_slot_offset) { 2057 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 2058 int stackElementSize = Interpreter::stackElementSize; 2059 int offset = extra_slot_offset * stackElementSize; 2060 if (arg_slot.is_constant()) { 2061 offset += arg_slot.as_constant() * stackElementSize; 2062 return offset; 2063 } else { 2064 assert(temp_reg != noreg, "must specify"); 2065 sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize)); 2066 if (offset != 0) 2067 addi(temp_reg, temp_reg, offset); 2068 return temp_reg; 2069 } 2070 } 2071 2072 // Supports temp2_reg = R0. 2073 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg, 2074 Register mark_reg, Register temp_reg, 2075 Register temp2_reg, Label& done, Label* slow_case) { 2076 assert(UseBiasedLocking, "why call this otherwise?"); 2077 2078 #ifdef ASSERT 2079 assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg); 2080 #endif 2081 2082 Label cas_label; 2083 2084 // Branch to done if fast path fails and no slow_case provided. 2085 Label *slow_case_int = (slow_case != NULL) ? 
slow_case : &done; 2086 2087 // Biased locking 2088 // See whether the lock is currently biased toward our thread and 2089 // whether the epoch is still valid 2090 // Note that the runtime guarantees sufficient alignment of JavaThread 2091 // pointers to allow age to be placed into low bits 2092 assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, 2093 "biased locking makes assumptions about bit layout"); 2094 2095 if (PrintBiasedLockingStatistics) { 2096 load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg); 2097 lwzx(temp_reg, temp2_reg); 2098 addi(temp_reg, temp_reg, 1); 2099 stwx(temp_reg, temp2_reg); 2100 } 2101 2102 andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place); 2103 cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern); 2104 bne(cr_reg, cas_label); 2105 2106 load_klass(temp_reg, obj_reg); 2107 2108 load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place)); 2109 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 2110 orr(temp_reg, R16_thread, temp_reg); 2111 xorr(temp_reg, mark_reg, temp_reg); 2112 andr(temp_reg, temp_reg, temp2_reg); 2113 cmpdi(cr_reg, temp_reg, 0); 2114 if (PrintBiasedLockingStatistics) { 2115 Label l; 2116 bne(cr_reg, l); 2117 load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr()); 2118 lwzx(mark_reg, temp2_reg); 2119 addi(mark_reg, mark_reg, 1); 2120 stwx(mark_reg, temp2_reg); 2121 // restore mark_reg 2122 ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg); 2123 bind(l); 2124 } 2125 beq(cr_reg, done); 2126 2127 Label try_revoke_bias; 2128 Label try_rebias; 2129 2130 // At this point we know that the header has the bias pattern and 2131 // that we are not the bias owner in the current epoch. We need to 2132 // figure out more details about the state of the header in order to 2133 // know what operations can be legally performed on the object's 2134 // header. 2135 2136 // If the low three bits in the xor result aren't clear, that means 2137 // the prototype header is no longer biased and we have to revoke 2138 // the bias on this object. 2139 andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 2140 cmpwi(cr_reg, temp2_reg, 0); 2141 bne(cr_reg, try_revoke_bias); 2142 2143 // Biasing is still enabled for this data type. See whether the 2144 // epoch of the current bias is still valid, meaning that the epoch 2145 // bits of the mark word are equal to the epoch bits of the 2146 // prototype header. (Note that the prototype header's epoch bits 2147 // only change at a safepoint.) If not, attempt to rebias the object 2148 // toward the current thread. Note that we must be absolutely sure 2149 // that the current epoch is invalid in order to do this because 2150 // otherwise the manipulations it performs on the mark word are 2151 // illegal. 2152 2153 int shift_amount = 64 - markOopDesc::epoch_shift; 2154 // rotate epoch bits to right (little) end and set other bits to 0 2155 // [ big part | epoch | little part ] -> [ 0..0 | epoch ] 2156 rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits); 2157 // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented 2158 bne(CCR0, try_rebias); 2159 2160 // The epoch of the current bias is still valid but we know nothing 2161 // about the owner; it might be set or it might be clear. Try to 2162 // acquire the bias of the object using an atomic operation. 
If this
2163   // fails, we will go into the runtime to revoke the object's bias.
2164   // Note that we first construct the presumed unbiased header so we
2165   // don't accidentally blow away another thread's valid bias.
2166   andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place |
2167                             markOopDesc::age_mask_in_place |
2168                             markOopDesc::epoch_mask_in_place));
2169   orr(temp_reg, R16_thread, mark_reg);
2170
2171   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2172
2173   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2174   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2175            /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2176            /*where=*/obj_reg,
2177            MacroAssembler::MemBarAcq,
2178            MacroAssembler::cmpxchgx_hint_acquire_lock(),
2179            noreg, slow_case_int); // bail out if failed
2180
2181   // If the biasing toward our thread failed, this means that
2182   // another thread succeeded in biasing it toward itself and we
2183   // need to revoke that bias. The revocation will occur in the
2184   // interpreter runtime in the slow case.
2185   if (PrintBiasedLockingStatistics) {
2186     load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg);
2187     lwzx(temp_reg, temp2_reg);
2188     addi(temp_reg, temp_reg, 1);
2189     stwx(temp_reg, temp2_reg);
2190   }
2191   b(done);
2192
2193   bind(try_rebias);
2194   // At this point we know the epoch has expired, meaning that the
2195   // current "bias owner", if any, is actually invalid. Under these
2196   // circumstances _only_, we are allowed to use the current header's
2197   // value as the comparison value when doing the cas to acquire the
2198   // bias in the current epoch. In other words, we allow transfer of
2199   // the bias from one thread to another directly in this situation.
2200   load_klass(temp_reg, obj_reg);
2201   andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
2202   orr(temp2_reg, R16_thread, temp2_reg);
2203   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2204   orr(temp_reg, temp2_reg, temp_reg);
2205
2206   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2207
2208   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2209            /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2210            /*where=*/obj_reg,
2211            MacroAssembler::MemBarAcq,
2212            MacroAssembler::cmpxchgx_hint_acquire_lock(),
2213            noreg, slow_case_int); // bail out if failed
2214
2215   // If the biasing toward our thread failed, this means that
2216   // another thread succeeded in biasing it toward itself and we
2217   // need to revoke that bias. The revocation will occur in the
2218   // interpreter runtime in the slow case.
2219   if (PrintBiasedLockingStatistics) {
2220     load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg);
2221     lwzx(temp_reg, temp2_reg);
2222     addi(temp_reg, temp_reg, 1);
2223     stwx(temp_reg, temp2_reg);
2224   }
2225   b(done);
2226
2227   bind(try_revoke_bias);
2228   // The prototype mark in the klass doesn't have the bias bit set any
2229   // more, indicating that objects of this data type are not supposed
2230   // to be biased any more. We are going to try to reset the mark of
2231   // this object to the prototype value and fall through to the
2232   // CAS-based locking scheme. Note that if our CAS fails, it means
2233   // that another thread raced us for the privilege of revoking the
2234   // bias of this particular object, so it's okay to continue in the
2235   // normal locking code.
2236 load_klass(temp_reg, obj_reg); 2237 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 2238 andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place); 2239 orr(temp_reg, temp_reg, temp2_reg); 2240 2241 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2242 2243 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg). 2244 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 2245 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 2246 /*where=*/obj_reg, 2247 MacroAssembler::MemBarAcq, 2248 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2249 2250 // reload markOop in mark_reg before continuing with lightweight locking 2251 ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg); 2252 2253 // Fall through to the normal CAS-based lock, because no matter what 2254 // the result of the above CAS, some thread must have succeeded in 2255 // removing the bias bit from the object's header. 2256 if (PrintBiasedLockingStatistics) { 2257 Label l; 2258 bne(cr_reg, l); 2259 load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg); 2260 lwzx(temp_reg, temp2_reg); 2261 addi(temp_reg, temp_reg, 1); 2262 stwx(temp_reg, temp2_reg); 2263 bind(l); 2264 } 2265 2266 bind(cas_label); 2267 } 2268 2269 void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) { 2270 // Check for biased locking unlock case, which is a no-op 2271 // Note: we do not have to check the thread ID for two reasons. 2272 // First, the interpreter checks for IllegalMonitorStateException at 2273 // a higher level. Second, if the bias was revoked while we held the 2274 // lock, the object could not be rebiased toward another thread, so 2275 // the bias bit would be clear. 
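  // (The test below reduces to the sketch:
  //    if ((obj->mark() & biased_lock_mask) == biased_lock_pattern) goto done;
  //  i.e. the lock bits still read "biased", so there is nothing to undo.)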
2276 2277 ld(temp_reg, 0, mark_addr); 2278 andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 2279 2280 cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern); 2281 beq(cr_reg, done); 2282 } 2283 2284 // allocation (for C1) 2285 void MacroAssembler::eden_allocate( 2286 Register obj, // result: pointer to object after successful allocation 2287 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2288 int con_size_in_bytes, // object size in bytes if known at compile time 2289 Register t1, // temp register 2290 Register t2, // temp register 2291 Label& slow_case // continuation point if fast allocation fails 2292 ) { 2293 b(slow_case); 2294 } 2295 2296 void MacroAssembler::tlab_allocate( 2297 Register obj, // result: pointer to object after successful allocation 2298 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2299 int con_size_in_bytes, // object size in bytes if known at compile time 2300 Register t1, // temp register 2301 Label& slow_case // continuation point if fast allocation fails 2302 ) { 2303 // make sure arguments make sense 2304 assert_different_registers(obj, var_size_in_bytes, t1); 2305 assert(0 <= con_size_in_bytes && is_simm13(con_size_in_bytes), "illegal object size"); 2306 assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment"); 2307 2308 const Register new_top = t1; 2309 //verify_tlab(); not implemented 2310 2311 ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2312 ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread); 2313 if (var_size_in_bytes == noreg) { 2314 addi(new_top, obj, con_size_in_bytes); 2315 } else { 2316 add(new_top, obj, var_size_in_bytes); 2317 } 2318 cmpld(CCR0, new_top, R0); 2319 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case); 2320 2321 #ifdef ASSERT 2322 // make sure new free pointer is properly aligned 2323 { 2324 Label L; 2325 andi_(R0, new_top, MinObjAlignmentInBytesMask); 2326 beq(CCR0, L); 2327 stop("updated TLAB free is not properly aligned", 0x934); 2328 bind(L); 2329 } 2330 #endif // ASSERT 2331 2332 // update the tlab top pointer 2333 std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2334 //verify_tlab(); not implemented 2335 } 2336 void MacroAssembler::tlab_refill(Label& retry_tlab, Label& try_eden, Label& slow_case) { 2337 unimplemented("tlab_refill"); 2338 } 2339 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) { 2340 unimplemented("incr_allocated_bytes"); 2341 } 2342 2343 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset, 2344 int insts_call_instruction_offset, Register Rtoc) { 2345 // Start the stub. 2346 address stub = start_a_stub(64); 2347 if (stub == NULL) { return NULL; } // CodeCache full: bail out 2348 2349 // Create a trampoline stub relocation which relates this trampoline stub 2350 // with the call instruction at insts_call_instruction_offset in the 2351 // instructions code-section. 2352 relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset)); 2353 const int stub_start_offset = offset(); 2354 2355 // For java_to_interp stubs we use R11_scratch1 as scratch register 2356 // and in call trampoline stubs we use R12_scratch2. This way we 2357 // can distinguish them (see is_NativeCallTrampolineStub_at()). 
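  // Schematically, the stub emitted below looks like this (exact encoding may
  // vary with the size of the TOC offset):
  //   addis/addi R12, R29_TOC, <hi/lo>            // only if Rtoc must be computed
  //   ld         R12, <destination_toc_offset>(Rtoc) // load call target
  //   mtctr      R12
  //   bctr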
Register reg_scratch = R12_scratch2;
2359
2360   // Now, create the trampoline stub's code:
2361   // - load the TOC
2362   // - load the call target from the constant pool
2363   // - call
2364   if (Rtoc == noreg) {
2365     calculate_address_from_global_toc(reg_scratch, method_toc());
2366     Rtoc = reg_scratch;
2367   }
2368
2369   ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
2370   mtctr(reg_scratch);
2371   bctr();
2372
2373   const address stub_start_addr = addr_at(stub_start_offset);
2374
2375   // Assert that the encoded destination_toc_offset can be identified and that it is correct.
2376   assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
2377          "encoded offset into the constant pool must match");
2378   // The stub must not grow beyond trampoline_stub_size.
2379   assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
2380   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
2381
2382   // End the stub.
2383   end_a_stub();
2384   return stub;
2385 }
2386
2387 // TM on PPC64.
2388 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
2389   Label retry;
2390   bind(retry);
2391   ldarx(result, addr, /*hint*/ false);
2392   addi(result, result, simm16);
2393   stdcx_(result, addr);
2394   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2395     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2396   } else {
2397     bne( CCR0, retry); // stXcx_ sets CCR0
2398   }
2399 }
2400
2401 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
2402   Label retry;
2403   bind(retry);
2404   lwarx(result, addr, /*hint*/ false);
2405   ori(result, result, uimm16);
2406   stwcx_(result, addr);
2407   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2408     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2409   } else {
2410     bne( CCR0, retry); // stXcx_ sets CCR0
2411   }
2412 }
2413
2414 #if INCLUDE_RTM_OPT
2415
2416 // Update rtm_counters based on abort status.
2417 // input: abort_status
2418 //        rtm_counters (RTMLockingCounters*)
2419 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
2420   // Mapping to keep PreciseRTMLockingStatistics similar to x86.
2421   // x86 ppc (! means inverted, ? means not the same)
2422   //  0   31 Set if abort caused by XABORT instruction.
2423   //  1 !  7 If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
2424   //  2   13 Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
2425   //  3   10 Set if an internal buffer overflowed.
2426   //  4 ?12 Set if a debug breakpoint was hit.
2427   //  5 ?32 Set if an abort occurred during execution of a nested transaction.
2428   const int tm_failure_bit[] = {Assembler::tm_tabort, // Note: Seems like signal handler sets this, too.
2429                                 Assembler::tm_failure_persistent, // inverted: transient
2430                                 Assembler::tm_trans_cf,
2431                                 Assembler::tm_footprint_of,
2432                                 Assembler::tm_non_trans_cf,
2433                                 Assembler::tm_suspended};
2434   const bool tm_failure_inv[] = {false, true, false, false, false, false};
2435   assert(sizeof(tm_failure_bit)/sizeof(int) == RTMLockingCounters::ABORT_STATUS_LIMIT, "adapt mapping!");
2436
2437   const Register addr_Reg = R0;
2438   // Keep track of the offset relative to where rtm_counters_Reg originally pointed.
2439 int counters_offs = RTMLockingCounters::abort_count_offset(); 2440 addi(addr_Reg, rtm_counters_Reg, counters_offs); 2441 const Register temp_Reg = rtm_counters_Reg; 2442 2443 //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically 2444 ldx(temp_Reg, addr_Reg); 2445 addi(temp_Reg, temp_Reg, 1); 2446 stdx(temp_Reg, addr_Reg); 2447 2448 if (PrintPreciseRTMLockingStatistics) { 2449 int counters_offs_delta = RTMLockingCounters::abortX_count_offset() - counters_offs; 2450 2451 //mftexasr(abort_status); done by caller 2452 for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) { 2453 counters_offs += counters_offs_delta; 2454 li(temp_Reg, counters_offs_delta); // can't use addi with R0 2455 add(addr_Reg, addr_Reg, temp_Reg); // point to next counter 2456 counters_offs_delta = sizeof(uintx); 2457 2458 Label check_abort; 2459 rldicr_(temp_Reg, abort_status, tm_failure_bit[i], 0); 2460 if (tm_failure_inv[i]) { 2461 bne(CCR0, check_abort); 2462 } else { 2463 beq(CCR0, check_abort); 2464 } 2465 //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically 2466 ldx(temp_Reg, addr_Reg); 2467 addi(temp_Reg, temp_Reg, 1); 2468 stdx(temp_Reg, addr_Reg); 2469 bind(check_abort); 2470 } 2471 } 2472 li(temp_Reg, -counters_offs); // can't use addi with R0 2473 add(rtm_counters_Reg, addr_Reg, temp_Reg); // restore 2474 } 2475 2476 // Branch if (random & (count-1) != 0), count is 2^n 2477 // tmp and CR0 are killed 2478 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) { 2479 mftb(tmp); 2480 andi_(tmp, tmp, count-1); 2481 bne(CCR0, brLabel); 2482 } 2483 2484 // Perform abort ratio calculation, set no_rtm bit if high ratio. 2485 // input: rtm_counters_Reg (RTMLockingCounters* address) - KILLED 2486 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg, 2487 RTMLockingCounters* rtm_counters, 2488 Metadata* method_data) { 2489 Label L_done, L_check_always_rtm1, L_check_always_rtm2; 2490 2491 if (RTMLockingCalculationDelay > 0) { 2492 // Delay calculation. 2493 ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr()); 2494 cmpdi(CCR0, rtm_counters_Reg, 0); 2495 beq(CCR0, L_done); 2496 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2497 } 2498 // Abort ratio calculation only if abort_count > RTMAbortThreshold. 2499 // Aborted transactions = abort_count * 100 2500 // All transactions = total_count * RTMTotalCountIncrRate 2501 // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio) 2502 ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg); 2503 if (is_simm(RTMAbortThreshold, 16)) { // cmpdi can handle 16bit immediate only. 2504 cmpdi(CCR0, R0, RTMAbortThreshold); 2505 blt(CCR0, L_check_always_rtm2); // reload of rtm_counters_Reg not necessary 2506 } else { 2507 load_const_optimized(rtm_counters_Reg, RTMAbortThreshold); 2508 cmpd(CCR0, R0, rtm_counters_Reg); 2509 blt(CCR0, L_check_always_rtm1); // reload of rtm_counters_Reg required 2510 } 2511 mulli(R0, R0, 100); 2512 2513 const Register tmpReg = rtm_counters_Reg; 2514 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2515 mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16 2516 mulli(tmpReg, tmpReg, RTMAbortRatio); // allowable range: int16 2517 cmpd(CCR0, R0, tmpReg); 2518 blt(CCR0, L_check_always_rtm1); // jump to reload 2519 if (method_data != NULL) { 2520 // Set rtm_state to "no rtm" in MDO. 
2521 // Not using a metadata relocation. Method and Class Loader are kept alive anyway. 2522 // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.) 2523 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2524 atomic_ori_int(R0, tmpReg, NoRTM); 2525 } 2526 b(L_done); 2527 2528 bind(L_check_always_rtm1); 2529 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2530 bind(L_check_always_rtm2); 2531 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2532 int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate; 2533 if (is_simm(thresholdValue, 16)) { // cmpdi can handle 16bit immediate only. 2534 cmpdi(CCR0, tmpReg, thresholdValue); 2535 } else { 2536 load_const_optimized(R0, thresholdValue); 2537 cmpd(CCR0, tmpReg, R0); 2538 } 2539 blt(CCR0, L_done); 2540 if (method_data != NULL) { 2541 // Set rtm_state to "always rtm" in MDO. 2542 // Not using a metadata relocation. See above. 2543 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2544 atomic_ori_int(R0, tmpReg, UseRTM); 2545 } 2546 bind(L_done); 2547 } 2548 2549 // Update counters and perform abort ratio calculation. 2550 // input: abort_status_Reg 2551 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg, 2552 RTMLockingCounters* rtm_counters, 2553 Metadata* method_data, 2554 bool profile_rtm) { 2555 2556 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2557 // Update rtm counters based on state at abort. 2558 // Reads abort_status_Reg, updates flags. 2559 assert_different_registers(abort_status_Reg, temp_Reg); 2560 load_const_optimized(temp_Reg, (address)rtm_counters, R0); 2561 rtm_counters_update(abort_status_Reg, temp_Reg); 2562 if (profile_rtm) { 2563 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2564 rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data); 2565 } 2566 } 2567 2568 // Retry on abort if abort's status indicates non-persistent failure. 2569 // inputs: retry_count_Reg 2570 // : abort_status_Reg 2571 // output: retry_count_Reg decremented by 1 2572 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, 2573 Label& retryLabel, Label* checkRetry) { 2574 Label doneRetry; 2575 rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0); 2576 bne(CCR0, doneRetry); 2577 if (checkRetry) { bind(*checkRetry); } 2578 addic_(retry_count_Reg, retry_count_Reg, -1); 2579 blt(CCR0, doneRetry); 2580 smt_yield(); // Can't use wait(). No permission (SIGILL). 2581 b(retryLabel); 2582 bind(doneRetry); 2583 } 2584 2585 // Spin and retry if lock is busy. 2586 // inputs: owner_addr_Reg (monitor address) 2587 // : retry_count_Reg 2588 // output: retry_count_Reg decremented by 1 2589 // CTR is killed 2590 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) { 2591 Label SpinLoop, doneRetry; 2592 addic_(retry_count_Reg, retry_count_Reg, -1); 2593 blt(CCR0, doneRetry); 2594 2595 if (RTMSpinLoopCount > 1) { 2596 li(R0, RTMSpinLoopCount); 2597 mtctr(R0); 2598 } 2599 2600 bind(SpinLoop); 2601 smt_yield(); // Can't use waitrsv(). No permission (SIGILL). 2602 2603 if (RTMSpinLoopCount > 1) { 2604 bdz(retryLabel); 2605 ld(R0, 0, owner_addr_Reg); 2606 cmpdi(CCR0, R0, 0); 2607 bne(CCR0, SpinLoop); 2608 } 2609 2610 b(retryLabel); 2611 2612 bind(doneRetry); 2613 } 2614 2615 // Use RTM for normal stack locks. 
2616 // Input: objReg (object to lock) 2617 void MacroAssembler::rtm_stack_locking(ConditionRegister flag, 2618 Register obj, Register mark_word, Register tmp, 2619 Register retry_on_abort_count_Reg, 2620 RTMLockingCounters* stack_rtm_counters, 2621 Metadata* method_data, bool profile_rtm, 2622 Label& DONE_LABEL, Label& IsInflated) { 2623 assert(UseRTMForStackLocks, "why call this otherwise?"); 2624 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); 2625 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2626 2627 if (RTMRetryCount > 0) { 2628 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 2629 bind(L_rtm_retry); 2630 } 2631 andi_(R0, mark_word, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased 2632 bne(CCR0, IsInflated); 2633 2634 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2635 Label L_noincrement; 2636 if (RTMTotalCountIncrRate > 1) { 2637 branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement); 2638 } 2639 assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM"); 2640 load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0); 2641 //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically 2642 ldx(mark_word, tmp); 2643 addi(mark_word, mark_word, 1); 2644 stdx(mark_word, tmp); 2645 bind(L_noincrement); 2646 } 2647 tbegin_(); 2648 beq(CCR0, L_on_abort); 2649 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // Reload in transaction, conflicts need to be tracked. 2650 andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits 2651 cmpwi(flag, R0, markOopDesc::unlocked_value); // bits = 001 unlocked 2652 beq(flag, DONE_LABEL); // all done if unlocked 2653 2654 if (UseRTMXendForLockBusy) { 2655 tend_(); 2656 b(L_decrement_retry); 2657 } else { 2658 tabort_(); 2659 } 2660 bind(L_on_abort); 2661 const Register abort_status_Reg = tmp; 2662 mftexasr(abort_status_Reg); 2663 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2664 rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm); 2665 } 2666 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload 2667 if (RTMRetryCount > 0) { 2668 // Retry on lock abort if abort status is not permanent. 2669 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry); 2670 } else { 2671 bind(L_decrement_retry); 2672 } 2673 } 2674 2675 // Use RTM for inflating locks 2676 // inputs: obj (object to lock) 2677 // mark_word (current header - KILLED) 2678 // boxReg (on-stack box address (displaced header location) - KILLED) 2679 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag, 2680 Register obj, Register mark_word, Register boxReg, 2681 Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg, 2682 RTMLockingCounters* rtm_counters, 2683 Metadata* method_data, bool profile_rtm, 2684 Label& DONE_LABEL) { 2685 assert(UseRTMLocking, "why call this otherwise?"); 2686 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2687 // Clean monitor_value bit to get valid pointer. 2688 int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value; 2689 2690 // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark(). 
2691 std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg); 2692 const Register tmpReg = boxReg; 2693 const Register owner_addr_Reg = mark_word; 2694 addi(owner_addr_Reg, mark_word, owner_offset); 2695 2696 if (RTMRetryCount > 0) { 2697 load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy. 2698 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort. 2699 bind(L_rtm_retry); 2700 } 2701 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2702 Label L_noincrement; 2703 if (RTMTotalCountIncrRate > 1) { 2704 branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement); 2705 } 2706 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2707 load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg); 2708 //atomic_inc_ptr(R0, tmpReg); We don't increment atomically 2709 ldx(tmpReg, R0); 2710 addi(tmpReg, tmpReg, 1); 2711 stdx(tmpReg, R0); 2712 bind(L_noincrement); 2713 } 2714 tbegin_(); 2715 beq(CCR0, L_on_abort); 2716 // We don't reload mark word. Will only be reset at safepoint. 2717 ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked. 2718 cmpdi(flag, R0, 0); 2719 beq(flag, DONE_LABEL); 2720 2721 if (UseRTMXendForLockBusy) { 2722 tend_(); 2723 b(L_decrement_retry); 2724 } else { 2725 tabort_(); 2726 } 2727 bind(L_on_abort); 2728 const Register abort_status_Reg = tmpReg; 2729 mftexasr(abort_status_Reg); 2730 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2731 rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm); 2732 // Restore owner_addr_Reg 2733 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); 2734 #ifdef ASSERT 2735 andi_(R0, mark_word, markOopDesc::monitor_value); 2736 asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint. 2737 #endif 2738 addi(owner_addr_Reg, mark_word, owner_offset); 2739 } 2740 if (RTMRetryCount > 0) { 2741 // Retry on lock abort if abort status is not permanent. 2742 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); 2743 } 2744 2745 // Appears unlocked - try to swing _owner from null to non-null. 2746 cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg, 2747 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2748 MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true); 2749 2750 if (RTMRetryCount > 0) { 2751 // success done else retry 2752 b(DONE_LABEL); 2753 bind(L_decrement_retry); 2754 // Spin and retry if lock is busy. 2755 rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry); 2756 } else { 2757 bind(L_decrement_retry); 2758 } 2759 } 2760 2761 #endif // INCLUDE_RTM_OPT 2762 2763 // "The box" is the space on the stack where we copy the object mark. 2764 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box, 2765 Register temp, Register displaced_header, Register current_header, 2766 bool try_bias, 2767 RTMLockingCounters* rtm_counters, 2768 RTMLockingCounters* stack_rtm_counters, 2769 Metadata* method_data, 2770 bool use_rtm, bool profile_rtm) { 2771 assert_different_registers(oop, box, temp, displaced_header, current_header); 2772 assert(flag != CCR0, "bad condition register"); 2773 Label cont; 2774 Label object_has_monitor; 2775 Label cas_failed; 2776 2777 // Load markOop from object into displaced_header. 
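  // (Overview of the fast path below, as a sketch:
  //    mark = obj->mark();
  //    if (mark & monitor_value) goto object_has_monitor;          // inflated
  //    box->displaced_header = mark | unlocked_value;
  //    if (CAS(&obj->mark, mark | unlocked_value, box)) goto cont; // stack-locked
  //    // CAS failed: if the mark points into our own stack it is a recursive
  //    // lock; store 0 into the box, otherwise the slow path takes over.)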
2778 ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop); 2779 2780 2781 // Always do locking in runtime. 2782 if (EmitSync & 0x01) { 2783 cmpdi(flag, oop, 0); // Oop can't be 0 here => always false. 2784 return; 2785 } 2786 2787 if (try_bias) { 2788 biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont); 2789 } 2790 2791 #if INCLUDE_RTM_OPT 2792 if (UseRTMForStackLocks && use_rtm) { 2793 rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header, 2794 stack_rtm_counters, method_data, profile_rtm, 2795 cont, object_has_monitor); 2796 } 2797 #endif // INCLUDE_RTM_OPT 2798 2799 // Handle existing monitor. 2800 if ((EmitSync & 0x02) == 0) { 2801 // The object has an existing monitor iff (mark & monitor_value) != 0. 2802 andi_(temp, displaced_header, markOopDesc::monitor_value); 2803 bne(CCR0, object_has_monitor); 2804 } 2805 2806 // Set displaced_header to be (markOop of object | UNLOCK_VALUE). 2807 ori(displaced_header, displaced_header, markOopDesc::unlocked_value); 2808 2809 // Load Compare Value application register. 2810 2811 // Initialize the box. (Must happen before we update the object mark!) 2812 std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2813 2814 // Must fence, otherwise, preceding store(s) may float below cmpxchg. 2815 // Compare object markOop with mark and if equal exchange scratch1 with object markOop. 2816 cmpxchgd(/*flag=*/flag, 2817 /*current_value=*/current_header, 2818 /*compare_value=*/displaced_header, 2819 /*exchange_value=*/box, 2820 /*where=*/oop, 2821 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2822 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2823 noreg, 2824 &cas_failed, 2825 /*check without membar and ldarx first*/true); 2826 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2827 2828 // If the compare-and-exchange succeeded, then we found an unlocked 2829 // object and we have now locked it. 2830 b(cont); 2831 2832 bind(cas_failed); 2833 // We did not see an unlocked object so try the fast recursive case. 2834 2835 // Check if the owner is self by comparing the value in the markOop of object 2836 // (current_header) with the stack pointer. 2837 sub(current_header, current_header, R1_SP); 2838 load_const_optimized(temp, ~(os::vm_page_size()-1) | markOopDesc::lock_mask_in_place); 2839 2840 and_(R0/*==0?*/, current_header, temp); 2841 // If condition is true we are cont and hence we can store 0 as the 2842 // displaced header in the box, which indicates that it is a recursive lock. 2843 mcrf(flag,CCR0); 2844 std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box); 2845 2846 // Handle existing monitor. 2847 if ((EmitSync & 0x02) == 0) { 2848 b(cont); 2849 2850 bind(object_has_monitor); 2851 // The object's monitor m is unlocked iff m->owner == NULL, 2852 // otherwise m->owner may contain a thread or a stack address. 2853 2854 #if INCLUDE_RTM_OPT 2855 // Use the same RTM locking code in 32- and 64-bit VM. 2856 if (use_rtm) { 2857 rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header, 2858 rtm_counters, method_data, profile_rtm, cont); 2859 } else { 2860 #endif // INCLUDE_RTM_OPT 2861 2862 // Try to CAS m->owner from NULL to current thread. 
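  // (Inflated-path sketch: temp = &monitor->_owner;
  //  flag = CAS(temp, NULL, R16_thread);
  //  afterwards a non-NULL marker is stored into the box so the unlock path
  //  can tell this apart from a recursive stack lock.)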
2863     addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value);
2864     cmpxchgd(/*flag=*/flag,
2865              /*current_value=*/current_header,
2866              /*compare_value=*/(intptr_t)0,
2867              /*exchange_value=*/R16_thread,
2868              /*where=*/temp,
2869              MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2870              MacroAssembler::cmpxchgx_hint_acquire_lock());
2871
2872     // Store a non-null value into the box.
2873     std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2874
2875 #   ifdef ASSERT
2876     bne(flag, cont);
2877     // We have acquired the monitor, check some invariants.
2878     addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
2879     // Invariant 1: _recursions should be 0.
2880     //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
2881     asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
2882                             "monitor->_recursions should be 0", -1);
2883     // Invariant 2: OwnerIsThread shouldn't be 0.
2884     //assert(ObjectMonitor::OwnerIsThread_size_in_bytes() == 4, "unexpected size");
2885     //asm_assert_mem4_isnot_zero(ObjectMonitor::OwnerIsThread_offset_in_bytes(), temp,
2886     //                           "monitor->OwnerIsThread shouldn't be 0", -1);
2887 #   endif
2888
2889 #if INCLUDE_RTM_OPT
2890   } // use_rtm()
2891 #endif
2892   }
2893
2894   bind(cont);
2895   // flag == EQ indicates success
2896   // flag == NE indicates failure
2897 }
2898
2899 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2900                                                  Register temp, Register displaced_header, Register current_header,
2901                                                  bool try_bias, bool use_rtm) {
2902   assert_different_registers(oop, box, temp, displaced_header, current_header);
2903   assert(flag != CCR0, "bad condition register");
2904   Label cont;
2905   Label object_has_monitor;
2906
2907   // Always do locking in runtime.
2908   if (EmitSync & 0x01) {
2909     cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2910     return;
2911   }
2912
2913   if (try_bias) {
2914     biased_locking_exit(flag, oop, current_header, cont);
2915   }
2916
2917 #if INCLUDE_RTM_OPT
2918   if (UseRTMForStackLocks && use_rtm) {
2919     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2920     Label L_regular_unlock;
2921     ld(current_header, oopDesc::mark_offset_in_bytes(), oop); // fetch markword
2922     andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2923     cmpwi(flag, R0, markOopDesc::unlocked_value); // bits = 001 unlocked
2924     bne(flag, L_regular_unlock); // else RegularLock
2925     tend_(); // otherwise end...
2926     b(cont); // ... and we're done
2927     bind(L_regular_unlock);
2928   }
2929 #endif
2930
2931   // Find the lock address and load the displaced header from the stack.
2932   ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2933
2934   // If the displaced header is 0, we have a recursive unlock.
2935   cmpdi(flag, displaced_header, 0);
2936   beq(flag, cont);
2937
2938   // Handle existing monitor.
2939   if ((EmitSync & 0x02) == 0) {
2940     // The object has an existing monitor iff (mark & monitor_value) != 0.
2941     RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
2942     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2943     andi_(R0, current_header, markOopDesc::monitor_value);
2944     bne(CCR0, object_has_monitor);
2945   }
2946
2947   // Check if it is still a lightweight lock; this is true if we see
2948   // the stack address of the basicLock in the markOop of the object.
2949 // Cmpxchg sets flag to cmpd(current_header, box). 2950 cmpxchgd(/*flag=*/flag, 2951 /*current_value=*/current_header, 2952 /*compare_value=*/box, 2953 /*exchange_value=*/displaced_header, 2954 /*where=*/oop, 2955 MacroAssembler::MemBarRel, 2956 MacroAssembler::cmpxchgx_hint_release_lock(), 2957 noreg, 2958 &cont); 2959 2960 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2961 2962 // Handle existing monitor. 2963 if ((EmitSync & 0x02) == 0) { 2964 b(cont); 2965 2966 bind(object_has_monitor); 2967 addi(current_header, current_header, -markOopDesc::monitor_value); // monitor 2968 ld(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); 2969 2970 // It's inflated. 2971 #if INCLUDE_RTM_OPT 2972 if (use_rtm) { 2973 Label L_regular_inflated_unlock; 2974 // Clean monitor_value bit to get valid pointer 2975 cmpdi(flag, temp, 0); 2976 bne(flag, L_regular_inflated_unlock); 2977 tend_(); 2978 b(cont); 2979 bind(L_regular_inflated_unlock); 2980 } 2981 #endif 2982 2983 ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header); 2984 xorr(temp, R16_thread, temp); // Will be 0 if we are the owner. 2985 orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions. 2986 cmpdi(flag, temp, 0); 2987 bne(flag, cont); 2988 2989 ld(temp, ObjectMonitor::EntryList_offset_in_bytes(), current_header); 2990 ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header); 2991 orr(temp, temp, displaced_header); // Will be 0 if both are 0. 2992 cmpdi(flag, temp, 0); 2993 bne(flag, cont); 2994 release(); 2995 std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); 2996 } 2997 2998 bind(cont); 2999 // flag == EQ indicates success 3000 // flag == NE indicates failure 3001 } 3002 3003 // Write serialization page so VM thread can do a pseudo remote membar. 3004 // We use the current thread pointer to calculate a thread specific 3005 // offset to write to within the page. This minimizes bus traffic 3006 // due to cache line collision. 3007 void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) { 3008 srdi(tmp2, thread, os::get_serialize_page_shift_count()); 3009 3010 int mask = os::vm_page_size() - sizeof(int); 3011 if (Assembler::is_simm(mask, 16)) { 3012 andi(tmp2, tmp2, mask); 3013 } else { 3014 lis(tmp1, (int)((signed short) (mask >> 16))); 3015 ori(tmp1, tmp1, mask & 0x0000ffff); 3016 andr(tmp2, tmp2, tmp1); 3017 } 3018 3019 load_const(tmp1, (long) os::get_memory_serialize_page()); 3020 release(); 3021 stwx(R0, tmp1, tmp2); 3022 } 3023 3024 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp_reg) { 3025 if (SafepointMechanism::uses_thread_local_poll()) { 3026 ld(temp_reg, in_bytes(Thread::polling_page_offset()), R16_thread); 3027 // Armed page has poll_bit set. 3028 andi_(temp_reg, temp_reg, SafepointMechanism::poll_bit()); 3029 } else { 3030 lwz(temp_reg, (RegisterOrConstant)(intptr_t)SafepointSynchronize::address_of_state()); 3031 cmpwi(CCR0, temp_reg, SafepointSynchronize::_not_synchronized); 3032 } 3033 bne(CCR0, slow_path); 3034 } 3035 3036 3037 // GC barrier helper macros 3038 3039 // Write the card table byte if needed. 
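// In pseudocode, the post-barrier below performs (0 being dirty_card_val()):
//   byte_map_base[store_addr >> card_shift] = 0;
// preceded by a StoreStore barrier when CMS requires store ordering.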
3040 void MacroAssembler::card_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp) { 3041 CardTableModRefBS* bs = 3042 barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set()); 3043 assert(bs->kind() == BarrierSet::CardTableForRS || 3044 bs->kind() == BarrierSet::CardTableExtension, "wrong barrier"); 3045 #ifdef ASSERT 3046 cmpdi(CCR0, Rnew_val, 0); 3047 asm_assert_ne("null oop not allowed", 0x321); 3048 #endif 3049 card_table_write(bs->byte_map_base, Rtmp, Rstore_addr); 3050 } 3051 3052 // Write the card table byte. 3053 void MacroAssembler::card_table_write(jbyte* byte_map_base, Register Rtmp, Register Robj) { 3054 assert_different_registers(Robj, Rtmp, R0); 3055 load_const_optimized(Rtmp, (address)byte_map_base, R0); 3056 srdi(Robj, Robj, CardTableModRefBS::card_shift); 3057 li(R0, 0); // dirty 3058 if (UseConcMarkSweepGC) membar(Assembler::StoreStore); 3059 stbx(R0, Rtmp, Robj); 3060 } 3061 3062 // Kills R31 if value is a volatile register. 3063 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, bool needs_frame) { 3064 Label done; 3065 cmpdi(CCR0, value, 0); 3066 beq(CCR0, done); // Use NULL as-is. 3067 3068 clrrdi(tmp1, value, JNIHandles::weak_tag_size); 3069 #if INCLUDE_ALL_GCS 3070 if (UseG1GC) { andi_(tmp2, value, JNIHandles::weak_tag_mask); } 3071 #endif 3072 ld(value, 0, tmp1); // Resolve (untagged) jobject. 3073 3074 #if INCLUDE_ALL_GCS 3075 if (UseG1GC) { 3076 Label not_weak; 3077 beq(CCR0, not_weak); // Test for jweak tag. 3078 verify_oop(value); 3079 g1_write_barrier_pre(noreg, // obj 3080 noreg, // offset 3081 value, // pre_val 3082 tmp1, tmp2, needs_frame); 3083 bind(not_weak); 3084 } 3085 #endif // INCLUDE_ALL_GCS 3086 verify_oop(value); 3087 bind(done); 3088 } 3089 3090 #if INCLUDE_ALL_GCS 3091 // General G1 pre-barrier generator. 3092 // Goal: record the previous value if it is not null. 3093 void MacroAssembler::g1_write_barrier_pre(Register Robj, RegisterOrConstant offset, Register Rpre_val, 3094 Register Rtmp1, Register Rtmp2, bool needs_frame) { 3095 Label runtime, filtered; 3096 3097 // Is marking active? 3098 if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) { 3099 lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread); 3100 } else { 3101 guarantee(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption"); 3102 lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread); 3103 } 3104 cmpdi(CCR0, Rtmp1, 0); 3105 beq(CCR0, filtered); 3106 3107 // Do we need to load the previous value? 3108 if (Robj != noreg) { 3109 // Load the previous value... 3110 if (UseCompressedOops) { 3111 lwz(Rpre_val, offset, Robj); 3112 } else { 3113 ld(Rpre_val, offset, Robj); 3114 } 3115 // Previous value has been loaded into Rpre_val. 3116 } 3117 assert(Rpre_val != noreg, "must have a real register"); 3118 3119 // Is the previous value null? 3120 cmpdi(CCR0, Rpre_val, 0); 3121 beq(CCR0, filtered); 3122 3123 if (Robj != noreg && UseCompressedOops) { 3124 decode_heap_oop_not_null(Rpre_val); 3125 } 3126 3127 // OK, it's not filtered, so we'll need to call enqueue. In the normal 3128 // case, pre_val will be a scratch G-reg, but there are some cases in 3129 // which it's an O-reg. In the first case, do a normal call. In the 3130 // latter, do a save here and call the frameless version. 3131 3132 // Can we store original value in the thread's buffer? 3133 // Is index == 0? 
3134 // (The index field is typed as size_t.) 3135 const Register Rbuffer = Rtmp1, Rindex = Rtmp2; 3136 3137 ld(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread); 3138 cmpdi(CCR0, Rindex, 0); 3139 beq(CCR0, runtime); // If index == 0, goto runtime. 3140 ld(Rbuffer, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_buf()), R16_thread); 3141 3142 addi(Rindex, Rindex, -wordSize); // Decrement index. 3143 std(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread); 3144 3145 // Record the previous value. 3146 stdx(Rpre_val, Rbuffer, Rindex); 3147 b(filtered); 3148 3149 bind(runtime); 3150 3151 // May need to preserve LR. Also needed if current frame is not compatible with C calling convention. 3152 if (needs_frame) { 3153 save_LR_CR(Rtmp1); 3154 push_frame_reg_args(0, Rtmp2); 3155 } 3156 3157 if (Rpre_val->is_volatile() && Robj == noreg) mr(R31, Rpre_val); // Save pre_val across C call if it was preloaded. 3158 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), Rpre_val, R16_thread); 3159 if (Rpre_val->is_volatile() && Robj == noreg) mr(Rpre_val, R31); // restore 3160 3161 if (needs_frame) { 3162 pop_frame(); 3163 restore_LR_CR(Rtmp1); 3164 } 3165 3166 bind(filtered); 3167 } 3168 3169 // General G1 post-barrier generator 3170 // Store cross-region card. 3171 void MacroAssembler::g1_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp1, Register Rtmp2, Register Rtmp3, Label *filtered_ext) { 3172 Label runtime, filtered_int; 3173 Label& filtered = (filtered_ext != NULL) ? *filtered_ext : filtered_int; 3174 assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2); 3175 3176 G1SATBCardTableLoggingModRefBS* bs = 3177 barrier_set_cast<G1SATBCardTableLoggingModRefBS>(Universe::heap()->barrier_set()); 3178 3179 // Does store cross heap regions? 3180 if (G1RSBarrierRegionFilter) { 3181 xorr(Rtmp1, Rstore_addr, Rnew_val); 3182 srdi_(Rtmp1, Rtmp1, HeapRegion::LogOfHRGrainBytes); 3183 beq(CCR0, filtered); 3184 } 3185 3186 // Crosses regions, storing NULL? 3187 #ifdef ASSERT 3188 cmpdi(CCR0, Rnew_val, 0); 3189 asm_assert_ne("null oop not allowed (G1)", 0x322); // Checked by caller on PPC64, so following branch is obsolete: 3190 //beq(CCR0, filtered); 3191 #endif 3192 3193 // Storing region crossing non-NULL, is card already dirty? 3194 assert(sizeof(*bs->byte_map_base) == sizeof(jbyte), "adjust this code"); 3195 const Register Rcard_addr = Rtmp1; 3196 Register Rbase = Rtmp2; 3197 load_const_optimized(Rbase, (address)bs->byte_map_base, /*temp*/ Rtmp3); 3198 3199 srdi(Rcard_addr, Rstore_addr, CardTableModRefBS::card_shift); 3200 3201 // Get the address of the card. 3202 lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr); 3203 cmpwi(CCR0, Rtmp3, (int)G1SATBCardTableModRefBS::g1_young_card_val()); 3204 beq(CCR0, filtered); 3205 3206 membar(Assembler::StoreLoad); 3207 lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr); // Reload after membar. 3208 cmpwi(CCR0, Rtmp3 /* card value */, CardTableModRefBS::dirty_card_val()); 3209 beq(CCR0, filtered); 3210 3211 // Storing a region crossing, non-NULL oop, card is clean. 3212 // Dirty card and log. 3213 li(Rtmp3, CardTableModRefBS::dirty_card_val()); 3214 //release(); // G1: oops are allowed to get visible after dirty marking. 3215 stbx(Rtmp3, Rbase, Rcard_addr); 3216 3217 add(Rcard_addr, Rbase, Rcard_addr); // This is the address which needs to get enqueued. 
  Rbase = noreg; // end of lifetime

  const Register Rqueue_index = Rtmp2,
                 Rqueue_buf   = Rtmp3;
  ld(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);
  cmpdi(CCR0, Rqueue_index, 0);
  beq(CCR0, runtime); // index == 0 then jump to runtime
  ld(Rqueue_buf, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_buf()), R16_thread);

  addi(Rqueue_index, Rqueue_index, -wordSize); // decrement index
  std(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);

  stdx(Rcard_addr, Rqueue_buf, Rqueue_index); // store card
  b(filtered);

  bind(runtime);

  // Save the live input values.
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), Rcard_addr, R16_thread);

  bind(filtered_int);
}
#endif // INCLUDE_ALL_GCS

// Values for last_Java_pc and last_Java_sp must comply with the rules
// in frame_ppc.hpp.
void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
  // Always set last_Java_pc and flags first because once last_Java_sp
  // is visible, has_last_Java_frame is true and users will look at the
  // rest of the fields. (Note: flags should always be zero before we
  // get here, so they don't need to be set.)

  // Verify that last_Java_pc was zeroed on return to Java
  asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
                          "last_Java_pc not zeroed before leaving Java", 0x200);

  // When returning from calling out from Java mode, the frame anchor's
  // last_Java_pc will always be set to NULL. It is set here so that
  // if we are doing a call to native (not VM) we capture the
  // known pc and don't have to rely on the native call having a
  // standard frame linkage where we can find the pc.
  if (last_Java_pc != noreg)
    std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);

  // Set last_Java_sp last.
  std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
}

void MacroAssembler::reset_last_Java_frame(void) {
  asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
                             R16_thread, "SP was not set, still zero", 0x202);

  BLOCK_COMMENT("reset_last_Java_frame {");
  li(R0, 0);

  // _last_Java_sp = 0
  std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);

  // _last_Java_pc = 0
  std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
  BLOCK_COMMENT("} reset_last_Java_frame");
}

void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
  assert_different_registers(sp, tmp1);

  // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
  // TOP_IJAVA_FRAME_ABI.
  // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
3287 address entry = pc(); 3288 load_const_optimized(tmp1, entry); 3289 3290 set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1); 3291 } 3292 3293 void MacroAssembler::get_vm_result(Register oop_result) { 3294 // Read: 3295 // R16_thread 3296 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3297 // 3298 // Updated: 3299 // oop_result 3300 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3301 3302 verify_thread(); 3303 3304 ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3305 li(R0, 0); 3306 std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3307 3308 verify_oop(oop_result); 3309 } 3310 3311 void MacroAssembler::get_vm_result_2(Register metadata_result) { 3312 // Read: 3313 // R16_thread 3314 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3315 // 3316 // Updated: 3317 // metadata_result 3318 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3319 3320 ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3321 li(R0, 0); 3322 std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3323 } 3324 3325 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) { 3326 Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided. 3327 if (Universe::narrow_klass_base() != 0) { 3328 // Use dst as temp if it is free. 3329 sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0); 3330 current = dst; 3331 } 3332 if (Universe::narrow_klass_shift() != 0) { 3333 srdi(dst, current, Universe::narrow_klass_shift()); 3334 current = dst; 3335 } 3336 return current; 3337 } 3338 3339 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) { 3340 if (UseCompressedClassPointers) { 3341 Register compressedKlass = encode_klass_not_null(ck, klass); 3342 stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop); 3343 } else { 3344 std(klass, oopDesc::klass_offset_in_bytes(), dst_oop); 3345 } 3346 } 3347 3348 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) { 3349 if (UseCompressedClassPointers) { 3350 if (val == noreg) { 3351 val = R0; 3352 li(val, 0); 3353 } 3354 stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed 3355 } 3356 } 3357 3358 int MacroAssembler::instr_size_for_decode_klass_not_null() { 3359 if (!UseCompressedClassPointers) return 0; 3360 int num_instrs = 1; // shift or move 3361 if (Universe::narrow_klass_base() != 0) num_instrs = 7; // shift + load const + add 3362 return num_instrs * BytesPerInstWord; 3363 } 3364 3365 void MacroAssembler::decode_klass_not_null(Register dst, Register src) { 3366 assert(dst != R0, "Dst reg may not be R0, as R0 is used here."); 3367 if (src == noreg) src = dst; 3368 Register shifted_src = src; 3369 if (Universe::narrow_klass_shift() != 0 || 3370 Universe::narrow_klass_base() == 0 && src != dst) { // Move required. 3371 shifted_src = dst; 3372 sldi(shifted_src, src, Universe::narrow_klass_shift()); 3373 } 3374 if (Universe::narrow_klass_base() != 0) { 3375 add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0); 3376 } 3377 } 3378 3379 void MacroAssembler::load_klass(Register dst, Register src) { 3380 if (UseCompressedClassPointers) { 3381 lwz(dst, oopDesc::klass_offset_in_bytes(), src); 3382 // Attention: no null check here! 
3383 decode_klass_not_null(dst, dst); 3384 } else { 3385 ld(dst, oopDesc::klass_offset_in_bytes(), src); 3386 } 3387 } 3388 3389 // ((OopHandle)result).resolve(); 3390 void MacroAssembler::resolve_oop_handle(Register result) { 3391 // OopHandle::resolve is an indirection. 3392 ld(result, 0, result); 3393 } 3394 3395 void MacroAssembler::load_mirror_from_const_method(Register mirror, Register const_method) { 3396 ld(mirror, in_bytes(ConstMethod::constants_offset()), const_method); 3397 ld(mirror, ConstantPool::pool_holder_offset_in_bytes(), mirror); 3398 ld(mirror, in_bytes(Klass::java_mirror_offset()), mirror); 3399 resolve_oop_handle(mirror); 3400 } 3401 3402 // Clear Array 3403 // For very short arrays. tmp == R0 is allowed. 3404 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) { 3405 if (cnt_dwords > 0) { li(tmp, 0); } 3406 for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); } 3407 } 3408 3409 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed. 3410 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) { 3411 if (cnt_dwords < 8) { 3412 clear_memory_unrolled(base_ptr, cnt_dwords, tmp); 3413 return; 3414 } 3415 3416 Label loop; 3417 const long loopcnt = cnt_dwords >> 1, 3418 remainder = cnt_dwords & 1; 3419 3420 li(tmp, loopcnt); 3421 mtctr(tmp); 3422 li(tmp, 0); 3423 bind(loop); 3424 std(tmp, 0, base_ptr); 3425 std(tmp, 8, base_ptr); 3426 addi(base_ptr, base_ptr, 16); 3427 bdnz(loop); 3428 if (remainder) { std(tmp, 0, base_ptr); } 3429 } 3430 3431 // Kills both input registers. tmp == R0 is allowed. 3432 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) { 3433 // Procedure for large arrays (uses data cache block zero instruction). 3434 Label startloop, fast, fastloop, small_rest, restloop, done; 3435 const int cl_size = VM_Version::L1_data_cache_line_size(), 3436 cl_dwords = cl_size >> 3, 3437 cl_dw_addr_bits = exact_log2(cl_dwords), 3438 dcbz_min = 1, // Min count of dcbz executions, needs to be >0. 3439 min_cnt = ((dcbz_min + 1) << cl_dw_addr_bits) - 1; 3440 3441 if (const_cnt >= 0) { 3442 // Constant case. 3443 if (const_cnt < min_cnt) { 3444 clear_memory_constlen(base_ptr, const_cnt, tmp); 3445 return; 3446 } 3447 load_const_optimized(cnt_dwords, const_cnt, tmp); 3448 } else { 3449 // cnt_dwords already loaded in register. Need to check size. 3450 cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included). 3451 blt(CCR1, small_rest); 3452 } 3453 rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line. 3454 beq(CCR0, fast); // Already 128byte aligned. 3455 3456 subfic(tmp, tmp, cl_dwords); 3457 mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords). 3458 subf(cnt_dwords, tmp, cnt_dwords); // rest. 3459 li(tmp, 0); 3460 3461 bind(startloop); // Clear at the beginning to reach 128byte boundary. 3462 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3463 addi(base_ptr, base_ptr, 8); 3464 bdnz(startloop); 3465 3466 bind(fast); // Clear 128byte blocks. 3467 srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0). 3468 andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords. 3469 mtctr(tmp); // Load counter. 3470 3471 bind(fastloop); 3472 dcbz(base_ptr); // Clear 128byte aligned block. 
3473 addi(base_ptr, base_ptr, cl_size); 3474 bdnz(fastloop); 3475 3476 bind(small_rest); 3477 cmpdi(CCR0, cnt_dwords, 0); // size 0? 3478 beq(CCR0, done); // rest == 0 3479 li(tmp, 0); 3480 mtctr(cnt_dwords); // Load counter. 3481 3482 bind(restloop); // Clear rest. 3483 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3484 addi(base_ptr, base_ptr, 8); 3485 bdnz(restloop); 3486 3487 bind(done); 3488 } 3489 3490 /////////////////////////////////////////// String intrinsics //////////////////////////////////////////// 3491 3492 #ifdef COMPILER2 3493 // Intrinsics for CompactStrings 3494 3495 // Compress char[] to byte[] by compressing 16 bytes at once. 3496 void MacroAssembler::string_compress_16(Register src, Register dst, Register cnt, 3497 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, 3498 Label& Lfailure) { 3499 3500 const Register tmp0 = R0; 3501 assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5); 3502 Label Lloop, Lslow; 3503 3504 // Check if cnt >= 8 (= 16 bytes) 3505 lis(tmp1, 0xFF); // tmp1 = 0x00FF00FF00FF00FF 3506 srwi_(tmp2, cnt, 3); 3507 beq(CCR0, Lslow); 3508 ori(tmp1, tmp1, 0xFF); 3509 rldimi(tmp1, tmp1, 32, 0); 3510 mtctr(tmp2); 3511 3512 // 2x unrolled loop 3513 bind(Lloop); 3514 ld(tmp2, 0, src); // _0_1_2_3 (Big Endian) 3515 ld(tmp4, 8, src); // _4_5_6_7 3516 3517 orr(tmp0, tmp2, tmp4); 3518 rldicl(tmp3, tmp2, 6*8, 64-24); // _____1_2 3519 rldimi(tmp2, tmp2, 2*8, 2*8); // _0_2_3_3 3520 rldicl(tmp5, tmp4, 6*8, 64-24); // _____5_6 3521 rldimi(tmp4, tmp4, 2*8, 2*8); // _4_6_7_7 3522 3523 andc_(tmp0, tmp0, tmp1); 3524 bne(CCR0, Lfailure); // Not latin1. 3525 addi(src, src, 16); 3526 3527 rlwimi(tmp3, tmp2, 0*8, 24, 31);// _____1_3 3528 srdi(tmp2, tmp2, 3*8); // ____0_2_ 3529 rlwimi(tmp5, tmp4, 0*8, 24, 31);// _____5_7 3530 srdi(tmp4, tmp4, 3*8); // ____4_6_ 3531 3532 orr(tmp2, tmp2, tmp3); // ____0123 3533 orr(tmp4, tmp4, tmp5); // ____4567 3534 3535 stw(tmp2, 0, dst); 3536 stw(tmp4, 4, dst); 3537 addi(dst, dst, 8); 3538 bdnz(Lloop); 3539 3540 bind(Lslow); // Fallback to slow version 3541 } 3542 3543 // Compress char[] to byte[]. cnt must be positive int. 3544 void MacroAssembler::string_compress(Register src, Register dst, Register cnt, Register tmp, Label& Lfailure) { 3545 Label Lloop; 3546 mtctr(cnt); 3547 3548 bind(Lloop); 3549 lhz(tmp, 0, src); 3550 cmplwi(CCR0, tmp, 0xff); 3551 bgt(CCR0, Lfailure); // Not latin1. 3552 addi(src, src, 2); 3553 stb(tmp, 0, dst); 3554 addi(dst, dst, 1); 3555 bdnz(Lloop); 3556 } 3557 3558 // Inflate byte[] to char[] by inflating 16 bytes at once. 
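// For reference, the 16-byte-at-a-time variant below is equivalent to this
// scalar loop (a sketch; src/dst shown as raw pointers, cnt in characters):
//
//   for (int i = 0; i < cnt; i++) {
//     dst[i] = (jchar)(src[i] & 0xff);   // zero-extend each latin1 byte
//   }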
3559 void MacroAssembler::string_inflate_16(Register src, Register dst, Register cnt, 3560 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) { 3561 const Register tmp0 = R0; 3562 assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5); 3563 Label Lloop, Lslow; 3564 3565 // Check if cnt >= 8 3566 srwi_(tmp2, cnt, 3); 3567 beq(CCR0, Lslow); 3568 lis(tmp1, 0xFF); // tmp1 = 0x00FF00FF 3569 ori(tmp1, tmp1, 0xFF); 3570 mtctr(tmp2); 3571 3572 // 2x unrolled loop 3573 bind(Lloop); 3574 lwz(tmp2, 0, src); // ____0123 (Big Endian) 3575 lwz(tmp4, 4, src); // ____4567 3576 addi(src, src, 8); 3577 3578 rldicl(tmp3, tmp2, 7*8, 64-8); // _______2 3579 rlwimi(tmp2, tmp2, 3*8, 16, 23);// ____0113 3580 rldicl(tmp5, tmp4, 7*8, 64-8); // _______6 3581 rlwimi(tmp4, tmp4, 3*8, 16, 23);// ____4557 3582 3583 andc(tmp0, tmp2, tmp1); // ____0_1_ 3584 rlwimi(tmp2, tmp3, 2*8, 0, 23); // _____2_3 3585 andc(tmp3, tmp4, tmp1); // ____4_5_ 3586 rlwimi(tmp4, tmp5, 2*8, 0, 23); // _____6_7 3587 3588 rldimi(tmp2, tmp0, 3*8, 0*8); // _0_1_2_3 3589 rldimi(tmp4, tmp3, 3*8, 0*8); // _4_5_6_7 3590 3591 std(tmp2, 0, dst); 3592 std(tmp4, 8, dst); 3593 addi(dst, dst, 16); 3594 bdnz(Lloop); 3595 3596 bind(Lslow); // Fallback to slow version 3597 } 3598 3599 // Inflate byte[] to char[]. cnt must be positive int. 3600 void MacroAssembler::string_inflate(Register src, Register dst, Register cnt, Register tmp) { 3601 Label Lloop; 3602 mtctr(cnt); 3603 3604 bind(Lloop); 3605 lbz(tmp, 0, src); 3606 addi(src, src, 1); 3607 sth(tmp, 0, dst); 3608 addi(dst, dst, 2); 3609 bdnz(Lloop); 3610 } 3611 3612 void MacroAssembler::string_compare(Register str1, Register str2, 3613 Register cnt1, Register cnt2, 3614 Register tmp1, Register result, int ae) { 3615 const Register tmp0 = R0, 3616 diff = tmp1; 3617 3618 assert_different_registers(str1, str2, cnt1, cnt2, tmp0, tmp1, result); 3619 Label Ldone, Lslow, Lloop, Lreturn_diff; 3620 3621 // Note: Making use of the fact that compareTo(a, b) == -compareTo(b, a) 3622 // we interchange str1 and str2 in the UL case and negate the result. 3623 // Like this, str1 is always latin1 encoded, except for the UU case. 3624 // In addition, we need 0 (or sign which is 0) extend. 3625 3626 if (ae == StrIntrinsicNode::UU) { 3627 srwi(cnt1, cnt1, 1); 3628 } else { 3629 clrldi(cnt1, cnt1, 32); 3630 } 3631 3632 if (ae != StrIntrinsicNode::LL) { 3633 srwi(cnt2, cnt2, 1); 3634 } else { 3635 clrldi(cnt2, cnt2, 32); 3636 } 3637 3638 // See if the lengths are different, and calculate min in cnt1. 3639 // Save diff in case we need it for a tie-breaker. 3640 subf_(diff, cnt2, cnt1); // diff = cnt1 - cnt2 3641 // if (diff > 0) { cnt1 = cnt2; } 3642 if (VM_Version::has_isel()) { 3643 isel(cnt1, CCR0, Assembler::greater, /*invert*/ false, cnt2); 3644 } else { 3645 Label Lskip; 3646 blt(CCR0, Lskip); 3647 mr(cnt1, cnt2); 3648 bind(Lskip); 3649 } 3650 3651 // Rename registers 3652 Register chr1 = result; 3653 Register chr2 = tmp0; 3654 3655 // Compare multiple characters in fast loop (only implemented for same encoding). 3656 int stride1 = 8, stride2 = 8; 3657 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3658 int log2_chars_per_iter = (ae == StrIntrinsicNode::LL) ? 3 : 2; 3659 Label Lfastloop, Lskipfast; 3660 3661 srwi_(tmp0, cnt1, log2_chars_per_iter); 3662 beq(CCR0, Lskipfast); 3663 rldicl(cnt2, cnt1, 0, 64 - log2_chars_per_iter); // Remaining characters. 
3664 li(cnt1, 1 << log2_chars_per_iter); // Initialize for failure case: Rescan characters from current iteration. 3665 mtctr(tmp0); 3666 3667 bind(Lfastloop); 3668 ld(chr1, 0, str1); 3669 ld(chr2, 0, str2); 3670 cmpd(CCR0, chr1, chr2); 3671 bne(CCR0, Lslow); 3672 addi(str1, str1, stride1); 3673 addi(str2, str2, stride2); 3674 bdnz(Lfastloop); 3675 mr(cnt1, cnt2); // Remaining characters. 3676 bind(Lskipfast); 3677 } 3678 3679 // Loop which searches the first difference character by character. 3680 cmpwi(CCR0, cnt1, 0); 3681 beq(CCR0, Lreturn_diff); 3682 bind(Lslow); 3683 mtctr(cnt1); 3684 3685 switch (ae) { 3686 case StrIntrinsicNode::LL: stride1 = 1; stride2 = 1; break; 3687 case StrIntrinsicNode::UL: // fallthru (see comment above) 3688 case StrIntrinsicNode::LU: stride1 = 1; stride2 = 2; break; 3689 case StrIntrinsicNode::UU: stride1 = 2; stride2 = 2; break; 3690 default: ShouldNotReachHere(); break; 3691 } 3692 3693 bind(Lloop); 3694 if (stride1 == 1) { lbz(chr1, 0, str1); } else { lhz(chr1, 0, str1); } 3695 if (stride2 == 1) { lbz(chr2, 0, str2); } else { lhz(chr2, 0, str2); } 3696 subf_(result, chr2, chr1); // result = chr1 - chr2 3697 bne(CCR0, Ldone); 3698 addi(str1, str1, stride1); 3699 addi(str2, str2, stride2); 3700 bdnz(Lloop); 3701 3702 // If strings are equal up to min length, return the length difference. 3703 bind(Lreturn_diff); 3704 mr(result, diff); 3705 3706 // Otherwise, return the difference between the first mismatched chars. 3707 bind(Ldone); 3708 if (ae == StrIntrinsicNode::UL) { 3709 neg(result, result); // Negate result (see note above). 3710 } 3711 } 3712 3713 void MacroAssembler::array_equals(bool is_array_equ, Register ary1, Register ary2, 3714 Register limit, Register tmp1, Register result, bool is_byte) { 3715 const Register tmp0 = R0; 3716 assert_different_registers(ary1, ary2, limit, tmp0, tmp1, result); 3717 Label Ldone, Lskiploop, Lloop, Lfastloop, Lskipfast; 3718 bool limit_needs_shift = false; 3719 3720 if (is_array_equ) { 3721 const int length_offset = arrayOopDesc::length_offset_in_bytes(); 3722 const int base_offset = arrayOopDesc::base_offset_in_bytes(is_byte ? T_BYTE : T_CHAR); 3723 3724 // Return true if the same array. 3725 cmpd(CCR0, ary1, ary2); 3726 beq(CCR0, Lskiploop); 3727 3728 // Return false if one of them is NULL. 3729 cmpdi(CCR0, ary1, 0); 3730 cmpdi(CCR1, ary2, 0); 3731 li(result, 0); 3732 cror(CCR0, Assembler::equal, CCR1, Assembler::equal); 3733 beq(CCR0, Ldone); 3734 3735 // Load the lengths of arrays. 3736 lwz(limit, length_offset, ary1); 3737 lwz(tmp0, length_offset, ary2); 3738 3739 // Return false if the two arrays are not equal length. 3740 cmpw(CCR0, limit, tmp0); 3741 bne(CCR0, Ldone); 3742 3743 // Load array addresses. 3744 addi(ary1, ary1, base_offset); 3745 addi(ary2, ary2, base_offset); 3746 } else { 3747 limit_needs_shift = !is_byte; 3748 li(result, 0); // Assume not equal. 3749 } 3750 3751 // Rename registers 3752 Register chr1 = tmp0; 3753 Register chr2 = tmp1; 3754 3755 // Compare 8 bytes per iteration in fast loop. 3756 const int log2_chars_per_iter = is_byte ? 3 : 2; 3757 3758 srwi_(tmp0, limit, log2_chars_per_iter + (limit_needs_shift ? 1 : 0)); 3759 beq(CCR0, Lskipfast); 3760 mtctr(tmp0); 3761 3762 bind(Lfastloop); 3763 ld(chr1, 0, ary1); 3764 ld(chr2, 0, ary2); 3765 addi(ary1, ary1, 8); 3766 addi(ary2, ary2, 8); 3767 cmpd(CCR0, chr1, chr2); 3768 bne(CCR0, Ldone); 3769 bdnz(Lfastloop); 3770 3771 bind(Lskipfast); 3772 rldicl_(limit, limit, limit_needs_shift ? 
64 - 1 : 0, 64 - log2_chars_per_iter); // Remaining characters. 3773 beq(CCR0, Lskiploop); 3774 mtctr(limit); 3775 3776 // Character by character. 3777 bind(Lloop); 3778 if (is_byte) { 3779 lbz(chr1, 0, ary1); 3780 lbz(chr2, 0, ary2); 3781 addi(ary1, ary1, 1); 3782 addi(ary2, ary2, 1); 3783 } else { 3784 lhz(chr1, 0, ary1); 3785 lhz(chr2, 0, ary2); 3786 addi(ary1, ary1, 2); 3787 addi(ary2, ary2, 2); 3788 } 3789 cmpw(CCR0, chr1, chr2); 3790 bne(CCR0, Ldone); 3791 bdnz(Lloop); 3792 3793 bind(Lskiploop); 3794 li(result, 1); // All characters are equal. 3795 bind(Ldone); 3796 } 3797 3798 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt, 3799 Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval, 3800 Register tmp1, Register tmp2, Register tmp3, Register tmp4, int ae) { 3801 3802 // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite! 3803 Label L_TooShort, L_Found, L_NotFound, L_End; 3804 Register last_addr = haycnt, // Kill haycnt at the beginning. 3805 addr = tmp1, 3806 n_start = tmp2, 3807 ch1 = tmp3, 3808 ch2 = R0; 3809 3810 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 3811 const int h_csize = (ae == StrIntrinsicNode::LL) ? 1 : 2; 3812 const int n_csize = (ae == StrIntrinsicNode::UU) ? 2 : 1; 3813 3814 // ************************************************************************************************** 3815 // Prepare for main loop: optimized for needle count >=2, bail out otherwise. 3816 // ************************************************************************************************** 3817 3818 // Compute last haystack addr to use if no match gets found. 3819 clrldi(haycnt, haycnt, 32); // Ensure positive int is valid as 64 bit value. 3820 addi(addr, haystack, -h_csize); // Accesses use pre-increment. 3821 if (needlecntval == 0) { // variable needlecnt 3822 cmpwi(CCR6, needlecnt, 2); 3823 clrldi(needlecnt, needlecnt, 32); // Ensure positive int is valid as 64 bit value. 3824 blt(CCR6, L_TooShort); // Variable needlecnt: handle short needle separately. 3825 } 3826 3827 if (n_csize == 2) { lwz(n_start, 0, needle); } else { lhz(n_start, 0, needle); } // Load first 2 characters of needle. 3828 3829 if (needlecntval == 0) { // variable needlecnt 3830 subf(ch1, needlecnt, haycnt); // Last character index to compare is haycnt-needlecnt. 3831 addi(needlecnt, needlecnt, -2); // Rest of needle. 3832 } else { // constant needlecnt 3833 guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately"); 3834 assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate"); 3835 addi(ch1, haycnt, -needlecntval); // Last character index to compare is haycnt-needlecnt. 3836 if (needlecntval > 3) { li(needlecnt, needlecntval - 2); } // Rest of needle. 3837 } 3838 3839 if (h_csize == 2) { slwi(ch1, ch1, 1); } // Scale to number of bytes. 3840 3841 if (ae ==StrIntrinsicNode::UL) { 3842 srwi(tmp4, n_start, 1*8); // ___0 3843 rlwimi(n_start, tmp4, 2*8, 0, 23); // _0_1 3844 } 3845 3846 add(last_addr, haystack, ch1); // Point to last address to compare (haystack+2*(haycnt-needlecnt)). 3847 3848 // Main Loop (now we have at least 2 characters). 3849 Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2; 3850 bind(L_OuterLoop); // Search for 1st 2 characters. 3851 Register addr_diff = tmp4; 3852 subf(addr_diff, addr, last_addr); // Difference between already checked address and last address to check. 3853 addi(addr, addr, h_csize); // This is the new address we want to use for comparing. 
3854 srdi_(ch2, addr_diff, h_csize); 3855 beq(CCR0, L_FinalCheck); // 2 characters left? 3856 mtctr(ch2); // num of characters / 2 3857 bind(L_InnerLoop); // Main work horse (2x unrolled search loop) 3858 if (h_csize == 2) { // Load 2 characters of haystack (ignore alignment). 3859 lwz(ch1, 0, addr); 3860 lwz(ch2, 2, addr); 3861 } else { 3862 lhz(ch1, 0, addr); 3863 lhz(ch2, 1, addr); 3864 } 3865 cmpw(CCR0, ch1, n_start); // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop). 3866 cmpw(CCR1, ch2, n_start); 3867 beq(CCR0, L_Comp1); // Did we find the needle start? 3868 beq(CCR1, L_Comp2); 3869 addi(addr, addr, 2 * h_csize); 3870 bdnz(L_InnerLoop); 3871 bind(L_FinalCheck); 3872 andi_(addr_diff, addr_diff, h_csize); // Remaining characters not covered by InnerLoop: (num of characters) & 1. 3873 beq(CCR0, L_NotFound); 3874 if (h_csize == 2) { lwz(ch1, 0, addr); } else { lhz(ch1, 0, addr); } // One position left at which we have to compare. 3875 cmpw(CCR1, ch1, n_start); 3876 beq(CCR1, L_Comp1); 3877 bind(L_NotFound); 3878 li(result, -1); // not found 3879 b(L_End); 3880 3881 // ************************************************************************************************** 3882 // Special Case: unfortunately, the variable needle case can be called with needlecnt<2 3883 // ************************************************************************************************** 3884 if (needlecntval == 0) { // We have to handle these cases separately. 3885 Label L_OneCharLoop; 3886 bind(L_TooShort); 3887 mtctr(haycnt); 3888 if (n_csize == 2) { lhz(n_start, 0, needle); } else { lbz(n_start, 0, needle); } // First character of needle 3889 bind(L_OneCharLoop); 3890 if (h_csize == 2) { lhzu(ch1, 2, addr); } else { lbzu(ch1, 1, addr); } 3891 cmpw(CCR1, ch1, n_start); 3892 beq(CCR1, L_Found); // Did we find the one character needle? 3893 bdnz(L_OneCharLoop); 3894 li(result, -1); // Not found. 3895 b(L_End); 3896 } 3897 3898 // ************************************************************************************************** 3899 // Regular Case Part II: compare rest of needle (first 2 characters have been compared already) 3900 // ************************************************************************************************** 3901 3902 // Compare the rest 3903 bind(L_Comp2); 3904 addi(addr, addr, h_csize); // First comparison has failed, 2nd one hit. 3905 bind(L_Comp1); // Addr points to possible needle start. 3906 if (needlecntval != 2) { // Const needlecnt==2? 3907 if (needlecntval != 3) { 3908 if (needlecntval == 0) { beq(CCR6, L_Found); } // Variable needlecnt==2? 3909 Register n_ind = tmp4, 3910 h_ind = n_ind; 3911 li(n_ind, 2 * n_csize); // First 2 characters are already compared, use index 2. 3912 mtctr(needlecnt); // Decremented by 2, still > 0. 3913 Label L_CompLoop; 3914 bind(L_CompLoop); 3915 if (ae ==StrIntrinsicNode::UL) { 3916 h_ind = ch1; 3917 sldi(h_ind, n_ind, 1); 3918 } 3919 if (n_csize == 2) { lhzx(ch2, needle, n_ind); } else { lbzx(ch2, needle, n_ind); } 3920 if (h_csize == 2) { lhzx(ch1, addr, h_ind); } else { lbzx(ch1, addr, h_ind); } 3921 cmpw(CCR1, ch1, ch2); 3922 bne(CCR1, L_OuterLoop); 3923 addi(n_ind, n_ind, n_csize); 3924 bdnz(L_CompLoop); 3925 } else { // No loop required if there's only one needle character left. 
    if (n_csize == 2) { lhz(ch2, 2 * 2, needle); } else { lbz(ch2, 2 * 1, needle); }
    if (h_csize == 2) { lhz(ch1, 2 * 2, addr); } else { lbz(ch1, 2 * 1, addr); }
    cmpw(CCR1, ch1, ch2);
    bne(CCR1, L_OuterLoop);
   }
  }
  // Return index ...
  bind(L_Found);
  subf(result, haystack, addr);                  // relative to haystack, ...
  if (h_csize == 2) { srdi(result, result, 1); } // in characters.
  bind(L_End);
} // string_indexof

void MacroAssembler::string_indexof_char(Register result, Register haystack, Register haycnt,
                                         Register needle, jchar needleChar, Register tmp1, Register tmp2, bool is_byte) {
  assert_different_registers(haystack, haycnt, needle, tmp1, tmp2);

  Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_NotFound, L_End;
  Register addr = tmp1,
           ch1 = tmp2,
           ch2 = R0;

  const int h_csize = is_byte ? 1 : 2;

//4:
  srwi_(tmp2, haycnt, 1);   // Shift right by exact_log2(UNROLL_FACTOR).
  mr(addr, haystack);
  beq(CCR0, L_FinalCheck);
  mtctr(tmp2);              // Move to count register.
//8:
  bind(L_InnerLoop);        // Main work horse (2x unrolled search loop).
  if (!is_byte) {
    lhz(ch1, 0, addr);
    lhz(ch2, 2, addr);
  } else {
    lbz(ch1, 0, addr);
    lbz(ch2, 1, addr);
  }
  (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, (unsigned int)needleChar);
  (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, (unsigned int)needleChar);
  beq(CCR0, L_Found1);      // Did we find the needle?
  beq(CCR1, L_Found2);
  addi(addr, addr, 2 * h_csize);
  bdnz(L_InnerLoop);
//16:
  bind(L_FinalCheck);
  andi_(R0, haycnt, 1);
  beq(CCR0, L_NotFound);
  if (!is_byte) { lhz(ch1, 0, addr); } else { lbz(ch1, 0, addr); } // One position left at which we have to compare.
  (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, (unsigned int)needleChar);
  beq(CCR1, L_Found1);
//21:
  bind(L_NotFound);
  li(result, -1);           // Not found.
  b(L_End);

  bind(L_Found2);
  addi(addr, addr, h_csize);
//24:
  bind(L_Found1);                 // Return index ...
  subf(result, haystack, addr);   // relative to haystack, ...
  if (!is_byte) { srdi(result, result, 1); } // in characters.
  bind(L_End);
} // string_indexof_char


void MacroAssembler::has_negatives(Register src, Register cnt, Register result,
                                   Register tmp1, Register tmp2) {
  const Register tmp0 = R0;
  assert_different_registers(src, result, cnt, tmp0, tmp1, tmp2);
  Label Lfastloop, Lslow, Lloop, Lnoneg, Ldone;

  // Check if cnt >= 16 bytes (the fast loop processes 16 bytes per iteration).
  lis(tmp1, (int)(short)0x8080);  // tmp1 = 0x8080808080808080
  srwi_(tmp2, cnt, 4);
  li(result, 1);                  // Assume there's a negative byte.
  beq(CCR0, Lslow);
  ori(tmp1, tmp1, 0x8080);
  rldimi(tmp1, tmp1, 32, 0);
  mtctr(tmp2);

  // 2x unrolled loop
  bind(Lfastloop);
  ld(tmp2, 0, src);
  ld(tmp0, 8, src);

  orr(tmp0, tmp2, tmp0);

  and_(tmp0, tmp0, tmp1);
  bne(CCR0, Ldone);               // Found negative byte.
  addi(src, src, 16);

  bdnz(Lfastloop);

  bind(Lslow);                    // Fallback to slow version
  rldicl_(tmp0, cnt, 0, 64-4);
  beq(CCR0, Lnoneg);
  mtctr(tmp0);
  bind(Lloop);
  lbz(tmp0, 0, src);
  addi(src, src, 1);
  andi_(tmp0, tmp0, 0x80);
  bne(CCR0, Ldone);               // Found negative byte.
  bdnz(Lloop);
  bind(Lnoneg);
  li(result, 0);

  bind(Ldone);
}
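// Scalar equivalent of has_negatives, for reference (a sketch only):
//
//   bool has_negatives(const jbyte* src, int cnt) {
//     for (int i = 0; i < cnt; i++) {
//       if (src[i] < 0) return true;   // i.e. (src[i] & 0x80) != 0
//     }
//     return false;
//   }
//
// The fast loop above tests 16 bytes per iteration against 0x8080...80.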
#endif // COMPILER2

// Helpers for Intrinsic Emitters
//
// Reverse the byte order of a 32bit value in a register
// src: 0x44556677
// dst: 0x77665544
// Three steps to obtain the result:
//  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
//     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
//     This value initializes dst.
//  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
//     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
//     This value is mask inserted into dst with a [0..23] mask of 1s.
//  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
//     This value is mask inserted into dst with a [8..15] mask of 1s.
void MacroAssembler::load_reverse_32(Register dst, Register src) {
  assert_different_registers(dst, src);

  rldicl(dst, src, (4+1)*8, 56);     // Rotate byte 4 into position 7 (rightmost), clear all to the left.
  rlwimi(dst, src,     3*8,  0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
  rlwimi(dst, src,     1*8,  8, 15); // Insert byte 6 into position 5, leave the rest alone.
}
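// For illustration, the same 32-bit byte reversal in plain C (a sketch, not
// used by the VM; shown only to make the three rotate/insert steps concrete):
//
//   uint32_t reverse_bytes_32(uint32_t x) {     // 0x44556677 -> 0x77665544
//     return (x << 24) | ((x & 0xff00) << 8) |
//            ((x >> 8) & 0xff00) | (x >> 24);
//   }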
// Calculate the column addresses of the crc32 lookup table into distinct registers.
// This loop-invariant calculation is moved out of the loop body, reducing the loop
// body size from 20 to 16 instructions.
// Returns the offset that was used to calculate the address of column tc3.
// Due to register shortage, setting tc3 may overwrite table. With the return offset
// at hand, the original table address can be easily reconstructed.
int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {

#ifdef VM_LITTLE_ENDIAN
  // This is what we implement (the DOLIT4 part):
  // =========================================================================
  // #define DOLIT4 c ^= *buf4++; \
  //         c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
  //             crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
  // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
  // =========================================================================
  const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
  const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
  const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
  const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
#else
  // This is what we implement (the DOBIG4 part):
  // =========================================================================
  // #define DOBIG4 c ^= *++buf4; \
  //         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
  //             crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
  // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
  // =========================================================================
  const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
  const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
  const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
  const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
#endif
  assert_different_registers(table, tc0, tc1, tc2);
  assert(table == tc3, "must be!");

  addi(tc0, table, ix0);
  addi(tc1, table, ix1);
  addi(tc2, table, ix2);
  if (ix3 != 0) addi(tc3, table, ix3);

  return ix3;
}

/**
 * uint32_t crc;
 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
 */
void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
  assert_different_registers(crc, table, tmp);
  assert_different_registers(val, table);

  if (crc == val) {                  // Must rotate first to use the unmodified value.
    rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
                                     // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
    srwi(crc, crc, 8);               // Unsigned shift, clear leftmost 8 bits.
  } else {
    srwi(crc, crc, 8);               // Unsigned shift, clear leftmost 8 bits.
    rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
  }
  lwzx(tmp, table, tmp);
  xorr(crc, crc, tmp);
}

/**
 * uint32_t crc;
 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
 */
void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
  fold_byte_crc32(crc, crc, table, tmp);
}
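// For reference, the byte folding above is the standard table-driven CRC-32
// step; in C it would read roughly as follows (a sketch; crc_table stands for
// the 256-entry column used by this kernel):
//
//   uint32_t fold_byte(uint32_t crc, const uint32_t* crc_table) {
//     return crc_table[crc & 0xff] ^ (crc >> 8);
//   }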
/**
 * Emits code to update CRC-32 with a byte value according to constants in table.
 *
 * @param [in,out]crc Register containing the crc.
 * @param [in]val     Register containing the byte to fold into the CRC.
 * @param [in]table   Register containing the table of crc constants.
 *
 * uint32_t crc;
 * val = crc_table[(val ^ crc) & 0xFF];
 * crc = val ^ (crc >> 8);
 */
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  BLOCK_COMMENT("update_byte_crc32:");
  xorr(val, val, crc);
  fold_byte_crc32(crc, val, table, val);
}

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register pointing to CRC table
 */
void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
                                           Register data, bool loopAlignment) {
  assert_different_registers(crc, buf, len, table, data);

  Label L_mainLoop, L_done;
  const int mainLoop_stepping  = 1;
  const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;

  // Process all bytes in a single-byte loop.
  clrldi_(len, len, 32);                 // Enforce 32 bit. Anything to do?
  beq(CCR0, L_done);

  mtctr(len);
  align(mainLoop_alignment);
  BIND(L_mainLoop);
  lbz(data, 0, buf);                     // Byte from buffer, zero-extended.
  addi(buf, buf, mainLoop_stepping);     // Advance buffer position.
  update_byte_crc32(crc, data, table);
  bdnz(L_mainLoop);                      // Iterate.

  bind(L_done);
}

/**
 * Emits code to update CRC-32 with a 4-byte value according to constants in table.
 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
 */
// A note on the lookup table address(es):
// The lookup table consists of two sets of four columns each.
// The columns {0..3} are used for little-endian machines.
// The columns {4..7} are used for big-endian machines.
// To save the effort of adding the column offset to the table address each time
// a table element is looked up, it is possible to pass the pre-calculated
// column addresses.
// Uses R9..R12 as work registers. Must be saved/restored by caller, if necessary.
void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
                                        Register t0,  Register t1,  Register t2,  Register t3,
                                        Register tc0, Register tc1, Register tc2, Register tc3) {
  assert_different_registers(crc, t3);

  // XOR crc with next four bytes of buffer.
  lwz(t3, bufDisp, buf);
  if (bufInc != 0) {
    addi(buf, buf, bufInc);
  }
  xorr(t3, t3, crc);

  // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
  rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t3 >>  0) & 0xff) << 2
  rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t3 >>  8) & 0xff) << 2
  rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t3 >> 16) & 0xff) << 2
  rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t3 >> 24) & 0xff) << 2

  // Use the pre-calculated column addresses.
  // Load pre-calculated table values.
  lwzx(t0, tc0, t0);
  lwzx(t1, tc1, t1);
  lwzx(t2, tc2, t2);
  lwzx(t3, tc3, t3);

  // Calculate new crc from table values.
  xorr(t0,  t0, t1);
  xorr(t2,  t2, t3);
  xorr(crc, t0, t2);  // Now crc contains the final checksum value.
}
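// One iteration of the word-wise kernels below corresponds to zlib's DOLIT4
// step quoted above; in plain C this is roughly (a sketch; tab denotes the
// four little-endian table columns):
//
//   c ^= *buf4++;
//   c = tab[3][c & 0xff] ^ tab[2][(c >> 8) & 0xff] ^
//       tab[1][(c >> 16) & 0xff] ^ tab[0][c >> 24];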
/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register pointing to CRC table
 *
 * Uses R9..R12 as work registers. Must be saved/restored by caller!
 */
void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
                                        Register t0,  Register t1,  Register t2,  Register t3,
                                        Register tc0, Register tc1, Register tc2, Register tc3,
                                        bool invertCRC) {
  assert_different_registers(crc, buf, len, table);

  Label L_mainLoop, L_tail;
  Register  tmp  = t0;
  Register  data = t0;
  Register  tmp2 = t1;
  const int mainLoop_stepping  = 8;
  const int tailLoop_stepping  = 1;
  const int log_stepping       = exact_log2(mainLoop_stepping);
  const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
  const int complexThreshold   = 2*mainLoop_stepping;

  // Don't test for len <= 0 here. This pathological case should not occur anyway.
  // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
  // for all well-behaved cases. The situation itself is detected and handled correctly
  // within update_byteLoop_crc32.
  assert(tailLoop_stepping == 1, "check tailLoop_stepping!");

  BLOCK_COMMENT("kernel_crc32_2word {");

  if (invertCRC) {
    nand(crc, crc, crc);                      // 1s complement of crc
  }

  // Check for short (<mainLoop_stepping) buffer.
  cmpdi(CCR0, len, complexThreshold);
  blt(CCR0, L_tail);

  // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
  // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
  {
    // Align buf addr to mainLoop_stepping boundary.
    neg(tmp2, buf);                         // Calculate # preLoop iterations for alignment.
    rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 61..63.

    if (complexThreshold > mainLoop_stepping) {
      sub(len, len, tmp2);                  // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
    } else {
      sub(tmp, len, tmp2);                  // Remaining bytes for main loop.
      cmpdi(CCR0, tmp, mainLoop_stepping);
      blt(CCR0, L_tail);                    // For less than one mainloop_stepping left, do only tail processing
      mr(len, tmp);                         // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
    }
    update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
  }

  srdi(tmp2, len, log_stepping);            // #iterations for mainLoop
  andi(len, len, mainLoop_stepping-1);      // remaining bytes for tailLoop
  mtctr(tmp2);

#ifdef VM_LITTLE_ENDIAN
  Register crc_rv = crc;
#else
  Register crc_rv = tmp;                    // Load_reverse needs separate registers to work on.
                                            // Occupies tmp, but frees up crc.
  load_reverse_32(crc_rv, crc);             // Reverse byte order because we are dealing with big-endian data.
  tmp = crc;
#endif

  int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);

  align(mainLoop_alignment);                // Octoword-aligned loop address. Shows 2% improvement.
  BIND(L_mainLoop);
  update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
  update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
  bdnz(L_mainLoop);

#ifndef VM_LITTLE_ENDIAN
  load_reverse_32(crc, crc_rv);             // Reverse byte order because we are dealing with big-endian data.
  tmp = crc_rv;                             // Tmp uses its original register again.
#endif

  // Restore original table address for tailLoop.
  if (reconstructTableOffset != 0) {
    addi(table, table, -reconstructTableOffset);
  }

  // Process last few (<complexThreshold) bytes of buffer.
  BIND(L_tail);
  update_byteLoop_crc32(crc, buf, len, table, data, false);

  if (invertCRC) {
    nand(crc, crc, crc);                    // 1s complement of crc
  }
  BLOCK_COMMENT("} kernel_crc32_2word");
}

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register pointing to CRC table
 *
 * Uses R9..R12 as work registers. Must be saved/restored by caller!
 */
void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
                                        Register t0,  Register t1,  Register t2,  Register t3,
                                        Register tc0, Register tc1, Register tc2, Register tc3,
                                        bool invertCRC) {
  assert_different_registers(crc, buf, len, table);

  Label L_mainLoop, L_tail;
  Register  tmp  = t0;
  Register  data = t0;
  Register  tmp2 = t1;
  const int mainLoop_stepping  = 4;
  const int tailLoop_stepping  = 1;
  const int log_stepping       = exact_log2(mainLoop_stepping);
  const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
  const int complexThreshold   = 2*mainLoop_stepping;

  // Don't test for len <= 0 here. This pathological case should not occur anyway.
  // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
  // for all well-behaved cases. The situation itself is detected and handled correctly
  // within update_byteLoop_crc32.
  assert(tailLoop_stepping == 1, "check tailLoop_stepping!");

  BLOCK_COMMENT("kernel_crc32_1word {");

  if (invertCRC) {
    nand(crc, crc, crc);                      // 1s complement of crc
  }

  // Check for short (<mainLoop_stepping) buffer.
  cmpdi(CCR0, len, complexThreshold);
  blt(CCR0, L_tail);

  // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
  // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
  {
    // Align buf addr to mainLoop_stepping boundary.
    neg(tmp2, buf);                         // Calculate # preLoop iterations for alignment.
    rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.

    if (complexThreshold > mainLoop_stepping) {
      sub(len, len, tmp2);                  // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
    } else {
      sub(tmp, len, tmp2);                  // Remaining bytes for main loop.
      cmpdi(CCR0, tmp, mainLoop_stepping);
      blt(CCR0, L_tail);                    // For less than one mainloop_stepping left, do only tail processing
      mr(len, tmp);                         // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
    }
    update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
  }

  srdi(tmp2, len, log_stepping);            // #iterations for mainLoop
  andi(len, len, mainLoop_stepping-1);      // remaining bytes for tailLoop
  mtctr(tmp2);

#ifdef VM_LITTLE_ENDIAN
  Register crc_rv = crc;
#else
  Register crc_rv = tmp;                    // Load_reverse needs separate registers to work on.
                                            // Occupies tmp, but frees up crc.
  load_reverse_32(crc_rv, crc);             // Reverse byte order because we are dealing with big-endian data.
  tmp = crc;
#endif

  int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);

  align(mainLoop_alignment);                // Octoword-aligned loop address. Shows 2% improvement.
  BIND(L_mainLoop);
  update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
  bdnz(L_mainLoop);

#ifndef VM_LITTLE_ENDIAN
  load_reverse_32(crc, crc_rv);             // Reverse byte order because we are dealing with big-endian data.
  tmp = crc_rv;                             // Tmp uses its original register again.
#endif

  // Restore original table address for tailLoop.
  if (reconstructTableOffset != 0) {
    addi(table, table, -reconstructTableOffset);
  }

  // Process last few (<complexThreshold) bytes of buffer.
  BIND(L_tail);
  update_byteLoop_crc32(crc, buf, len, table, data, false);

  if (invertCRC) {
    nand(crc, crc, crc);                    // 1s complement of crc
  }
  BLOCK_COMMENT("} kernel_crc32_1word");
}

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register pointing to CRC table
 *
 * Uses R7_ARG5, R8_ARG6 as work registers.
 */
void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
                                        Register t0,  Register t1,  Register t2,  Register t3,
                                        bool invertCRC) {
  assert_different_registers(crc, buf, len, table);

  Register data = t0;                       // Holds the current byte to be folded into crc.

  BLOCK_COMMENT("kernel_crc32_1byte {");

  if (invertCRC) {
    nand(crc, crc, crc);                    // 1s complement of crc
  }

  // Process all bytes in a single-byte loop.
  update_byteLoop_crc32(crc, buf, len, table, data, true);

  if (invertCRC) {
    nand(crc, crc, crc);                    // 1s complement of crc
  }
  BLOCK_COMMENT("} kernel_crc32_1byte");
}

/**
 * @param crc             register containing existing CRC (32-bit)
 * @param buf             register pointing to input byte buffer (byte*)
 * @param len             register containing number of bytes
 * @param table           register pointing to CRC table
 * @param constants       register pointing to CRC table for 128-bit aligned memory
 * @param barretConstants register pointing to table for barrett reduction
 * @param t0              volatile register
 * @param t1              volatile register
 * @param t2              volatile register
 * @param t3              volatile register
 */
void MacroAssembler::kernel_crc32_1word_vpmsumd(Register crc, Register buf, Register len, Register table,
                                                Register constants, Register barretConstants,
                                                Register t0, Register t1, Register t2, Register t3, Register t4,
                                                bool invertCRC) {
  assert_different_registers(crc, buf, len, table);

  Label L_alignedHead, L_tail, L_alignTail, L_start, L_end;

  Register prealign  = t0;
  Register postalign = t0;

  BLOCK_COMMENT("kernel_crc32_1word_vpmsumd {");

  // 1. Use kernel_crc32_1word for buffers shorter than 384 bytes.
  clrldi(len, len, 32);
  cmpdi(CCR0, len, 384);
  bge(CCR0, L_start);

  Register tc0 = t4;
  Register tc1 = constants;
  Register tc2 = barretConstants;
  kernel_crc32_1word(crc, buf, len, table, t0, t1, t2, t3, tc0, tc1, tc2, table, invertCRC);
  b(L_end);

  BIND(L_start);

  // 2. ~c
  if (invertCRC) {
    nand(crc, crc, crc);                    // 1s complement of crc
  }

  // 3. Calculate from 0 to the first 128bit-aligned address.
  clrldi_(prealign, buf, 57);
  beq(CCR0, L_alignedHead);

  subfic(prealign, prealign, 128);

  subf(len, prealign, len);
  update_byteLoop_crc32(crc, buf, prealign, table, t2, false);

  // 4. Calculate from the first to the last 128bit-aligned address.
  BIND(L_alignedHead);

  clrldi(postalign, len, 57);
  subf(len, postalign, len);

  // len must be more than 256 bits
  kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t1, t2, t3);

  // 5. Calculate the remaining bytes.
  cmpdi(CCR0, postalign, 0);
  beq(CCR0, L_tail);

  update_byteLoop_crc32(crc, buf, postalign, table, t2, false);

  BIND(L_tail);

  // 6. ~c
  if (invertCRC) {
    nand(crc, crc, crc);                    // 1s complement of crc
  }

  BIND(L_end);

  BLOCK_COMMENT("} kernel_crc32_1word_vpmsumd");
}

/**
 * @param crc             register containing existing CRC (32-bit)
 * @param buf             register pointing to input byte buffer (byte*)
 * @param len             register containing number of bytes
 * @param constants       register pointing to CRC table for 128-bit aligned memory
 * @param barretConstants register pointing to table for barrett reduction
 * @param t0              volatile register
 * @param t1              volatile register
 * @param t2              volatile register
 */
void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
    Register constants, Register barretConstants, Register t0, Register t1, Register t2) {
  Label L_mainLoop, L_tail, L_alignTail, L_barrett_reduction, L_end, L_first_warm_up_done, L_first_cool_down, L_second_cool_down, L_XOR, L_test;
  Label L_lv0, L_lv1, L_lv2, L_lv3, L_lv4, L_lv5, L_lv6, L_lv7, L_lv8, L_lv9, L_lv10, L_lv11, L_lv12, L_lv13, L_lv14, L_lv15;
  Label L_1, L_2, L_3, L_4;

  Register rLoaded      = t0;
  Register rTmp1        = t1;
  Register rTmp2        = t2;
  Register off16        = R22;
  Register off32        = R23;
  Register off48        = R24;
  Register off64        = R25;
  Register off80        = R26;
  Register off96        = R27;
  Register off112       = R28;
  Register rIdx         = R29;
  Register rMax         = R30;
  Register constantsPos = R31;

  VectorRegister mask_32bit = VR24;
  VectorRegister mask_64bit = VR25;
  VectorRegister zeroes     = VR26;
  VectorRegister const1     = VR27;
  VectorRegister const2     = VR28;

  // Save non-volatile vector registers (frameless).
  Register offset = t1;
  int offsetInt = 0;
  offsetInt -= 16; li(offset, -16); stvx(VR20, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); stvx(VR21, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); stvx(VR22, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); stvx(VR23, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); stvx(VR24, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); stvx(VR25, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); stvx(VR26, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); stvx(VR27, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); stvx(VR28, offset, R1_SP);
  offsetInt -= 8; std(R22, offsetInt, R1_SP);
  offsetInt -= 8; std(R23, offsetInt, R1_SP);
  offsetInt -= 8; std(R24, offsetInt, R1_SP);
  offsetInt -= 8; std(R25, offsetInt, R1_SP);
  offsetInt -= 8; std(R26, offsetInt, R1_SP);
  offsetInt -= 8; std(R27, offsetInt, R1_SP);
  offsetInt -= 8; std(R28, offsetInt, R1_SP);
  offsetInt -= 8; std(R29, offsetInt, R1_SP);
  offsetInt -= 8; std(R30, offsetInt, R1_SP);
  offsetInt -= 8; std(R31, offsetInt, R1_SP);

  // Set constants
  li(off16, 16);
  li(off32, 32);
  li(off48, 48);
  li(off64, 64);
  li(off80, 80);
  li(off96, 96);
  li(off112, 112);

  clrldi(crc, crc, 32);

  vxor(zeroes, zeroes, zeroes);
  vspltisw(VR0, -1);

  vsldoi(mask_32bit, zeroes, VR0, 4);
  vsldoi(mask_64bit, zeroes, VR0, 8);

  // Get the initial value into VR8.
  vxor(VR8, VR8, VR8);
  mtvrd(VR8, crc);
  vsldoi(VR8, zeroes, VR8, 8); // shift into bottom 32 bits

  li(rLoaded, 0);

  rldicr(rIdx, len, 0, 56);

  {
    BIND(L_1);
  {
    BIND(L_1);
    // Checksum in blocks of MAX_SIZE (32768).
    lis(rMax, 0);
    ori(rMax, rMax, 32768);
    mr(rTmp2, rMax);                // rTmp2 = MAX_SIZE
    cmpd(CCR0, rIdx, rMax);
    bgt(CCR0, L_2);
    mr(rMax, rIdx);

    BIND(L_2);
    subf(rIdx, rMax, rIdx);

    // Our main loop does 128 bytes at a time.
    srdi(rMax, rMax, 7);

    /*
     * Work out the offset into the constants table to start at. Each
     * constant is 16 bytes, and it is used against 128 bytes of input
     * data: 128 / 16 = 8.
     */
    sldi(rTmp1, rMax, 4);
    srdi(rTmp2, rTmp2, 3);
    subf(rTmp1, rTmp1, rTmp2);

    // We reduce our final 128 bytes in a separate step.
    addi(rMax, rMax, -1);
    mtctr(rMax);

    // Find the start of our constants.
    add(constantsPos, constants, rTmp1);

    // Zero VR0-VR7, which will contain our checksums.
    vxor(VR0, VR0, VR0);
    vxor(VR1, VR1, VR1);
    vxor(VR2, VR2, VR2);
    vxor(VR3, VR3, VR3);
    vxor(VR4, VR4, VR4);
    vxor(VR5, VR5, VR5);
    vxor(VR6, VR6, VR6);
    vxor(VR7, VR7, VR7);

    lvx(const1, constantsPos);

    /*
     * If we are looping back to consume more data we use the values
     * already in VR16-VR23.
     */
    cmpdi(CCR0, rLoaded, 1);
    beq(CCR0, L_3);
    {
      // First warm up pass.
      lvx(VR16, buf);
      lvx(VR17, off16, buf);
      lvx(VR18, off32, buf);
      lvx(VR19, off48, buf);
      lvx(VR20, off64, buf);
      lvx(VR21, off80, buf);
      lvx(VR22, off96, buf);
      lvx(VR23, off112, buf);
      addi(buf, buf, 8 * 16);

      // xor in initial value
      vxor(VR16, VR16, VR8);
    }

    BIND(L_3);
    bdz(L_first_warm_up_done);

    addi(constantsPos, constantsPos, 16);
    lvx(const2, constantsPos);

    // Second warm up pass.
    vpmsumd(VR8, VR16, const1);
    lvx(VR16, buf);

    vpmsumd(VR9, VR17, const1);
    lvx(VR17, off16, buf);

    vpmsumd(VR10, VR18, const1);
    lvx(VR18, off32, buf);

    vpmsumd(VR11, VR19, const1);
    lvx(VR19, off48, buf);

    vpmsumd(VR12, VR20, const1);
    lvx(VR20, off64, buf);

    vpmsumd(VR13, VR21, const1);
    lvx(VR21, off80, buf);

    vpmsumd(VR14, VR22, const1);
    lvx(VR22, off96, buf);

    vpmsumd(VR15, VR23, const1);
    lvx(VR23, off112, buf);

    addi(buf, buf, 8 * 16);

    bdz(L_first_cool_down);

    /*
     * Main loop. We modulo schedule it such that it takes three iterations
     * to complete - first iteration load, second iteration vpmsum, third
     * iteration xor.
     */
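    /*
     * Illustrative pipeline sketch (comment only). With eight independent
     * accumulators, iteration i overlaps three stages:
     *
     *   load  chunk[i]                       // lvx
     *   mul   chunk[i-1] * constant          // vpmsumd
     *   xor   product of chunk[i-2] into acc // vxor
     *
     * which is why the loop body below xors the previous vpmsumd results
     * (VR8-VR15) while issuing new multiplies and loads.
     */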
    {
      BIND(L_4);
      lvx(const1, constantsPos); addi(constantsPos, constantsPos, 16);

      vxor(VR0, VR0, VR8);
      vpmsumd(VR8, VR16, const2);
      lvx(VR16, buf);

      vxor(VR1, VR1, VR9);
      vpmsumd(VR9, VR17, const2);
      lvx(VR17, off16, buf);

      vxor(VR2, VR2, VR10);
      vpmsumd(VR10, VR18, const2);
      lvx(VR18, off32, buf);

      vxor(VR3, VR3, VR11);
      vpmsumd(VR11, VR19, const2);
      lvx(VR19, off48, buf);
      lvx(const2, constantsPos);

      vxor(VR4, VR4, VR12);
      vpmsumd(VR12, VR20, const1);
      lvx(VR20, off64, buf);

      vxor(VR5, VR5, VR13);
      vpmsumd(VR13, VR21, const1);
      lvx(VR21, off80, buf);

      vxor(VR6, VR6, VR14);
      vpmsumd(VR14, VR22, const1);
      lvx(VR22, off96, buf);

      vxor(VR7, VR7, VR15);
      vpmsumd(VR15, VR23, const1);
      lvx(VR23, off112, buf);

      addi(buf, buf, 8 * 16);

      bdnz(L_4);
    }

    BIND(L_first_cool_down);

    // First cool down pass.
    lvx(const1, constantsPos);
    addi(constantsPos, constantsPos, 16);

    vxor(VR0, VR0, VR8);
    vpmsumd(VR8, VR16, const1);

    vxor(VR1, VR1, VR9);
    vpmsumd(VR9, VR17, const1);

    vxor(VR2, VR2, VR10);
    vpmsumd(VR10, VR18, const1);

    vxor(VR3, VR3, VR11);
    vpmsumd(VR11, VR19, const1);

    vxor(VR4, VR4, VR12);
    vpmsumd(VR12, VR20, const1);

    vxor(VR5, VR5, VR13);
    vpmsumd(VR13, VR21, const1);

    vxor(VR6, VR6, VR14);
    vpmsumd(VR14, VR22, const1);

    vxor(VR7, VR7, VR15);
    vpmsumd(VR15, VR23, const1);

    BIND(L_second_cool_down);
    // Second cool down pass.
    vxor(VR0, VR0, VR8);
    vxor(VR1, VR1, VR9);
    vxor(VR2, VR2, VR10);
    vxor(VR3, VR3, VR11);
    vxor(VR4, VR4, VR12);
    vxor(VR5, VR5, VR13);
    vxor(VR6, VR6, VR14);
    vxor(VR7, VR7, VR15);

    /*
     * vpmsumd produces a 96 bit result in the least significant bits
     * of the register. Since we are bit reflected we have to shift it
     * left 32 bits so it occupies the least significant bits in the
     * bit reflected domain.
     */
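    /*
     * Illustrative note (comment only): with a zero vector as the second
     * operand, vsldoi(Vd, Va, zeroes, 4) selects 16 bytes starting at byte 4
     * of the concatenation Va:zeroes, i.e. it shifts Va left by 4 bytes
     * (32 bits):
     *
     *   Va = [ b0 b1 b2 b3 | b4 ... b15 ]
     *   Vd = [ b4 ...  b15 | 00 00 00 00 ]
     */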
    vsldoi(VR0, VR0, zeroes, 4);
    vsldoi(VR1, VR1, zeroes, 4);
    vsldoi(VR2, VR2, zeroes, 4);
    vsldoi(VR3, VR3, zeroes, 4);
    vsldoi(VR4, VR4, zeroes, 4);
    vsldoi(VR5, VR5, zeroes, 4);
    vsldoi(VR6, VR6, zeroes, 4);
    vsldoi(VR7, VR7, zeroes, 4);

    // xor with the last 1024 bits
    lvx(VR8, buf);
    lvx(VR9, off16, buf);
    lvx(VR10, off32, buf);
    lvx(VR11, off48, buf);
    lvx(VR12, off64, buf);
    lvx(VR13, off80, buf);
    lvx(VR14, off96, buf);
    lvx(VR15, off112, buf);
    addi(buf, buf, 8 * 16);

    vxor(VR16, VR0, VR8);
    vxor(VR17, VR1, VR9);
    vxor(VR18, VR2, VR10);
    vxor(VR19, VR3, VR11);
    vxor(VR20, VR4, VR12);
    vxor(VR21, VR5, VR13);
    vxor(VR22, VR6, VR14);
    vxor(VR23, VR7, VR15);

    li(rLoaded, 1);
    cmpdi(CCR0, rIdx, 0);
    addi(rIdx, rIdx, 128);
    bne(CCR0, L_1);
  }

  // Work out how many bytes we have left.
  andi_(len, len, 127);

  // Calculate where in the constant table we need to start.
  subfic(rTmp1, len, 128);
  add(constantsPos, constantsPos, rTmp1);

  // How many 16-byte chunks are in the tail.
  srdi(rIdx, len, 4);
  mtctr(rIdx);

  /*
   * Reduce the previously calculated 1024 bits to 64 bits, shifting
   * 32 bits to include the trailing 32 bits of zeros.
   */
  lvx(VR0, constantsPos);
  lvx(VR1, off16, constantsPos);
  lvx(VR2, off32, constantsPos);
  lvx(VR3, off48, constantsPos);
  lvx(VR4, off64, constantsPos);
  lvx(VR5, off80, constantsPos);
  lvx(VR6, off96, constantsPos);
  lvx(VR7, off112, constantsPos);
  addi(constantsPos, constantsPos, 8 * 16);

  vpmsumw(VR0, VR16, VR0);
  vpmsumw(VR1, VR17, VR1);
  vpmsumw(VR2, VR18, VR2);
  vpmsumw(VR3, VR19, VR3);
  vpmsumw(VR4, VR20, VR4);
  vpmsumw(VR5, VR21, VR5);
  vpmsumw(VR6, VR22, VR6);
  vpmsumw(VR7, VR23, VR7);

  // Now reduce the tail (0 - 112 bytes). CTR holds the chunk count.
  cmpdi(CCR0, rIdx, 0);
  beq(CCR0, L_XOR);

  lvx(VR16, buf); addi(buf, buf, 16);
  lvx(VR17, constantsPos);
  vpmsumw(VR16, VR16, VR17);
  vxor(VR0, VR0, VR16);
  bdz(L_XOR);        // no more chunks

  lvx(VR16, buf); addi(buf, buf, 16);
  lvx(VR17, off16, constantsPos);
  vpmsumw(VR16, VR16, VR17);
  vxor(VR0, VR0, VR16);
  bdz(L_XOR);

  lvx(VR16, buf); addi(buf, buf, 16);
  lvx(VR17, off32, constantsPos);
  vpmsumw(VR16, VR16, VR17);
  vxor(VR0, VR0, VR16);
  bdz(L_XOR);

  lvx(VR16, buf); addi(buf, buf, 16);
  lvx(VR17, off48, constantsPos);
  vpmsumw(VR16, VR16, VR17);
  vxor(VR0, VR0, VR16);
  bdz(L_XOR);

  lvx(VR16, buf); addi(buf, buf, 16);
  lvx(VR17, off64, constantsPos);
  vpmsumw(VR16, VR16, VR17);
  vxor(VR0, VR0, VR16);
  bdz(L_XOR);

  lvx(VR16, buf); addi(buf, buf, 16);
  lvx(VR17, off80, constantsPos);
  vpmsumw(VR16, VR16, VR17);
  vxor(VR0, VR0, VR16);
  bdz(L_XOR);

  lvx(VR16, buf); addi(buf, buf, 16);
  lvx(VR17, off96, constantsPos);
  vpmsumw(VR16, VR16, VR17);
  vxor(VR0, VR0, VR16);

  // Now xor all the parallel chunks together.
  BIND(L_XOR);
  vxor(VR0, VR0, VR1);
  vxor(VR2, VR2, VR3);
  vxor(VR4, VR4, VR5);
  vxor(VR6, VR6, VR7);

  vxor(VR0, VR0, VR2);
  vxor(VR4, VR4, VR6);

  vxor(VR0, VR0, VR4);

  b(L_barrett_reduction);
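  /*
   * Illustrative sketch (comment only) of the reflected Barrett reduction
   * performed at L_barrett_reduction below. clmul denotes a carry-less
   * (GF(2)) multiply; const1/const2 are the two constants loaded from
   * barretConstants; M32 = 0xffffffff:
   *
   *   a  = (fold of the two 64-bit halves) << 1, masked to 64 bits
   *   q  = clmul(a & M32, const1) & M32;   // approximate quotient
   *   a ^= clmul(q, const2);               // a - q*n; subtraction is xor in GF(2)
   *   crc = result word of a;              // bit reflected, sits in the high word
   */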
  BIND(L_first_warm_up_done);
  lvx(const1, constantsPos);
  addi(constantsPos, constantsPos, 16);
  vpmsumd(VR8,  VR16, const1);
  vpmsumd(VR9,  VR17, const1);
  vpmsumd(VR10, VR18, const1);
  vpmsumd(VR11, VR19, const1);
  vpmsumd(VR12, VR20, const1);
  vpmsumd(VR13, VR21, const1);
  vpmsumd(VR14, VR22, const1);
  vpmsumd(VR15, VR23, const1);
  b(L_second_cool_down);

  BIND(L_barrett_reduction);

  lvx(const1, barretConstants);
  addi(barretConstants, barretConstants, 16);
  lvx(const2, barretConstants);

  vsldoi(VR1, VR0, VR0, 8);
  vxor(VR0, VR0, VR1);    // xor two 64 bit results together

  // shift left one bit
  vspltisb(VR1, 1);
  vsl(VR0, VR0, VR1);

  vand(VR0, VR0, mask_64bit);

  /*
   * The reflected version of Barrett reduction. Instead of bit
   * reflecting our data (which is expensive to do), we bit reflect our
   * constants and our algorithm, which means the intermediate data in
   * our vector registers goes from 0-63 instead of 63-0. We can reflect
   * the algorithm because we don't carry in mod 2 arithmetic.
   */
  vand(VR1, VR0, mask_32bit);  // bottom 32 bits of a
  vpmsumd(VR1, VR1, const1);   // ma
  vand(VR1, VR1, mask_32bit);  // bottom 32 bits of ma
  vpmsumd(VR1, VR1, const2);   // qn
  vxor(VR0, VR0, VR1);         // a - qn, subtraction is xor in GF(2)

  /*
   * Since we are bit reflected, the result (ie the low 32 bits) is in
   * the high 32 bits. We just need to shift it left 4 bytes.
   * V0 [ 0 1 X 3 ]
   * V0 [ 0 X 2 3 ]
   */
  vsldoi(VR0, VR0, zeroes, 4); // shift result into top 64 bits

  // Get the result into crc.
  mfvrd(crc, VR0);

  BIND(L_end);

  offsetInt = 0;
  // Restore non-volatile vector registers (frameless).
  offsetInt -= 16; li(offset, -16); lvx(VR20, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); lvx(VR21, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); lvx(VR22, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); lvx(VR23, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); lvx(VR24, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); lvx(VR25, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); lvx(VR26, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); lvx(VR27, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); lvx(VR28, offset, R1_SP);
  offsetInt -= 8; ld(R22, offsetInt, R1_SP);
  offsetInt -= 8; ld(R23, offsetInt, R1_SP);
  offsetInt -= 8; ld(R24, offsetInt, R1_SP);
  offsetInt -= 8; ld(R25, offsetInt, R1_SP);
  offsetInt -= 8; ld(R26, offsetInt, R1_SP);
  offsetInt -= 8; ld(R27, offsetInt, R1_SP);
  offsetInt -= 8; ld(R28, offsetInt, R1_SP);
  offsetInt -= 8; ld(R29, offsetInt, R1_SP);
  offsetInt -= 8; ld(R30, offsetInt, R1_SP);
  offsetInt -= 8; ld(R31, offsetInt, R1_SP);
}
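// Illustrative sketch (comment only) of the byte-at-a-time update performed
// by update_byte_crc32, which the two single-byte kernels below wrap; this
// is the standard reflected CRC-32 table step, shown as hypothetical C:
//
//   uint32_t update_byte_crc32(uint32_t crc, uint8_t b, const uint32_t* table) {
//     return (crc >> 8) ^ table[(crc ^ b) & 0xff];
//   }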
void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp, bool invertCRC) {
  assert_different_registers(crc, buf, /* len, not used!! */ table, tmp);

  BLOCK_COMMENT("kernel_crc32_singleByte:");
  if (invertCRC) {
    nand(crc, crc, crc);                      // 1s complement of crc
  }

  lbz(tmp, 0, buf);                           // Byte from buffer, zero-extended.
  update_byte_crc32(crc, tmp, table);

  if (invertCRC) {
    nand(crc, crc, crc);                      // 1s complement of crc
  }
}

void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
  assert_different_registers(crc, val, table);

  BLOCK_COMMENT("kernel_crc32_singleByteReg:");
  if (invertCRC) {
    nand(crc, crc, crc);                      // 1s complement of crc
  }

  update_byte_crc32(crc, val, table);

  if (invertCRC) {
    nand(crc, crc, crc);                      // 1s complement of crc
  }
}

// dest_lo += src1 + src2
// dest_hi += carry1 + carry2
void MacroAssembler::add2_with_carry(Register dest_hi,
                                     Register dest_lo,
                                     Register src1, Register src2) {
  li(R0, 0);
  addc(dest_lo, dest_lo, src1);
  adde(dest_hi, dest_hi, R0);
  addc(dest_lo, dest_lo, src2);
  adde(dest_hi, dest_hi, R0);
}

// Multiply 64 bit by 64 bit first loop.
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
                                           Register x_xstart,
                                           Register y, Register y_idx,
                                           Register z,
                                           Register carry,
                                           Register product_high, Register product,
                                           Register idx, Register kdx,
                                           Register tmp) {
  // jlong carry, x[], y[], z[];
  // for (int idx = ystart, kdx = ystart + 1 + xstart; idx >= 0; idx--, kdx--) {
  //   huge_128 product = y[idx] * x[xstart] + carry;
  //   z[kdx] = (jlong)product;
  //   carry  = (jlong)(product >>> 64);
  // }
  // z[xstart] = carry;

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  addic_(xstart, xstart, -1);
  blt(CCR0, L_one_x);   // Special case: length of x is 1.

  // Load next two integers of x.
  sldi(tmp, xstart, LogBytesPerInt);
  ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
  // Swap the two 32-bit halves so int-array order matches the 64-bit value.
  rldicl(x_xstart, x_xstart, 32, 0);
#endif

  align(32, 16);
  bind(L_first_loop);

  cmpdi(CCR0, idx, 1);
  blt(CCR0, L_first_loop_exit);
  addi(idx, idx, -2);
  beq(CCR0, L_one_y);

  // Load next two integers of y.
  sldi(tmp, idx, LogBytesPerInt);
  ldx(y_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(y_idx, y_idx, 32, 0);
#endif

  bind(L_multiply);
  multiply64(product_high, product, x_xstart, y_idx);

  li(tmp, 0);
  addc(product, product, carry);         // Add carry to result.
  adde(product_high, product_high, tmp); // Add carry of the last addition.
  addi(kdx, kdx, -2);

  // Store result.
#ifdef VM_LITTLE_ENDIAN
  rldicl(product, product, 32, 0);
#endif
  sldi(tmp, kdx, LogBytesPerInt);
  stdx(product, z, tmp);
  mr_if_needed(carry, product_high);
  b(L_first_loop);

  bind(L_one_y); // Load one 32 bit portion of y as (0,value).
  lwz(y_idx, 0, y);
  b(L_multiply);

  bind(L_one_x); // Load one 32 bit portion of x as (0,value).
  lwz(x_xstart, 0, x);
  b(L_first_loop);

  bind(L_first_loop_exit);
}
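// Illustrative sketch (comment only) of one step of the loop above, written
// with a hypothetical 128-bit integer type (as provided by GCC); the
// multiply64/addc/adde sequence implements this in pairs of 64-bit registers:
//
//   unsigned __int128 product = (unsigned __int128)x_xstart * y_idx + carry;
//   z[kdx] = (uint64_t)product;           // low 64 bits
//   carry  = (uint64_t)(product >> 64);   // high 64 bits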
// Multiply 64 bit by 64 bit and add 128 bit.
void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
                                            Register z, Register yz_idx,
                                            Register idx, Register carry,
                                            Register product_high, Register product,
                                            Register tmp, int offset) {
  // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
  // z[kdx] = (jlong)product;

  sldi(tmp, idx, LogBytesPerInt);
  if (offset) {
    addi(tmp, tmp, offset);
  }
  ldx(yz_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(yz_idx, yz_idx, 32, 0);
#endif

  multiply64(product_high, product, x_xstart, yz_idx);
  ldx(yz_idx, z, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(yz_idx, yz_idx, 32, 0);
#endif

  add2_with_carry(product_high, product, carry, yz_idx);

  sldi(tmp, idx, LogBytesPerInt);
  if (offset) {
    addi(tmp, tmp, offset);
  }
#ifdef VM_LITTLE_ENDIAN
  rldicl(product, product, 32, 0);
#endif
  stdx(product, z, tmp);
}

// Multiply 128 bit by 128 bit. Unrolled inner loop.
void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
                                             Register y, Register z,
                                             Register yz_idx, Register idx, Register carry,
                                             Register product_high, Register product,
                                             Register carry2, Register tmp) {
  // jlong carry, x[], y[], z[];
  // int kdx = ystart + 1;
  // for (int idx = ystart - 2; idx >= 0; idx -= 2) { // Third loop
  //   huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
  //   z[kdx+idx+1] = (jlong)product;
  //   jlong carry2 = (jlong)(product >>> 64);
  //   product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
  //   z[kdx+idx] = (jlong)product;
  //   carry = (jlong)(product >>> 64);
  // }
  // idx += 2;
  // if (idx > 0) {
  //   product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
  //   z[kdx+idx] = (jlong)product;
  //   carry = (jlong)(product >>> 64);
  // }

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
  const Register jdx = R0;

  // Scale the index.
  srdi_(jdx, idx, 2);
  beq(CCR0, L_third_loop_exit);
  mtctr(jdx);

  align(32, 16);
  bind(L_third_loop);

  addi(idx, idx, -4);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
  mr_if_needed(carry2, product_high);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
  mr_if_needed(carry, product_high);
  bdnz(L_third_loop);
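  // Carry flow in the unrolled body above (comment only): each
  // multiply_add_128_x_128 leaves the upper 64 bits of its 128-bit sum in
  // product_high, which feeds the next limb as its incoming carry:
  //
  //   carry -> [limb idx+1] -> carry2 -> [limb idx] -> carry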
  bind(L_third_loop_exit); // Handle any left-over operand parts.

  andi_(idx, idx, 0x3);
  beq(CCR0, L_post_third_loop_done);

  Label L_check_1;

  addic_(idx, idx, -2);
  blt(CCR0, L_check_1);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
  mr_if_needed(carry, product_high);

  bind(L_check_1);

  addi(idx, idx, 0x2);
  andi_(idx, idx, 0x1);
  addic_(idx, idx, -1);
  blt(CCR0, L_post_third_loop_done);

  sldi(tmp, idx, LogBytesPerInt);
  lwzx(yz_idx, y, tmp);
  multiply64(product_high, product, x_xstart, yz_idx);
  lwzx(yz_idx, z, tmp);

  add2_with_carry(product_high, product, yz_idx, carry);

  sldi(tmp, idx, LogBytesPerInt);
  stwx(product, z, tmp);
  srdi(product, product, 32);

  sldi(product_high, product_high, 32);
  orr(product, product, product_high);
  mr_if_needed(carry, product);

  bind(L_post_third_loop_done);
} // multiply_128_x_128_loop

void MacroAssembler::muladd(Register out, Register in,
                            Register offset, Register len, Register k,
                            Register tmp1, Register tmp2, Register carry) {
  // Labels
  Label LOOP, SKIP;

  // Make sure length is positive.
  cmpdi(CCR0, len, 0);

  // Prepare variables
  subi(offset, offset, 4);
  li(carry, 0);
  ble(CCR0, SKIP);

  mtctr(len);
  subi(len, len, 1);
  sldi(len, len, 2);

  // Main loop
  bind(LOOP);
  lwzx(tmp1, len, in);
  lwzx(tmp2, offset, out);
  mulld(tmp1, tmp1, k);
  add(tmp2, carry, tmp2);
  add(tmp2, tmp1, tmp2);
  stwx(tmp2, offset, out);
  srdi(carry, tmp2, 32);
  subi(offset, offset, 4);
  subi(len, len, 4);
  bdnz(LOOP);
  bind(SKIP);
}
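// Illustrative sketch (comment only) of muladd's contract, mirroring Java's
// BigInteger.mulAdd; hypothetical C with arrays of 32-bit limbs in big-endian
// limb order, as the assembly above walks both arrays downwards:
//
//   uint32_t muladd(uint32_t* out, const uint32_t* in,
//                   int offs, int len, uint32_t k) {
//     uint64_t carry = 0;
//     for (int j = len - 1; j >= 0; j--, offs--) {
//       uint64_t product = (uint64_t)in[j] * k + out[offs] + carry;
//       out[offs] = (uint32_t)product;
//       carry = product >> 32;
//     }
//     return (uint32_t)carry;
//   }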
void MacroAssembler::multiply_to_len(Register x, Register xlen,
                                     Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2,
                                     Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6,
                                     Register tmp7, Register tmp8,
                                     Register tmp9, Register tmp10,
                                     Register tmp11, Register tmp12,
                                     Register tmp13) {

  ShortBranchVerifier sbv(this);

  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);

  const Register idx          = tmp1;
  const Register kdx          = tmp2;
  const Register xstart       = tmp3;

  const Register y_idx        = tmp4;
  const Register carry        = tmp5;
  const Register product      = tmp6;
  const Register product_high = tmp7;
  const Register x_xstart     = tmp8;
  const Register tmp          = tmp9;

  // First Loop.
  //
  // final static long LONG_MASK = 0xffffffffL;
  // int xstart = xlen - 1;
  // int ystart = ylen - 1;
  // long carry = 0;
  // for (int idx = ystart, kdx = ystart + 1 + xstart; idx >= 0; idx--, kdx--) {
  //   long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //   z[kdx] = (int)product;
  //   carry = product >>> 32;
  // }
  // z[xstart] = (int)carry;

  mr_if_needed(idx, ylen);  // idx = ylen
  mr_if_needed(kdx, zlen);  // kdx = xlen + ylen
  li(carry, 0);             // carry = 0

  Label L_done;

  addic_(xstart, xlen, -1);
  blt(CCR0, L_done);

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
                        carry, product_high, product, idx, kdx, tmp);

  Label L_second_loop;

  cmpdi(CCR0, kdx, 0);
  beq(CCR0, L_second_loop);

  Label L_carry;

  addic_(kdx, kdx, -1);
  beq(CCR0, L_carry);

  // Store lower 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);
  srdi(carry, carry, 32);
  addi(kdx, kdx, -1);

  bind(L_carry);

  // Store upper 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);

  // Second and third (nested) loops.
  //
  // for (int i = xstart - 1; i >= 0; i--) { // Second loop
  //   carry = 0;
  //   for (int jdx = ystart, k = ystart + 1 + i; jdx >= 0; jdx--, k--) { // Third loop
  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                    (z[k] & LONG_MASK) + carry;
  //     z[k] = (int)product;
  //     carry = product >>> 32;
  //   }
  //   z[i] = (int)carry;
  // }
  //
  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = x_xstart

  bind(L_second_loop);

  li(carry, 0);               // carry = 0;

  addic_(xstart, xstart, -1); // i = xstart-1;
  blt(CCR0, L_done);

  Register zsave = tmp10;

  mr(zsave, z);

  Label L_last_x;

  sldi(tmp, xstart, LogBytesPerInt);
  add(z, z, tmp);             // z = z + k - j
  addi(z, z, 4);
  addic_(xstart, xstart, -1); // i = xstart-1;
  blt(CCR0, L_last_x);

  sldi(tmp, xstart, LogBytesPerInt);
  ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(x_xstart, x_xstart, 32, 0);
#endif

  Label L_third_loop_prologue;

  bind(L_third_loop_prologue);

  Register xsave    = tmp11;
  Register xlensave = tmp12;
  Register ylensave = tmp13;

  // Save x, xstart and ylen: multiply_128_x_128_loop uses x as a scratch
  // register (carry2) and decrements ylen (passed in as idx).
  mr(xsave, x);
  mr(xlensave, xstart);
  mr(ylensave, ylen);

  multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
                          carry, product_high, product, x, tmp);

  mr(z, zsave);
  mr(x, xsave);
  mr(xlen, xlensave);   // This is the decrement of the loop counter!
  mr(ylen, ylensave);

  addi(tmp3, xlen, 1);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);
  addic_(tmp3, tmp3, -1);
  blt(CCR0, L_done);

  srdi(carry, carry, 32);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);
  b(L_second_loop);
  // Next infrequent code is moved outside loops.
  bind(L_last_x);

  lwz(x_xstart, 0, x);
  b(L_third_loop_prologue);

  bind(L_done);
} // multiply_to_len

void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
#ifdef ASSERT
  Label ok;
  if (check_equal) {
    beq(CCR0, ok);
  } else {
    bne(CCR0, ok);
  }
  stop(msg, id);
  bind(ok);
#endif
}

void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
                                          Register mem_base, const char* msg, int id) {
#ifdef ASSERT
  switch (size) {
    case 4:
      lwz(R0, mem_offset, mem_base);
      cmpwi(CCR0, R0, 0);
      break;
    case 8:
      ld(R0, mem_offset, mem_base);
      cmpdi(CCR0, R0, 0);
      break;
    default:
      ShouldNotReachHere();
  }
  asm_assert(check_equal, msg, id);
#endif // ASSERT
}

void MacroAssembler::verify_thread() {
  if (VerifyThread) {
    unimplemented("'VerifyThread' currently not implemented on PPC");
  }
}

// READ: oop. KILL: R0. Volatile floats perhaps.
void MacroAssembler::verify_oop(Register oop, const char* msg) {
  if (!VerifyOops) {
    return;
  }

  address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
  const Register tmp = R11; // Will be preserved.
  const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
  save_volatile_gprs(R1_SP, -nbytes_save); // except R0

  mr_if_needed(R4_ARG2, oop);
  save_LR_CR(tmp); // save in old frame
  push_frame_reg_args(nbytes_save, tmp);
  // load FunctionDescriptor** / entry_address *
  load_const_optimized(tmp, fd, R0);
  // load FunctionDescriptor* / entry_address
  ld(tmp, 0, tmp);
  load_const_optimized(R3_ARG1, (address)msg, R0);
  // Call destination for its side effect.
  call_c(tmp);

  pop_frame();
  restore_LR_CR(tmp);
  restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
}

void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
  if (!VerifyOops) {
    return;
  }

  address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
  const Register tmp = R11; // Will be preserved.
  const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
  save_volatile_gprs(R1_SP, -nbytes_save); // except R0

  ld(R4_ARG2, offs, base);
  save_LR_CR(tmp); // save in old frame
  push_frame_reg_args(nbytes_save, tmp);
  // load FunctionDescriptor** / entry_address *
  load_const_optimized(tmp, fd, R0);
  // load FunctionDescriptor* / entry_address
  ld(tmp, 0, tmp);
  load_const_optimized(R3_ARG1, (address)msg, R0);
  // Call destination for its side effect.
  call_c(tmp);

  pop_frame();
  restore_LR_CR(tmp);
  restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
}

const char* stop_types[] = {
  "stop",
  "untested",
  "unimplemented",
  "shouldnotreachhere"
};

static void stop_on_request(int tp, const char* msg) {
  tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp % /*stop_end*/ 4], msg);
  guarantee(false, "PPC assembly code requires stop: %s", msg);
}
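// Illustrative note (comment only): stop() below embeds its 'id' operand in
// the instruction stream directly after the trapping instruction,
//
//   illtrap     <- faults here; the handler's PC points at this word
//   .long id    <- presumably readable as *(pc + 4) to identify the stop site
//
// so each stop site stays identifiable without spending a register on the id.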
// Call a C-function that prints output.
void MacroAssembler::stop(int type, const char* msg, int id) {
#ifndef PRODUCT
  block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));
#else
  block_comment("stop {");
#endif

  // setup arguments
  load_const_optimized(R3_ARG1, type);
  load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
  illtrap();
  emit_int32(id);
  block_comment("} stop;");
}

#ifndef PRODUCT
// Write pattern 0x0101010101010101 in memory region [low-before, high+after].
// Val, addr are temp registers.
// If low == addr, addr is killed.
// High is preserved.
void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
  if (!ZapMemory) return;

  assert_different_registers(low, val);

  BLOCK_COMMENT("zap memory region {");
  load_const_optimized(val, 0x0101010101010101);
  int size = before + after;
  if (low == high && size < 5 && size > 0) {
    int offset = -before * BytesPerWord;
    for (int i = 0; i < size; ++i) {
      std(val, offset, low);
      offset += (1*BytesPerWord);
    }
  } else {
    addi(addr, low, -before * BytesPerWord);
    assert_different_registers(high, val);
    if (after) addi(high, high, after * BytesPerWord);
    Label loop;
    bind(loop);
    std(val, 0, addr);
    addi(addr, addr, 8);
    cmpd(CCR6, addr, high);
    ble(CCR6, loop);
    if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value.
  }
  BLOCK_COMMENT("} zap memory region");
}
#endif // !PRODUCT

SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
  int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
  assert(sizeof(bool) == 1, "PowerPC ABI");
  masm->lbz(temp, simm16_offset, temp);
  masm->cmpwi(CCR0, temp, 0);
  masm->beq(CCR0, _label);
}

SkipIfEqualZero::~SkipIfEqualZero() {
  _masm->bind(_label);
}