/*
 * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018, SAP SE. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "compiler/disassembler.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/cardTableBarrierSet.hpp"
#include "gc/shared/collectedHeap.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_ppc.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/os.hpp"
#include "runtime/safepoint.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/macros.hpp"
#if INCLUDE_ALL_GCS
#include "gc/g1/g1BarrierSet.hpp"
#include "gc/g1/g1CardTable.hpp"
#include "gc/g1/heapRegion.hpp"
#endif // INCLUDE_ALL_GCS
#ifdef COMPILER2
#include "opto/intrinsicnode.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) // nothing
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef ASSERT
// On RISC, there's no benefit to verifying instruction boundaries.
bool AbstractAssembler::pd_check_instruction_mark() { return false; }
#endif

void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  if (Assembler::is_simm(si31, 16)) {
    ld(d, si31, a);
    if (emit_filler_nop) nop();
  } else {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
    addis(d, a, hi);
    ld(d, lo, d);
  }
}

void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  assert_different_registers(d, a);
  ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
}
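// Illustrative example, not emitted verbatim anywhere: assuming the usual
// round-to-nearest split implemented by largeoffset_si16_si16_hi/_lo, an
// offset si31 = 0x1234A678 decomposes into hi = 0x1235 and lo = -0x5988,
// so ld_largeoffset_unchecked emits
//   addis d, a, 0x1235
//   ld    d, -0x5988(d)
// with hi chosen such that lo always fits the signed 16-bit displacement.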
void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                      size_t size_in_bytes, bool is_signed) {
  switch (size_in_bytes) {
  case 8: ld(dst, offs, base); break;
  case 4: is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  case 2: is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  case 1: lbz(dst, offs, base); if (is_signed) extsb(dst, dst); break; // lba doesn't exist :(
  default: ShouldNotReachHere();
  }
}

void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                       size_t size_in_bytes) {
  switch (size_in_bytes) {
  case 8: std(dst, offs, base); break;
  case 4: stw(dst, offs, base); break;
  case 2: sth(dst, offs, base); break;
  case 1: stb(dst, offs, base); break;
  default: ShouldNotReachHere();
  }
}

void MacroAssembler::align(int modulus, int max, int rem) {
  int padding = (rem + modulus - (offset() % modulus)) % modulus;
  if (padding > max) return;
  for (int c = (padding >> 2); c > 0; --c) { nop(); }
}

// Issue instructions that calculate given TOC from global TOC.
void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
                                                       bool add_relocation, bool emit_dummy_addr) {
  int offset = -1;
  if (emit_dummy_addr) {
    offset = -128; // dummy address
  } else if (addr != (address)(intptr_t)-1) {
    offset = MacroAssembler::offset_to_global_toc(addr);
  }

  if (hi16) {
    addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
  }
  if (lo16) {
    if (add_relocation) {
      // Relocate at the addi to avoid confusion with a load from the method's TOC.
      relocate(internal_word_Relocation::spec(addr));
    }
    addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
  }
}

address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
  const int offset = MacroAssembler::offset_to_global_toc(addr);

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
  set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
  set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
  return inst1_addr;
}
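// The routine above patches, and the routine below reads back, the
// two-instruction pattern emitted by calculate_address_from_global_toc:
//   addis dst, R29_TOC, offset_hi
//   addi  dst, dst, offset_lo
// The relocation is attached to the addi; the matching addis is located by
// scanning backwards for the instruction that writes dst.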
address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // stop, found the addis which writes dst
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");

  int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
  // -1 is a special case
  if (offset == -1) {
    return (address)(intptr_t)-1;
  } else {
    return global_toc() + offset;
  }
}

#ifdef _LP64
// Patch compressed oops or klass constants.
// Assembler sequence is
// 1) compressed oops:
//    lis  rx = const.hi
//    ori  rx = rx | const.lo
// 2) compressed klass:
//    lis  rx = const.hi
//    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
//    ori  rx = rx | const.lo
// The clrldi, if present, is skipped over when searching for the lis.
address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;
  while (inst1_addr >= bound) {
    inst1 = *(int *)inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  int xc = (data >> 16) & 0xffff;
  int xd = (data >>  0) & 0xffff;

  set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
  set_imm((int *)inst2_addr, (xd));        // unsigned int
  return inst1_addr;
}

// Get compressed oop or klass constant.
narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;

  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
  uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);

  return (int) (xl | xh);
}
#endif // _LP64
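// load_const_from_method_toc (below) materializes the constant via
// ld_largeoffset_unchecked, so the emitted shape is either a single
//   ld dst, toc_offset(toc)
// or, when the offset does not fit into a signed 16-bit displacement,
//   addis dst, toc, toc_offset_hi
//   ld    dst, toc_offset_lo(dst)
// This is why is_load_const_from_method_toc_at accepts an ld as well as an
// addis with a nonzero RA field.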
// Returns true if successful.
bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
                                                Register toc, bool fixed_size) {
  int toc_offset = 0;
  // Use RelocationHolder::none for the constant pool entry, otherwise
  // we will end up with a failing NativeCall::verify(x) where x is
  // the address of the constant pool entry.
  // FIXME: We should insert relocation information for oops at the constant
  // pool entries instead of inserting it at the loads; patching of a constant
  // pool entry should be less expensive.
  address const_address = address_constant((address)a.value(), RelocationHolder::none);
  if (const_address == NULL) { return false; } // allocation failure
  // Relocate at the pc of the load.
  relocate(a.rspec());
  toc_offset = (int)(const_address - code()->consts()->start());
  ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
  return true;
}

bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  // The relocation points to the ld or the addis.
  return (is_ld(inst1)) ||
         (is_addis(inst1) && inv_ra_field(inst1) != 0);
}

int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
  assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");

  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  if (is_ld(inst1)) {
    return inv_d1_field(inst1);
  } else if (is_addis(inst1)) {
    const int dst = inv_rt_field(inst1);

    // Now, find the succeeding ld which reads and writes to dst.
    address inst2_addr = inst1_addr + BytesPerInstWord;
    int inst2 = 0;
    while (true) {
      inst2 = *(int *) inst2_addr;
      if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
        // Stop, found the ld which reads and writes dst.
        break;
      }
      inst2_addr += BytesPerInstWord;
    }
    return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
  }
  ShouldNotReachHere();
  return 0;
}

// Get the constant from a `load_const' sequence.
long MacroAssembler::get_const(address a) {
  assert(is_load_const_at(a), "not a load of a constant");
  const int *p = (const int*) a;
  unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
  if (is_ori(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
  } else if (is_lis(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
  } else {
    ShouldNotReachHere();
    return (long) 0;
  }
  return (long) x;
}
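// Note on the layouts decoded by get_const and patched by patch_const below:
// the two `load_const' shapes are distinguished by the second instruction.
// In the ori form the four 16-bit immediates sit in instruction slots
// 0, 1, 3 and 4 (slot 2 is the 32-bit shift, which carries no immediate);
// in the lis form they sit in slots 0, 2, 1 and 3. The slot-to-bit-position
// mapping is exactly the one spelled out by the shifts in get_const.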
// Patch the 64 bit constant of a `load_const' sequence. This is a low
// level procedure. It neither flushes the instruction cache nor is it
// mt safe.
void MacroAssembler::patch_const(address a, long x) {
  assert(is_load_const_at(a), "not a load of a constant");
  int *p = (int*) a;
  if (is_ori(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(1 + p, (x >> 32) & 0xffff);
    set_imm(3 + p, (x >> 16) & 0xffff);
    set_imm(4 + p, x & 0xffff);
  } else if (is_lis(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(2 + p, (x >> 32) & 0xffff);
    set_imm(1 + p, (x >> 16) & 0xffff);
    set_imm(3 + p, x & 0xffff);
  } else {
    ShouldNotReachHere();
  }
}

AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->allocate_metadata_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->allocate_oop_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp, int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0) {
    return RegisterOrConstant(value + offset);
  }

  // Load indirectly to solve generation ordering problem.
  // static address, no relocation
  int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
  ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)

  if (offset != 0) {
    addi(tmp, tmp, offset);
  }

  return RegisterOrConstant(tmp);
}

#ifndef PRODUCT
void MacroAssembler::pd_print_patched_instruction(address branch) {
  Unimplemented(); // TODO: PPC port
}
#endif // ndef PRODUCT
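// In the bc_far machinery below, `boint' is the branch's BO field (condition
// semantics plus static prediction hint) and `biint' its BI field (the
// condition-register bit to test), as defined by the Power ISA bc encoding.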
// Conditional far branch for destinations encodable in 24+2 bits.
void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {

  // If requested by flag optimize, relocate the bc_far as a
  // runtime_call and prepare for optimizing it when the code gets
  // relocated.
  if (optimize == bc_far_optimize_on_relocate) {
    relocate(relocInfo::runtime_call_type);
  }

  // variant 2:
  //
  //    b!cxx SKIP
  //    bxx   DEST
  //  SKIP:
  //

  const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                opposite_bcond(inv_boint_bcond(boint)));

  // We emit two branches.
  // First, a conditional branch which jumps around the far branch.
  const address not_taken_pc = pc() + 2 * BytesPerInstWord;
  const address bc_pc = pc();
  bc(opposite_boint, biint, not_taken_pc);

  const int bc_instr = *(int*)bc_pc;
  assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
  assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
  assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
                                     opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
         "postcondition");
  assert(biint == inv_bi_field(bc_instr), "postcondition");

  // Second, an unconditional far branch which jumps to dest.
  // Note: target(dest) remembers the current pc (see CodeSection::target)
  // and returns the current pc if the label is not bound yet; when
  // the label gets bound, the unconditional far branch will be patched.
  const address target_pc = target(dest);
  const address b_pc = pc();
  b(target_pc);

  assert(not_taken_pc == pc(),                 "postcondition");
  assert(dest.is_bound() || target_pc == b_pc, "postcondition");
}

// 1 or 2 instructions
void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
  if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
    bc(boint, biint, dest);
  } else {
    bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
  }
}

bool MacroAssembler::is_bc_far_at(address instruction_addr) {
  return is_bc_far_variant1_at(instruction_addr) ||
         is_bc_far_variant2_at(instruction_addr) ||
         is_bc_far_variant3_at(instruction_addr);
}

address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
  if (is_bc_far_variant1_at(instruction_addr)) {
    const address instruction_1_addr = instruction_addr;
    const int instruction_1 = *(int*)instruction_1_addr;
    return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
  } else if (is_bc_far_variant2_at(instruction_addr)) {
    const address instruction_2_addr = instruction_addr + 4;
    return bxx_destination(instruction_2_addr);
  } else if (is_bc_far_variant3_at(instruction_addr)) {
    return instruction_addr + 8;
  }
  // variant 4 ???
  ShouldNotReachHere();
  return NULL;
}
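// Summary of the bc_far variants handled below:
//   variant 1:  bcxx  DEST      (destination in conditional-branch range)
//               nop
//   variant 2:  b!cxx SKIP      (jump around an unconditional far branch)
//               bxx   DEST
//             SKIP:
//   variant 3:  nop             (branch to the next instruction,
//               endgroup         already patched out)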
void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {

  if (is_bc_far_variant3_at(instruction_addr)) {
    // variant 3, far cond branch to the next instruction, already patched to nops:
    //
    //    nop
    //    endgroup
    //  SKIP/DEST:
    //
    return;
  }

  // first, extract boint and biint from the current branch
  int boint = 0;
  int biint = 0;

  ResourceMark rm;
  const int code_size = 2 * BytesPerInstWord;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
    // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
    masm.nop();
    masm.endgroup();
  } else {
    if (is_bc_far_variant1_at(instruction_addr)) {
      // variant 1, the 1st instruction contains the destination address:
      //
      //    bcxx  DEST
      //    nop
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = inv_bo_field(instruction_1);
      biint = inv_bi_field(instruction_1);
    } else if (is_bc_far_variant2_at(instruction_addr)) {
      // variant 2, the 2nd instruction contains the destination address:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
                                 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
      biint = inv_bi_field(instruction_1);
    } else {
      // variant 4???
      ShouldNotReachHere();
    }

    // second, set the new branch destination and optimize the code
    if (dest != instruction_addr + 4 && // the bc_far is still unbound!
        masm.is_within_range_of_bcxx(dest, instruction_addr)) {
      // variant 1:
      //
      //    bcxx  DEST
      //    nop
      //
      masm.bc(boint, biint, dest);
      masm.nop();
    } else {
      // variant 2:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                    opposite_bcond(inv_boint_bcond(boint)));
      const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
      masm.bc(opposite_boint, biint, not_taken_pc);
      masm.b(dest);
    }
  }
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Emit a NOT mt-safe patchable 64 bit absolute call/jump.
void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
  // get current pc
  uint64_t start_pc = (uint64_t) pc();

  const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
  const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first

  // relocate here
  if (rt != relocInfo::none) {
    relocate(rt);
  }

  if ( ReoptimizeCallSequences &&
       (( link && is_within_range_of_b(dest, pc_of_bl)) ||
        (!link && is_within_range_of_b(dest, pc_of_b)))) {
    // variant 2:
    // Emit an optimized, pc-relative call/jump.

    if (link) {
      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();

      // do the call
      assert(pc() == pc_of_bl, "just checking");
      bl(dest, relocInfo::none);
    } else {
      // do the jump
      assert(pc() == pc_of_b, "just checking");
      b(dest, relocInfo::none);

      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();
    }

    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
           "can't identify emitted call");
  } else {
    // variant 1:
    mr(R0, R11);  // spill R11 -> R0.

    // Load the destination address into CTR,
    // calculate destination relative to global toc.
    calculate_address_from_global_toc(R11, dest, true, true, false);

    mtctr(R11);
    mr(R11, R0);  // spill R11 <- R0.
    nop();

    // do the call/jump
    if (link) {
      bctrl();
    } else {
      bctr();
    }
    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
           "can't identify emitted call");
  }

  // Assert that we can identify the emitted call/jump.
  assert(is_bxx64_patchable_at((address)start_pc, link),
         "can't identify emitted call");
  assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
         "wrong encoding of dest address");
}
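// Layouts recognized by the predicates below (7 instruction slots each):
//   variant 1b (destination loaded relative to the global TOC):
//     mr R0, R11; addis R11, R29_TOC, hi; addi R11, R11, lo;
//     mtctr R11; mr R11, R0; nop; bctr[l]
//   variant 2 (pc-relative, chosen when ReoptimizeCallSequences applies):
//     link:  six nops followed by bl DEST (bl last, fixed return pc)
//     !link: b DEST followed by six nops  (b first)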
// Identify a bxx64_patchable instruction.
bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
  return is_bxx64_patchable_variant1b_at(instruction_addr, link)
    //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
      || is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Does the call64_patchable instruction use a pc-relative encoding of
// the call destination?
bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
  // variant 2 is pc-relative
  return is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Identify variant 1.
bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[5]) // mtctr
      && is_load_const_at(instruction_addr);
}

// Identify variant 1b: load destination relative to global toc.
bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[3]) // mtctr
      && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
}

// Identify variant 2.
bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  if (link) {
    return is_bl (instr[6])  // bl dest is last
        && is_nop(instr[0])  // nop
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5]); // nop
  } else {
    return is_b  (instr[0])  // b dest is first
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5])  // nop
        && is_nop(instr[6]); // nop
  }
}

// Set dest address of a bxx64_patchable instruction.
void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
  ResourceMark rm;
  int code_size = MacroAssembler::bxx64_patchable_size;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  masm.bxx64_patchable(dest, relocInfo::none, link);
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}
// Get dest address of a bxx64_patchable instruction.
address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
  if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
    return (address) (unsigned long) get_const(instruction_addr);
  } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
    unsigned int* instr = (unsigned int*) instruction_addr;
    if (link) {
      const int instr_idx = 6; // bl is last
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    } else {
      const int instr_idx = 0; // b is first
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    }
  // Load dest relative to global toc.
  } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
    return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
                                                               instruction_addr);
  } else {
    ShouldNotReachHere();
    return NULL;
  }
}

// Uses ordering which corresponds to ABI:
//    _savegpr0_14:  std  r14,-144(r1)
//    _savegpr0_15:  std  r15,-136(r1)
//    _savegpr0_16:  std  r16,-128(r1)
void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
  std(R14, offset, dst);   offset += 8;
  std(R15, offset, dst);   offset += 8;
  std(R16, offset, dst);   offset += 8;
  std(R17, offset, dst);   offset += 8;
  std(R18, offset, dst);   offset += 8;
  std(R19, offset, dst);   offset += 8;
  std(R20, offset, dst);   offset += 8;
  std(R21, offset, dst);   offset += 8;
  std(R22, offset, dst);   offset += 8;
  std(R23, offset, dst);   offset += 8;
  std(R24, offset, dst);   offset += 8;
  std(R25, offset, dst);   offset += 8;
  std(R26, offset, dst);   offset += 8;
  std(R27, offset, dst);   offset += 8;
  std(R28, offset, dst);   offset += 8;
  std(R29, offset, dst);   offset += 8;
  std(R30, offset, dst);   offset += 8;
  std(R31, offset, dst);   offset += 8;

  stfd(F14, offset, dst);  offset += 8;
  stfd(F15, offset, dst);  offset += 8;
  stfd(F16, offset, dst);  offset += 8;
  stfd(F17, offset, dst);  offset += 8;
  stfd(F18, offset, dst);  offset += 8;
  stfd(F19, offset, dst);  offset += 8;
  stfd(F20, offset, dst);  offset += 8;
  stfd(F21, offset, dst);  offset += 8;
  stfd(F22, offset, dst);  offset += 8;
  stfd(F23, offset, dst);  offset += 8;
  stfd(F24, offset, dst);  offset += 8;
  stfd(F25, offset, dst);  offset += 8;
  stfd(F26, offset, dst);  offset += 8;
  stfd(F27, offset, dst);  offset += 8;
  stfd(F28, offset, dst);  offset += 8;
  stfd(F29, offset, dst);  offset += 8;
  stfd(F30, offset, dst);  offset += 8;
  stfd(F31, offset, dst);
}
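// save_nonvolatile_gprs above and its counterpart below mirror each other
// and follow the ABI save order (_savegpr0_14/_restgpr0_14 and friends):
// 18 GPRs (R14..R31) followed by 18 FPRs (F14..F31), i.e. 36 consecutive
// 8-byte slots or 288 bytes starting at `offset'.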
// Uses ordering which corresponds to ABI:
//    _restgpr0_14:  ld   r14,-144(r1)
//    _restgpr0_15:  ld   r15,-136(r1)
//    _restgpr0_16:  ld   r16,-128(r1)
void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
  ld(R14, offset, src);   offset += 8;
  ld(R15, offset, src);   offset += 8;
  ld(R16, offset, src);   offset += 8;
  ld(R17, offset, src);   offset += 8;
  ld(R18, offset, src);   offset += 8;
  ld(R19, offset, src);   offset += 8;
  ld(R20, offset, src);   offset += 8;
  ld(R21, offset, src);   offset += 8;
  ld(R22, offset, src);   offset += 8;
  ld(R23, offset, src);   offset += 8;
  ld(R24, offset, src);   offset += 8;
  ld(R25, offset, src);   offset += 8;
  ld(R26, offset, src);   offset += 8;
  ld(R27, offset, src);   offset += 8;
  ld(R28, offset, src);   offset += 8;
  ld(R29, offset, src);   offset += 8;
  ld(R30, offset, src);   offset += 8;
  ld(R31, offset, src);   offset += 8;

  // FP registers
  lfd(F14, offset, src);  offset += 8;
  lfd(F15, offset, src);  offset += 8;
  lfd(F16, offset, src);  offset += 8;
  lfd(F17, offset, src);  offset += 8;
  lfd(F18, offset, src);  offset += 8;
  lfd(F19, offset, src);  offset += 8;
  lfd(F20, offset, src);  offset += 8;
  lfd(F21, offset, src);  offset += 8;
  lfd(F22, offset, src);  offset += 8;
  lfd(F23, offset, src);  offset += 8;
  lfd(F24, offset, src);  offset += 8;
  lfd(F25, offset, src);  offset += 8;
  lfd(F26, offset, src);  offset += 8;
  lfd(F27, offset, src);  offset += 8;
  lfd(F28, offset, src);  offset += 8;
  lfd(F29, offset, src);  offset += 8;
  lfd(F30, offset, src);  offset += 8;
  lfd(F31, offset, src);
}

// For verify_oops.
void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
  std(R2,  offset, dst);  offset += 8;
  std(R3,  offset, dst);  offset += 8;
  std(R4,  offset, dst);  offset += 8;
  std(R5,  offset, dst);  offset += 8;
  std(R6,  offset, dst);  offset += 8;
  std(R7,  offset, dst);  offset += 8;
  std(R8,  offset, dst);  offset += 8;
  std(R9,  offset, dst);  offset += 8;
  std(R10, offset, dst);  offset += 8;
  std(R11, offset, dst);  offset += 8;
  std(R12, offset, dst);  offset += 8;

  stfd(F0,  offset, dst); offset += 8;
  stfd(F1,  offset, dst); offset += 8;
  stfd(F2,  offset, dst); offset += 8;
  stfd(F3,  offset, dst); offset += 8;
  stfd(F4,  offset, dst); offset += 8;
  stfd(F5,  offset, dst); offset += 8;
  stfd(F6,  offset, dst); offset += 8;
  stfd(F7,  offset, dst); offset += 8;
  stfd(F8,  offset, dst); offset += 8;
  stfd(F9,  offset, dst); offset += 8;
  stfd(F10, offset, dst); offset += 8;
  stfd(F11, offset, dst); offset += 8;
  stfd(F12, offset, dst); offset += 8;
  stfd(F13, offset, dst);
}

// For verify_oops.
void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
  ld(R2,  offset, src);   offset += 8;
  ld(R3,  offset, src);   offset += 8;
  ld(R4,  offset, src);   offset += 8;
  ld(R5,  offset, src);   offset += 8;
  ld(R6,  offset, src);   offset += 8;
  ld(R7,  offset, src);   offset += 8;
  ld(R8,  offset, src);   offset += 8;
  ld(R9,  offset, src);   offset += 8;
  ld(R10, offset, src);   offset += 8;
  ld(R11, offset, src);   offset += 8;
  ld(R12, offset, src);   offset += 8;

  lfd(F0,  offset, src);  offset += 8;
  lfd(F1,  offset, src);  offset += 8;
  lfd(F2,  offset, src);  offset += 8;
  lfd(F3,  offset, src);  offset += 8;
  lfd(F4,  offset, src);  offset += 8;
  lfd(F5,  offset, src);  offset += 8;
  lfd(F6,  offset, src);  offset += 8;
  lfd(F7,  offset, src);  offset += 8;
  lfd(F8,  offset, src);  offset += 8;
  lfd(F9,  offset, src);  offset += 8;
  lfd(F10, offset, src);  offset += 8;
  lfd(F11, offset, src);  offset += 8;
  lfd(F12, offset, src);  offset += 8;
  lfd(F13, offset, src);
}

void MacroAssembler::save_LR_CR(Register tmp) {
  mfcr(tmp);
  std(tmp, _abi(cr), R1_SP);
  mflr(tmp);
  std(tmp, _abi(lr), R1_SP);
  // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
}
void MacroAssembler::restore_LR_CR(Register tmp) {
  assert(tmp != R1_SP, "must be distinct");
  ld(tmp, _abi(lr), R1_SP);
  mtlr(tmp);
  ld(tmp, _abi(cr), R1_SP);
  mtcr(tmp);
}

address MacroAssembler::get_PC_trash_LR(Register result) {
  Label L;
  bl(L);
  bind(L);
  address lr_pc = pc();
  mflr(result);
  return lr_pc;
}

void MacroAssembler::resize_frame(Register offset, Register tmp) {
#ifdef ASSERT
  assert_different_registers(offset, tmp, R1_SP);
  andi_(tmp, offset, frame::alignment_in_bytes-1);
  asm_assert_eq("resize_frame: unaligned", 0x204);
#endif

  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdux(tmp, R1_SP, offset);
}

void MacroAssembler::resize_frame(int offset, Register tmp) {
  assert(is_simm(offset, 16), "too big an offset");
  assert_different_registers(tmp, R1_SP);
  assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdu(tmp, offset, R1_SP);
}

void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
  // (addr == tmp1) || (addr == tmp2) is allowed here!
  assert(tmp1 != tmp2, "must be distinct");

  // compute offset w.r.t. current stack pointer
  // tmp_1 <- addr - SP (!)
  subf(tmp1, R1_SP, addr);

  // atomically update SP keeping back link.
  resize_frame(tmp1/* offset */, tmp2/* tmp */);
}

void MacroAssembler::push_frame(Register bytes, Register tmp) {
#ifdef ASSERT
  assert(bytes != R0, "r0 not allowed here");
  andi_(R0, bytes, frame::alignment_in_bytes-1);
  asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
#endif
  neg(tmp, bytes);
  stdux(R1_SP, R1_SP, tmp);
}

// Push a frame of size `bytes'.
void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
  long offset = align_addr(bytes, frame::alignment_in_bytes);
  if (is_simm(-offset, 16)) {
    stdu(R1_SP, -offset, R1_SP);
  } else {
    load_const_optimized(tmp, -offset);
    stdux(R1_SP, R1_SP, tmp);
  }
}

// Push a frame of size `bytes' plus abi_reg_args on top.
void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size, tmp);
}

// Set up a new C frame with a spill area for non-volatile GPRs and
// additional space for local variables.
void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
                                                      Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
}

// Pop current C frame.
void MacroAssembler::pop_frame() {
  ld(R1_SP, _abi(callers_sp), R1_SP);
}
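// A minimal usage sketch (illustrative only): to carve out 96 bytes of
// scratch space around some inline code,
//   push_frame(96, tmp);
//   ...
//   pop_frame();
// push_frame and resize_frame rely on stdu/stdux so that storing the back
// link and updating R1_SP happen in a single instruction, which keeps the
// stack walkable at every instruction boundary.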
#if defined(ABI_ELFv2)
address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the time.
  if (R12 != r_function_entry) {
    mr(R12, r_function_entry);
  }
  mtctr(R12);
  // Do a call or a branch.
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C
// calling conventions. Updates and returns _last_calls_return_pc.
address MacroAssembler::call_c(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/true);
}

// For tail calls: only branch, don't link, so callee returns to caller of this function.
address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/false);
}

address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
  load_const(R12, function_entry, R0);
  return branch_to(R12, /*and_link=*/true);
}

#else
// Generic version of a call to C function via a function descriptor
// with variable support for C calling conventions (TOC, ENV, etc.).
// Updates and returns _last_calls_return_pc.
address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
                                  bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
  // we emit standard ptrgl glue code here
  assert((function_descriptor != R0), "function_descriptor cannot be R0");

  // retrieve necessary entries from the function descriptor
  ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
  mtctr(R0);

  if (load_toc_of_callee) {
    ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
  }
  if (load_env_of_callee) {
    ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
  } else if (load_toc_of_callee) {
    li(R11, 0);
  }

  // do a call or a branch
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C calling
// conventions.
// We don't use the TOC in generated code, so there is no need to save
// and restore its value.
address MacroAssembler::call_c(Register fd) {
  return branch_to(fd, /*and_link=*/true,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c_and_return_to_caller(Register fd) {
  return branch_to(fd, /*and_link=*/false,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}
address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
  if (rt != relocInfo::none) {
    // this call needs to be relocatable
    if (!ReoptimizeCallSequences
        || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
        || fd == NULL   // support code-size estimation
        || !fd->is_friend_function()
        || fd->entry() == NULL) {
      // it's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);

      bool has_env = (fd != NULL && fd->env() != NULL);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/has_env);
    } else {
      // It's a friend function. Load the entry point and don't care about
      // toc and env. Use an optimizable call instruction, but ensure the
      // same code-size as in the case of a non-friend function.
      nop();
      nop();
      nop();
      bl64_patchable(fd->entry(), rt);
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  } else {
    // This call does not need to be relocatable, do more aggressive
    // optimizations.
    if (!ReoptimizeCallSequences
        || !fd->is_friend_function()) {
      // It's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/true);
    } else {
      // it's a friend function, load the entry point and don't care about
      // toc and env.
      address dest = fd->entry();
      if (is_within_range_of_b(dest, pc())) {
        bl(dest);
      } else {
        bl64_patchable(dest, rt);
      }
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  }
}

// Call a C function. All constants needed reside in TOC.
//
// Read the address to call from the TOC.
// Read env from TOC, if fd specifies an env.
// Read new TOC from TOC.
address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
                                         relocInfo::relocType rt, Register toc) {
  if (!ReoptimizeCallSequences
      || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
      || !fd->is_friend_function()) {
    // It's not a friend function as defined by class FunctionDescriptor,
    // so do a full call-c here.
    assert(fd->entry() != NULL, "function must be linked");

    AddressLiteral fd_entry(fd->entry());
    bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
    mtctr(R11);
    if (fd->env() == NULL) {
      li(R11, 0);
      nop();
    } else {
      AddressLiteral fd_env(fd->env());
      success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
    }
    AddressLiteral fd_toc(fd->toc());
    // Set R2_TOC (load from toc)
    success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
    bctrl();
    _last_calls_return_pc = pc();
    if (!success) { return NULL; }
  } else {
    // It's a friend function, load the entry point and don't care about
    // toc and env. Use an optimizable call instruction, but ensure the
    // same code-size as in the case of a non-friend function.
    nop();
    bl64_patchable(fd->entry(), rt);
    _last_calls_return_pc = pc();
  }
  return _last_calls_return_pc;
}
#endif // ABI_ELFv2
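// call_VM_base below implements the usual VM-call protocol: record the last
// Java frame, pass the current thread in R3_ARG1, call the entry point with
// C calling conventions, then reset the frame and fetch a potential oop
// result from the thread state.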
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register last_java_sp,
                                  address entry_point,
                                  bool check_exceptions) {
  BLOCK_COMMENT("call_VM {");
  // Determine last_java_sp register.
  if (!last_java_sp->is_valid()) {
    last_java_sp = R1_SP;
  }
  set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);

  // ARG1 must hold thread address.
  mr(R3_ARG1, R16_thread);
#if defined(ABI_ELFv2)
  address return_pc = call_c(entry_point, relocInfo::none);
#else
  address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
#endif

  reset_last_Java_frame();

  // Check for pending exceptions.
  if (check_exceptions) {
    // We don't check for exceptions here.
    ShouldNotReachHere();
  }

  // Get oop result if there is one and reset the value in the thread.
  if (oop_result->is_valid()) {
    get_vm_result(oop_result);
  }

  _last_calls_return_pc = return_pc;
  BLOCK_COMMENT("} call_VM");
}

void MacroAssembler::call_VM_leaf_base(address entry_point) {
  BLOCK_COMMENT("call_VM_leaf {");
#if defined(ABI_ELFv2)
  call_c(entry_point, relocInfo::none);
#else
  call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
#endif
  BLOCK_COMMENT("} call_VM_leaf");
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
  call_VM_base(oop_result, noreg, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread.
  mr_if_needed(R4_ARG2, arg_1);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  mr_if_needed(R6_ARG4, arg_3);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM_leaf(address entry_point) {
  call_VM_leaf_base(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
  mr_if_needed(R3_ARG1, arg_1);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_3);
  call_VM_leaf(entry_point);
}
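// A safepoint poll emitted by load_from_polling_page is a plain
//   ld R0, 0(Rpoll)
// i.e. destination r0 with displacement 0; those two properties are what
// the matcher below keys on. Only when a ucontext is available can the
// actual polling address be recovered and verified.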
// Check whether instruction is a read access to the polling page
// which was emitted by load_from_polling_page(..).
bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
                                               address* polling_address_ptr) {
  if (!is_ld(instruction))
    return false; // It's not a ld. Fail.

  int rt = inv_rt_field(instruction);
  int ra = inv_ra_field(instruction);
  int ds = inv_ds_field(instruction);
  if (!(ds == 0 && ra != 0 && rt == 0)) {
    return false; // It's not a ld(r0, X, ra). Fail.
  }

  if (!ucontext) {
    // Set polling address.
    if (polling_address_ptr != NULL) {
      *polling_address_ptr = NULL;
    }
    return true; // No ucontext given. Can't check value of ra. Assume true.
  }

#ifdef LINUX
  // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
  ucontext_t* uc = (ucontext_t*) ucontext;
  // Set polling address.
  address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
  if (polling_address_ptr != NULL) {
    *polling_address_ptr = addr;
  }
  return os::is_poll_address(addr);
#else
  // Not on Linux, ucontext must be NULL.
  ShouldNotReachHere();
  return false;
#endif
}

bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;

  if (is_stwx(instruction) || is_stwux(instruction)) {
    int ra = inv_ra_field(instruction);
    int rb = inv_rb_field(instruction);

    // look up content of ra and rb in ucontext
    address ra_val = (address)uc->uc_mcontext.regs->gpr[ra];
    long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    return os::is_memory_serialize_page(thread, ra_val + rb_val);
  } else if (is_stw(instruction) || is_stwu(instruction)) {
    int ra = inv_ra_field(instruction);
    int d1 = inv_d1_field(instruction);

    // look up content of ra in ucontext
    address ra_val = (address)uc->uc_mcontext.regs->gpr[ra];
    return os::is_memory_serialize_page(thread, ra_val + d1);
  } else {
    return false;
  }
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return false;
#endif
}

void MacroAssembler::bang_stack_with_offset(int offset) {
  // When increasing the stack, the old stack pointer will be written
  // to the new top of stack according to the PPC64 abi.
  // Therefore, stack banging is not necessary when increasing
  // the stack by <= os::vm_page_size() bytes.
  // When increasing the stack by a larger amount, this method is
  // called repeatedly to bang the intermediate pages.

  // Stack grows down, caller passes positive offset.
  assert(offset > 0, "must bang with positive offset");

  long stdoffset = -offset;

  if (is_simm(stdoffset, 16)) {
    // Signed 16 bit offset, a simple std is ok.
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, (int)(signed short)stdoffset, R1_SP);
    } else {
      std(R0, (int)(signed short)stdoffset, R1_SP);
    }
  } else if (is_simm(stdoffset, 31)) {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);

    Register tmp = R11;
    addis(tmp, R1_SP, hi);
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, lo, tmp);
    } else {
      std(R0, lo, tmp);
    }
  } else {
    ShouldNotReachHere();
  }
}
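// Example (illustrative, assuming a 4 KB page size):
// bang_stack_with_offset(4096) takes the simm16 path and emits a single
//   std R0, -4096(R1_SP)
// or the corresponding ld when UseLoadInstructionsForStackBangingPPC64 is set.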
// If instruction is a stack bang of the form
//    std   R0,    x(Ry),      (see bang_stack_with_offset())
//    stdu  R1_SP, x(R1_SP),   (see push_frame(), resize_frame())
// or stdux R1_SP, Rx, R1_SP   (see push_frame(), resize_frame())
// return the banged address. Otherwise, return 0.
address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;
  int rs = inv_rs_field(instruction);
  int ra = inv_ra_field(instruction);
  if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
      || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
      || (is_stdu(instruction) && rs == 1)) {
    int ds = inv_ds_field(instruction);
    // return banged address
    return ds+(address)uc->uc_mcontext.regs->gpr[ra];
  } else if (is_stdux(instruction) && rs == 1) {
    int rb = inv_rb_field(instruction);
    address sp = (address)uc->uc_mcontext.regs->gpr[1];
    long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
                                  : sp + rb_val; // banged address
  }
  return NULL; // not a stack bang
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return NULL;
#endif
}

void MacroAssembler::reserved_stack_check(Register return_pc) {
  // Test if reserved zone needs to be enabled.
  Label no_reserved_zone_enabling;

  ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
  cmpld(CCR0, R1_SP, R0);
  blt_predict_taken(CCR0, no_reserved_zone_enabling);

  // Enable reserved zone again, throw stack overflow exception.
  push_frame_reg_args(0, R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
  pop_frame();
  mtlr(return_pc);
  load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
  mtctr(R0);
  bctr();

  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
                                bool cmpxchgx_hint) {
  Label retry;
  bind(retry);
  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  stdcx_(exchange_value, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
}

void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
                                Register tmp, bool cmpxchgx_hint) {
  Label retry;
  bind(retry);
  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  add(tmp, dest_current_value, inc_value);
  stdcx_(tmp, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
}
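// getandsetd/getandaddd above are instances of the classic
// load-reserve/store-conditional loop: ldarx establishes a reservation,
// stdcx_ succeeds only if the location was not written in the meantime
// (CCR0.eq set); otherwise the loop retries.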
// Word/sub-word atomic helper functions

// Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
// Only signed types are supported with size < 4.
// Atomic add always kills tmp1.
void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
                                                   Register addr_base, Register tmp1, Register tmp2, Register tmp3,
                                                   bool cmpxchgx_hint, bool is_add, int size) {
  // Sub-word instructions are available since Power 8.
  // For older processors, instruction_type != size holds, and we
  // emulate the sub-word instructions by constructing a 4-byte value
  // that leaves the other bytes unchanged.
  const int instruction_type = VM_Version::has_lqarx() ? size : 4;

  Label retry;
  Register shift_amount = noreg,
           val32 = dest_current_value,
           modval = is_add ? tmp1 : exchange_value;

  if (instruction_type != size) {
    assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
    modval = tmp1;
    shift_amount = tmp2;
    val32 = tmp3;
    // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
#ifdef VM_LITTLE_ENDIAN
    rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
    clrrdi(addr_base, addr_base, 2);
#else
    xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
    clrrdi(addr_base, addr_base, 2);
    rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
#endif
  }

  // atomic emulation loop
  bind(retry);

  switch (instruction_type) {
    case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
    case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
    case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
    default: ShouldNotReachHere();
  }

  if (instruction_type != size) {
    srw(dest_current_value, val32, shift_amount);
  }

  if (is_add) { add(modval, dest_current_value, exchange_value); }

  if (instruction_type != size) {
    // Transform exchange value such that the replacement can be done by one xor instruction.
    xorr(modval, dest_current_value, is_add ? modval : exchange_value);
    clrldi(modval, modval, (size == 1) ? 56 : 48);
    slw(modval, modval, shift_amount);
    xorr(modval, val32, modval);
  }

  switch (instruction_type) {
    case 4: stwcx_(modval, addr_base); break;
    case 2: sthcx_(modval, addr_base); break;
    case 1: stbcx_(modval, addr_base); break;
    default: ShouldNotReachHere();
  }

  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }

  // l?arx zero-extends, but Java wants byte/short values sign-extended.
  if (size == 1) {
    extsb(dest_current_value, dest_current_value);
  } else if (size == 2) {
    extsh(dest_current_value, dest_current_value);
  }
}
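// Note on the shift computation above (little-endian case):
// rldic(shift_amount, addr_base, 3, 64-5) rotates the address left by 3
// (i.e. multiplies by 8) and masks the result down to (addr_base & 3) * 8,
// the bit position of the addressed byte within its aligned 4-byte word.
// The big-endian case first mirrors the byte index with xori before
// applying the same rotate-and-mask.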
// Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
// Only signed types are supported with size < 4.
void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
                                       Register compare_value, Register exchange_value,
                                       Register addr_base, Register tmp1, Register tmp2,
                                       Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
  // Sub-word instructions are available since Power 8.
  // For older processors, instruction_type != size holds, and we
  // emulate the sub-word instructions by constructing a 4-byte value
  // that leaves the other bytes unchanged.
  const int instruction_type = VM_Version::has_lqarx() ? size : 4;

  Register shift_amount = noreg,
           val32 = dest_current_value,
           modval = exchange_value;

  if (instruction_type != size) {
    assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
    shift_amount = tmp1;
    val32 = tmp2;
    modval = tmp2;
    // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
#ifdef VM_LITTLE_ENDIAN
    rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
    clrrdi(addr_base, addr_base, 2);
#else
    xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
    clrrdi(addr_base, addr_base, 2);
    rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
#endif
    // Transform exchange value such that the replacement can be done by one xor instruction.
    xorr(exchange_value, compare_value, exchange_value);
    clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
    slw(exchange_value, exchange_value, shift_amount);
  }

  // atomic emulation loop
  bind(retry);

  switch (instruction_type) {
    case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
    case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
    case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
    default: ShouldNotReachHere();
  }

  if (instruction_type != size) {
    srw(dest_current_value, val32, shift_amount);
  }
  if (size == 1) {
    extsb(dest_current_value, dest_current_value);
  } else if (size == 2) {
    extsh(dest_current_value, dest_current_value);
  }

  cmpw(flag, dest_current_value, compare_value);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(flag, failed);
  } else {
    bne(                  flag, failed);
  }
  // branch to done  => (flag == ne), (dest_current_value != compare_value)
  // fall through    => (flag == eq), (dest_current_value == compare_value)

  if (instruction_type != size) {
    xorr(modval, val32, exchange_value);
  }

  switch (instruction_type) {
    case 4: stwcx_(modval, addr_base); break;
    case 2: sthcx_(modval, addr_base); break;
    case 1: stbcx_(modval, addr_base); break;
    default: ShouldNotReachHere();
  }
}
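// The `semantics' bits used by the cmpxchg variants below map to barriers
// as follows: MemBarRel issues release() before the update, and after a
// successful update MemBarFenceAfter issues fence() while MemBarAcq issues
// isync(). On this port release() is a lwsync and fence() a sync.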
1636 switch (size) {
1637 case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
1638 case 2: lha(dest_current_value, 0, addr_base); break;
1639 case 4: lwz(dest_current_value, 0, addr_base); break;
1640 default: ShouldNotReachHere();
1641 }
1642 cmpw(flag, dest_current_value, compare_value);
1643 bne(flag, failed);
1644 }
1645
1646 // release/fence semantics
1647 if (semantics & MemBarRel) {
1648 release();
1649 }
1650
1651 cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
1652 retry, failed, cmpxchgx_hint, size);
1653 if (!weak || use_result_reg) {
1654 if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1655 bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1656 } else {
1657 bne( CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1658 }
1659 }
1660 // fall through => (flag == eq), (dest_current_value == compare_value), (swapped)
1661
1662 // Result in register (must do this at the end because int_flag_success can be the
1663 // same register as one above).
1664 if (use_result_reg) {
1665 li(int_flag_success, 1);
1666 }
1667
1668 if (semantics & MemBarFenceAfter) {
1669 fence();
1670 } else if (semantics & MemBarAcq) {
1671 isync();
1672 }
1673
1674 if (use_result_reg && !preset_result_reg) {
1675 b(done);
1676 }
1677
1678 bind(failed);
1679 if (use_result_reg && !preset_result_reg) {
1680 li(int_flag_success, 0);
1681 }
1682
1683 bind(done);
1684 // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1685 // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1686 }
1687
1688 // Performs atomic compare exchange:
1689 // if (compare_value == *addr_base)
1690 // *addr_base = exchange_value
1691 // int_flag_success = 1;
1692 // else
1693 // int_flag_success = 0;
1694 //
1695 // ConditionRegister flag = cmp(compare_value, *addr_base)
1696 // Register dest_current_value = *addr_base
1697 // Register compare_value Used to compare with value in memory
1698 // Register exchange_value Written to memory if compare_value == *addr_base
1699 // Register addr_base The memory location to compareXChange
1700 // Register int_flag_success Set to 1 if exchange_value was written to *addr_base
1701 //
1702 // To avoid the costly compare-and-exchange, the value is tested beforehand.
1703 // Several special cases exist to avoid emitting unnecessary code.
1704 //
1705 void MacroAssembler::cmpxchgd(ConditionRegister flag,
1706 Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1707 Register addr_base, int semantics, bool cmpxchgx_hint,
1708 Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
1709 Label retry;
1710 Label failed_int;
1711 Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
1712 Label done;
1713
1714 // Save one branch if result is returned via register and result register is different from the other ones.
1715 bool use_result_reg = (int_flag_success!=noreg); 1716 bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() && 1717 int_flag_success!=exchange_value && int_flag_success!=addr_base); 1718 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1719 assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both"); 1720 1721 if (use_result_reg && preset_result_reg) { 1722 li(int_flag_success, 0); // preset (assume cas failed) 1723 } 1724 1725 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1726 if (contention_hint) { // Don't try to reserve if cmp fails. 1727 ld(dest_current_value, 0, addr_base); 1728 cmpd(flag, compare_value, dest_current_value); 1729 bne(flag, failed); 1730 } 1731 1732 // release/fence semantics 1733 if (semantics & MemBarRel) { 1734 release(); 1735 } 1736 1737 // atomic emulation loop 1738 bind(retry); 1739 1740 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1741 cmpd(flag, compare_value, dest_current_value); 1742 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1743 bne_predict_not_taken(flag, failed); 1744 } else { 1745 bne( flag, failed); 1746 } 1747 1748 stdcx_(exchange_value, addr_base); 1749 if (!weak || use_result_reg || failed_ext) { 1750 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1751 bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1752 } else { 1753 bne( CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1754 } 1755 } 1756 1757 // result in register (must do this at the end because int_flag_success can be the same register as one above) 1758 if (use_result_reg) { 1759 li(int_flag_success, 1); 1760 } 1761 1762 if (semantics & MemBarFenceAfter) { 1763 fence(); 1764 } else if (semantics & MemBarAcq) { 1765 isync(); 1766 } 1767 1768 if (use_result_reg && !preset_result_reg) { 1769 b(done); 1770 } 1771 1772 bind(failed_int); 1773 if (use_result_reg && !preset_result_reg) { 1774 li(int_flag_success, 0); 1775 } 1776 1777 bind(done); 1778 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1779 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1780 } 1781 1782 // Look up the method for a megamorphic invokeinterface call. 1783 // The target method is determined by <intf_klass, itable_index>. 1784 // The receiver klass is in recv_klass. 1785 // On success, the result will be in method_result, and execution falls through. 1786 // On failure, execution transfers to the given label. 1787 void MacroAssembler::lookup_interface_method(Register recv_klass, 1788 Register intf_klass, 1789 RegisterOrConstant itable_index, 1790 Register method_result, 1791 Register scan_temp, 1792 Register temp2, 1793 Label& L_no_such_interface, 1794 bool return_method) { 1795 assert_different_registers(recv_klass, intf_klass, method_result, scan_temp); 1796 1797 // Compute start of first itableOffsetEntry (which is at the end of the vtable). 1798 int vtable_base = in_bytes(Klass::vtable_start_offset()); 1799 int itentry_off = itableMethodEntry::method_offset_in_bytes(); 1800 int logMEsize = exact_log2(itableMethodEntry::size() * wordSize); 1801 int scan_step = itableOffsetEntry::size() * wordSize; 1802 int log_vte_size= exact_log2(vtableEntry::size_in_bytes()); 1803 1804 lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass); 1805 // %%% We should store the aligned, prescaled offset in the klassoop. 1806 // Then the next several instructions would fold away. 
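// What the next three instructions compute (illustrative sketch only):
//   scan_temp = recv_klass + vtable_start_offset + vtable_length * vtableEntry_size,
// i.e. the address of the first itableOffsetEntry, which is located right after the vtable.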
1807 1808 sldi(scan_temp, scan_temp, log_vte_size); 1809 addi(scan_temp, scan_temp, vtable_base); 1810 add(scan_temp, recv_klass, scan_temp); 1811 1812 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 1813 if (return_method) { 1814 if (itable_index.is_register()) { 1815 Register itable_offset = itable_index.as_register(); 1816 sldi(method_result, itable_offset, logMEsize); 1817 if (itentry_off) { addi(method_result, method_result, itentry_off); } 1818 add(method_result, method_result, recv_klass); 1819 } else { 1820 long itable_offset = (long)itable_index.as_constant(); 1821 // static address, no relocation 1822 add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2); 1823 } 1824 } 1825 1826 // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) { 1827 // if (scan->interface() == intf) { 1828 // result = (klass + scan->offset() + itable_index); 1829 // } 1830 // } 1831 Label search, found_method; 1832 1833 for (int peel = 1; peel >= 0; peel--) { 1834 // %%%% Could load both offset and interface in one ldx, if they were 1835 // in the opposite order. This would save a load. 1836 ld(temp2, itableOffsetEntry::interface_offset_in_bytes(), scan_temp); 1837 1838 // Check that this entry is non-null. A null entry means that 1839 // the receiver class doesn't implement the interface, and wasn't the 1840 // same as when the caller was compiled. 1841 cmpd(CCR0, temp2, intf_klass); 1842 1843 if (peel) { 1844 beq(CCR0, found_method); 1845 } else { 1846 bne(CCR0, search); 1847 // (invert the test to fall through to found_method...) 1848 } 1849 1850 if (!peel) break; 1851 1852 bind(search); 1853 1854 cmpdi(CCR0, temp2, 0); 1855 beq(CCR0, L_no_such_interface); 1856 addi(scan_temp, scan_temp, scan_step); 1857 } 1858 1859 bind(found_method); 1860 1861 // Got a hit. 
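// scan_temp now points at the matching itableOffsetEntry. If return_method,
// the code below is, in effect (illustrative sketch only):
//   method_result = *(Method**)(recv_klass + scan->offset()
//                               + itable_index * itableMethodEntry_size + method_offset);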
1862 if (return_method) { 1863 int ito_offset = itableOffsetEntry::offset_offset_in_bytes(); 1864 lwz(scan_temp, ito_offset, scan_temp); 1865 ldx(method_result, scan_temp, method_result); 1866 } 1867 } 1868 1869 // virtual method calling 1870 void MacroAssembler::lookup_virtual_method(Register recv_klass, 1871 RegisterOrConstant vtable_index, 1872 Register method_result) { 1873 1874 assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg()); 1875 1876 const int base = in_bytes(Klass::vtable_start_offset()); 1877 assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 1878 1879 if (vtable_index.is_register()) { 1880 sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord); 1881 add(recv_klass, vtable_index.as_register(), recv_klass); 1882 } else { 1883 addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord); 1884 } 1885 ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass); 1886 } 1887 1888 /////////////////////////////////////////// subtype checking //////////////////////////////////////////// 1889 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 1890 Register super_klass, 1891 Register temp1_reg, 1892 Register temp2_reg, 1893 Label* L_success, 1894 Label* L_failure, 1895 Label* L_slow_path, 1896 RegisterOrConstant super_check_offset) { 1897 1898 const Register check_cache_offset = temp1_reg; 1899 const Register cached_super = temp2_reg; 1900 1901 assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super); 1902 1903 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1904 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1905 1906 bool must_load_sco = (super_check_offset.constant_or_zero() == -1); 1907 bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset); 1908 1909 Label L_fallthrough; 1910 int label_nulls = 0; 1911 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 1912 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 1913 if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; } 1914 assert(label_nulls <= 1 || 1915 (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path), 1916 "at most one NULL in the batch, usually"); 1917 1918 // If the pointers are equal, we are done (e.g., String[] elements). 1919 // This self-check enables sharing of secondary supertype arrays among 1920 // non-primary types such as array-of-interface. Otherwise, each such 1921 // type would need its own customized SSA. 1922 // We move this check to the front of the fast path because many 1923 // type checks are in fact trivially successful in this manner, 1924 // so we get a nicely predicted branch right at the start of the check. 1925 cmpd(CCR0, sub_klass, super_klass); 1926 beq(CCR0, *L_success); 1927 1928 // Check the supertype display: 1929 if (must_load_sco) { 1930 // The super check offset is always positive... 1931 lwz(check_cache_offset, sco_offset, super_klass); 1932 super_check_offset = RegisterOrConstant(check_cache_offset); 1933 // super_check_offset is register. 1934 assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register()); 1935 } 1936 // The loaded value is the offset from KlassOopDesc. 1937 1938 ld(cached_super, super_check_offset, sub_klass); 1939 cmpd(CCR0, cached_super, super_klass); 1940 1941 // This check has worked decisively for primary supers. 
1942 // Secondary supers are sought in the super_cache ('super_cache_addr'). 1943 // (Secondary supers are interfaces and very deeply nested subtypes.) 1944 // This works in the same check above because of a tricky aliasing 1945 // between the super_cache and the primary super display elements. 1946 // (The 'super_check_addr' can address either, as the case requires.) 1947 // Note that the cache is updated below if it does not help us find 1948 // what we need immediately. 1949 // So if it was a primary super, we can just fail immediately. 1950 // Otherwise, it's the slow path for us (no success at this point). 1951 1952 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); } 1953 1954 if (super_check_offset.is_register()) { 1955 beq(CCR0, *L_success); 1956 cmpwi(CCR0, super_check_offset.as_register(), sc_offset); 1957 if (L_failure == &L_fallthrough) { 1958 beq(CCR0, *L_slow_path); 1959 } else { 1960 bne(CCR0, *L_failure); 1961 FINAL_JUMP(*L_slow_path); 1962 } 1963 } else { 1964 if (super_check_offset.as_constant() == sc_offset) { 1965 // Need a slow path; fast failure is impossible. 1966 if (L_slow_path == &L_fallthrough) { 1967 beq(CCR0, *L_success); 1968 } else { 1969 bne(CCR0, *L_slow_path); 1970 FINAL_JUMP(*L_success); 1971 } 1972 } else { 1973 // No slow path; it's a fast decision. 1974 if (L_failure == &L_fallthrough) { 1975 beq(CCR0, *L_success); 1976 } else { 1977 bne(CCR0, *L_failure); 1978 FINAL_JUMP(*L_success); 1979 } 1980 } 1981 } 1982 1983 bind(L_fallthrough); 1984 #undef FINAL_JUMP 1985 } 1986 1987 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 1988 Register super_klass, 1989 Register temp1_reg, 1990 Register temp2_reg, 1991 Label* L_success, 1992 Register result_reg) { 1993 const Register array_ptr = temp1_reg; // current value from cache array 1994 const Register temp = temp2_reg; 1995 1996 assert_different_registers(sub_klass, super_klass, array_ptr, temp); 1997 1998 int source_offset = in_bytes(Klass::secondary_supers_offset()); 1999 int target_offset = in_bytes(Klass::secondary_super_cache_offset()); 2000 2001 int length_offset = Array<Klass*>::length_offset_in_bytes(); 2002 int base_offset = Array<Klass*>::base_offset_in_bytes(); 2003 2004 Label hit, loop, failure, fallthru; 2005 2006 ld(array_ptr, source_offset, sub_klass); 2007 2008 // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated."); 2009 lwz(temp, length_offset, array_ptr); 2010 cmpwi(CCR0, temp, 0); 2011 beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0 2012 2013 mtctr(temp); // load ctr 2014 2015 bind(loop); 2016 // Oops in table are NO MORE compressed. 
2017 ld(temp, base_offset, array_ptr); 2018 cmpd(CCR0, temp, super_klass); 2019 beq(CCR0, hit); 2020 addi(array_ptr, array_ptr, BytesPerWord); 2021 bdnz(loop); 2022 2023 bind(failure); 2024 if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss) 2025 b(fallthru); 2026 2027 bind(hit); 2028 std(super_klass, target_offset, sub_klass); // save result to cache 2029 if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit) 2030 if (L_success != NULL) { b(*L_success); } 2031 else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided 2032 2033 bind(fallthru); 2034 } 2035 2036 // Try fast path, then go to slow one if not successful 2037 void MacroAssembler::check_klass_subtype(Register sub_klass, 2038 Register super_klass, 2039 Register temp1_reg, 2040 Register temp2_reg, 2041 Label& L_success) { 2042 Label L_failure; 2043 check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure); 2044 check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success); 2045 bind(L_failure); // Fallthru if not successful. 2046 } 2047 2048 void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg, 2049 Register temp_reg, 2050 Label& wrong_method_type) { 2051 assert_different_registers(mtype_reg, mh_reg, temp_reg); 2052 // Compare method type against that of the receiver. 2053 load_heap_oop_not_null(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg); 2054 cmpd(CCR0, temp_reg, mtype_reg); 2055 bne(CCR0, wrong_method_type); 2056 } 2057 2058 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot, 2059 Register temp_reg, 2060 int extra_slot_offset) { 2061 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 2062 int stackElementSize = Interpreter::stackElementSize; 2063 int offset = extra_slot_offset * stackElementSize; 2064 if (arg_slot.is_constant()) { 2065 offset += arg_slot.as_constant() * stackElementSize; 2066 return offset; 2067 } else { 2068 assert(temp_reg != noreg, "must specify"); 2069 sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize)); 2070 if (offset != 0) 2071 addi(temp_reg, temp_reg, offset); 2072 return temp_reg; 2073 } 2074 } 2075 2076 // Supports temp2_reg = R0. 2077 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg, 2078 Register mark_reg, Register temp_reg, 2079 Register temp2_reg, Label& done, Label* slow_case) { 2080 assert(UseBiasedLocking, "why call this otherwise?"); 2081 2082 #ifdef ASSERT 2083 assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg); 2084 #endif 2085 2086 Label cas_label; 2087 2088 // Branch to done if fast path fails and no slow_case provided. 2089 Label *slow_case_int = (slow_case != NULL) ? 
slow_case : &done; 2090 2091 // Biased locking 2092 // See whether the lock is currently biased toward our thread and 2093 // whether the epoch is still valid 2094 // Note that the runtime guarantees sufficient alignment of JavaThread 2095 // pointers to allow age to be placed into low bits 2096 assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, 2097 "biased locking makes assumptions about bit layout"); 2098 2099 if (PrintBiasedLockingStatistics) { 2100 load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg); 2101 lwzx(temp_reg, temp2_reg); 2102 addi(temp_reg, temp_reg, 1); 2103 stwx(temp_reg, temp2_reg); 2104 } 2105 2106 andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place); 2107 cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern); 2108 bne(cr_reg, cas_label); 2109 2110 load_klass(temp_reg, obj_reg); 2111 2112 load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place)); 2113 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 2114 orr(temp_reg, R16_thread, temp_reg); 2115 xorr(temp_reg, mark_reg, temp_reg); 2116 andr(temp_reg, temp_reg, temp2_reg); 2117 cmpdi(cr_reg, temp_reg, 0); 2118 if (PrintBiasedLockingStatistics) { 2119 Label l; 2120 bne(cr_reg, l); 2121 load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr()); 2122 lwzx(mark_reg, temp2_reg); 2123 addi(mark_reg, mark_reg, 1); 2124 stwx(mark_reg, temp2_reg); 2125 // restore mark_reg 2126 ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg); 2127 bind(l); 2128 } 2129 beq(cr_reg, done); 2130 2131 Label try_revoke_bias; 2132 Label try_rebias; 2133 2134 // At this point we know that the header has the bias pattern and 2135 // that we are not the bias owner in the current epoch. We need to 2136 // figure out more details about the state of the header in order to 2137 // know what operations can be legally performed on the object's 2138 // header. 2139 2140 // If the low three bits in the xor result aren't clear, that means 2141 // the prototype header is no longer biased and we have to revoke 2142 // the bias on this object. 2143 andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 2144 cmpwi(cr_reg, temp2_reg, 0); 2145 bne(cr_reg, try_revoke_bias); 2146 2147 // Biasing is still enabled for this data type. See whether the 2148 // epoch of the current bias is still valid, meaning that the epoch 2149 // bits of the mark word are equal to the epoch bits of the 2150 // prototype header. (Note that the prototype header's epoch bits 2151 // only change at a safepoint.) If not, attempt to rebias the object 2152 // toward the current thread. Note that we must be absolutely sure 2153 // that the current epoch is invalid in order to do this because 2154 // otherwise the manipulations it performs on the mark word are 2155 // illegal. 2156 2157 int shift_amount = 64 - markOopDesc::epoch_shift; 2158 // rotate epoch bits to right (little) end and set other bits to 0 2159 // [ big part | epoch | little part ] -> [ 0..0 | epoch ] 2160 rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits); 2161 // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented 2162 bne(CCR0, try_rebias); 2163 2164 // The epoch of the current bias is still valid but we know nothing 2165 // about the owner; it might be set or it might be clear. Try to 2166 // acquire the bias of the object using an atomic operation. 
If this
2167 // fails we will go into the runtime to revoke the object's bias.
2168 // Note that we first construct the presumed unbiased header so we
2169 // don't accidentally blow away another thread's valid bias.
2170 andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place |
2171 markOopDesc::age_mask_in_place |
2172 markOopDesc::epoch_mask_in_place));
2173 orr(temp_reg, R16_thread, mark_reg);
2174
2175 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2176
2177 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2178 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2179 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2180 /*where=*/obj_reg,
2181 MacroAssembler::MemBarAcq,
2182 MacroAssembler::cmpxchgx_hint_acquire_lock(),
2183 noreg, slow_case_int); // bail out if failed
2184
2185 // If the biasing toward our thread failed, this means that
2186 // another thread succeeded in biasing it toward itself and we
2187 // need to revoke that bias. The revocation will occur in the
2188 // interpreter runtime in the slow case.
2189 if (PrintBiasedLockingStatistics) {
2190 load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg);
2191 lwzx(temp_reg, temp2_reg);
2192 addi(temp_reg, temp_reg, 1);
2193 stwx(temp_reg, temp2_reg);
2194 }
2195 b(done);
2196
2197 bind(try_rebias);
2198 // At this point we know the epoch has expired, meaning that the
2199 // current "bias owner", if any, is actually invalid. Under these
2200 // circumstances _only_, we are allowed to use the current header's
2201 // value as the comparison value when doing the cas to acquire the
2202 // bias in the current epoch. In other words, we allow transfer of
2203 // the bias from one thread to another directly in this situation.
2204 load_klass(temp_reg, obj_reg);
2205 andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
2206 orr(temp2_reg, R16_thread, temp2_reg);
2207 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2208 orr(temp_reg, temp2_reg, temp_reg);
2209
2210 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2211
2212 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2213 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2214 /*where=*/obj_reg,
2215 MacroAssembler::MemBarAcq,
2216 MacroAssembler::cmpxchgx_hint_acquire_lock(),
2217 noreg, slow_case_int); // bail out if failed
2218
2219 // If the biasing toward our thread failed, this means that
2220 // another thread succeeded in biasing it toward itself and we
2221 // need to revoke that bias. The revocation will occur in the
2222 // interpreter runtime in the slow case.
2223 if (PrintBiasedLockingStatistics) {
2224 load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg);
2225 lwzx(temp_reg, temp2_reg);
2226 addi(temp_reg, temp_reg, 1);
2227 stwx(temp_reg, temp2_reg);
2228 }
2229 b(done);
2230
2231 bind(try_revoke_bias);
2232 // The prototype mark in the klass doesn't have the bias bit set any
2233 // more, indicating that objects of this data type are not supposed
2234 // to be biased any more. We are going to try to reset the mark of
2235 // this object to the prototype value and fall through to the
2236 // CAS-based locking scheme. Note that if our CAS fails, it means
2237 // that another thread raced us for the privilege of revoking the
2238 // bias of this particular object, so it's okay to continue in the
2239 // normal locking code.
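// The revoke attempt below is, in effect (illustrative sketch only):
//   CAS(&obj->mark, mark_reg, klass->prototype_header | (mark_reg & age_mask));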
2240 load_klass(temp_reg, obj_reg); 2241 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 2242 andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place); 2243 orr(temp_reg, temp_reg, temp2_reg); 2244 2245 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2246 2247 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg). 2248 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 2249 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 2250 /*where=*/obj_reg, 2251 MacroAssembler::MemBarAcq, 2252 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2253 2254 // reload markOop in mark_reg before continuing with lightweight locking 2255 ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg); 2256 2257 // Fall through to the normal CAS-based lock, because no matter what 2258 // the result of the above CAS, some thread must have succeeded in 2259 // removing the bias bit from the object's header. 2260 if (PrintBiasedLockingStatistics) { 2261 Label l; 2262 bne(cr_reg, l); 2263 load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg); 2264 lwzx(temp_reg, temp2_reg); 2265 addi(temp_reg, temp_reg, 1); 2266 stwx(temp_reg, temp2_reg); 2267 bind(l); 2268 } 2269 2270 bind(cas_label); 2271 } 2272 2273 void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) { 2274 // Check for biased locking unlock case, which is a no-op 2275 // Note: we do not have to check the thread ID for two reasons. 2276 // First, the interpreter checks for IllegalMonitorStateException at 2277 // a higher level. Second, if the bias was revoked while we held the 2278 // lock, the object could not be rebiased toward another thread, so 2279 // the bias bit would be clear. 
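// A sketch of the check below (illustrative only):
//   if ((*mark_addr & biased_lock_mask) == biased_lock_pattern) goto done;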
2280 2281 ld(temp_reg, 0, mark_addr); 2282 andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 2283 2284 cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern); 2285 beq(cr_reg, done); 2286 } 2287 2288 // allocation (for C1) 2289 void MacroAssembler::eden_allocate( 2290 Register obj, // result: pointer to object after successful allocation 2291 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2292 int con_size_in_bytes, // object size in bytes if known at compile time 2293 Register t1, // temp register 2294 Register t2, // temp register 2295 Label& slow_case // continuation point if fast allocation fails 2296 ) { 2297 b(slow_case); 2298 } 2299 2300 void MacroAssembler::tlab_allocate( 2301 Register obj, // result: pointer to object after successful allocation 2302 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2303 int con_size_in_bytes, // object size in bytes if known at compile time 2304 Register t1, // temp register 2305 Label& slow_case // continuation point if fast allocation fails 2306 ) { 2307 // make sure arguments make sense 2308 assert_different_registers(obj, var_size_in_bytes, t1); 2309 assert(0 <= con_size_in_bytes && is_simm13(con_size_in_bytes), "illegal object size"); 2310 assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment"); 2311 2312 const Register new_top = t1; 2313 //verify_tlab(); not implemented 2314 2315 ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2316 ld(R0, in_bytes(JavaThread::tlab_fast_path_end_offset()), R16_thread); 2317 if (var_size_in_bytes == noreg) { 2318 addi(new_top, obj, con_size_in_bytes); 2319 } else { 2320 add(new_top, obj, var_size_in_bytes); 2321 } 2322 cmpld(CCR0, new_top, R0); 2323 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case); 2324 2325 #ifdef ASSERT 2326 // make sure new free pointer is properly aligned 2327 { 2328 Label L; 2329 andi_(R0, new_top, MinObjAlignmentInBytesMask); 2330 beq(CCR0, L); 2331 stop("updated TLAB free is not properly aligned", 0x934); 2332 bind(L); 2333 } 2334 #endif // ASSERT 2335 2336 // update the tlab top pointer 2337 std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2338 //verify_tlab(); not implemented 2339 } 2340 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) { 2341 unimplemented("incr_allocated_bytes"); 2342 } 2343 2344 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset, 2345 int insts_call_instruction_offset, Register Rtoc) { 2346 // Start the stub. 2347 address stub = start_a_stub(64); 2348 if (stub == NULL) { return NULL; } // CodeCache full: bail out 2349 2350 // Create a trampoline stub relocation which relates this trampoline stub 2351 // with the call instruction at insts_call_instruction_offset in the 2352 // instructions code-section. 2353 relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset)); 2354 const int stub_start_offset = offset(); 2355 2356 // For java_to_interp stubs we use R11_scratch1 as scratch register 2357 // and in call trampoline stubs we use R12_scratch2. This way we 2358 // can distinguish them (see is_NativeCallTrampolineStub_at()). 
2359 Register reg_scratch = R12_scratch2; 2360 2361 // Now, create the trampoline stub's code: 2362 // - load the TOC 2363 // - load the call target from the constant pool 2364 // - call 2365 if (Rtoc == noreg) { 2366 calculate_address_from_global_toc(reg_scratch, method_toc()); 2367 Rtoc = reg_scratch; 2368 } 2369 2370 ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false); 2371 mtctr(reg_scratch); 2372 bctr(); 2373 2374 const address stub_start_addr = addr_at(stub_start_offset); 2375 2376 // Assert that the encoded destination_toc_offset can be identified and that it is correct. 2377 assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(), 2378 "encoded offset into the constant pool must match"); 2379 // Trampoline_stub_size should be good. 2380 assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size"); 2381 assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline"); 2382 2383 // End the stub. 2384 end_a_stub(); 2385 return stub; 2386 } 2387 2388 // TM on PPC64. 2389 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) { 2390 Label retry; 2391 bind(retry); 2392 ldarx(result, addr, /*hint*/ false); 2393 addi(result, result, simm16); 2394 stdcx_(result, addr); 2395 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2396 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2397 } else { 2398 bne( CCR0, retry); // stXcx_ sets CCR0 2399 } 2400 } 2401 2402 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) { 2403 Label retry; 2404 bind(retry); 2405 lwarx(result, addr, /*hint*/ false); 2406 ori(result, result, uimm16); 2407 stwcx_(result, addr); 2408 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2409 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2410 } else { 2411 bne( CCR0, retry); // stXcx_ sets CCR0 2412 } 2413 } 2414 2415 #if INCLUDE_RTM_OPT 2416 2417 // Update rtm_counters based on abort status 2418 // input: abort_status 2419 // rtm_counters (RTMLockingCounters*) 2420 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) { 2421 // Mapping to keep PreciseRTMLockingStatistics similar to x86. 2422 // x86 ppc (! means inverted, ? means not the same) 2423 // 0 31 Set if abort caused by XABORT instruction. 2424 // 1 ! 7 If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set. 2425 // 2 13 Set if another logical processor conflicted with a memory address that was part of the transaction that aborted. 2426 // 3 10 Set if an internal buffer overflowed. 2427 // 4 ?12 Set if a debug breakpoint was hit. 2428 // 5 ?32 Set if an abort occurred during execution of a nested transaction. 2429 const int tm_failure_bit[] = {Assembler::tm_tabort, // Note: Seems like signal handler sets this, too. 2430 Assembler::tm_failure_persistent, // inverted: transient 2431 Assembler::tm_trans_cf, 2432 Assembler::tm_footprint_of, 2433 Assembler::tm_non_trans_cf, 2434 Assembler::tm_suspended}; 2435 const bool tm_failure_inv[] = {false, true, false, false, false, false}; 2436 assert(sizeof(tm_failure_bit)/sizeof(int) == RTMLockingCounters::ABORT_STATUS_LIMIT, "adapt mapping!"); 2437 2438 const Register addr_Reg = R0; 2439 // Keep track of offset to where rtm_counters_Reg had pointed to. 
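// (These counters are statistics only, so they are bumped with plain
// load/add/store sequences below; the occasional lost update is tolerated.)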
2440 int counters_offs = RTMLockingCounters::abort_count_offset(); 2441 addi(addr_Reg, rtm_counters_Reg, counters_offs); 2442 const Register temp_Reg = rtm_counters_Reg; 2443 2444 //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically 2445 ldx(temp_Reg, addr_Reg); 2446 addi(temp_Reg, temp_Reg, 1); 2447 stdx(temp_Reg, addr_Reg); 2448 2449 if (PrintPreciseRTMLockingStatistics) { 2450 int counters_offs_delta = RTMLockingCounters::abortX_count_offset() - counters_offs; 2451 2452 //mftexasr(abort_status); done by caller 2453 for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) { 2454 counters_offs += counters_offs_delta; 2455 li(temp_Reg, counters_offs_delta); // can't use addi with R0 2456 add(addr_Reg, addr_Reg, temp_Reg); // point to next counter 2457 counters_offs_delta = sizeof(uintx); 2458 2459 Label check_abort; 2460 rldicr_(temp_Reg, abort_status, tm_failure_bit[i], 0); 2461 if (tm_failure_inv[i]) { 2462 bne(CCR0, check_abort); 2463 } else { 2464 beq(CCR0, check_abort); 2465 } 2466 //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically 2467 ldx(temp_Reg, addr_Reg); 2468 addi(temp_Reg, temp_Reg, 1); 2469 stdx(temp_Reg, addr_Reg); 2470 bind(check_abort); 2471 } 2472 } 2473 li(temp_Reg, -counters_offs); // can't use addi with R0 2474 add(rtm_counters_Reg, addr_Reg, temp_Reg); // restore 2475 } 2476 2477 // Branch if (random & (count-1) != 0), count is 2^n 2478 // tmp and CR0 are killed 2479 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) { 2480 mftb(tmp); 2481 andi_(tmp, tmp, count-1); 2482 bne(CCR0, brLabel); 2483 } 2484 2485 // Perform abort ratio calculation, set no_rtm bit if high ratio. 2486 // input: rtm_counters_Reg (RTMLockingCounters* address) - KILLED 2487 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg, 2488 RTMLockingCounters* rtm_counters, 2489 Metadata* method_data) { 2490 Label L_done, L_check_always_rtm1, L_check_always_rtm2; 2491 2492 if (RTMLockingCalculationDelay > 0) { 2493 // Delay calculation. 2494 ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr()); 2495 cmpdi(CCR0, rtm_counters_Reg, 0); 2496 beq(CCR0, L_done); 2497 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2498 } 2499 // Abort ratio calculation only if abort_count > RTMAbortThreshold. 2500 // Aborted transactions = abort_count * 100 2501 // All transactions = total_count * RTMTotalCountIncrRate 2502 // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio) 2503 ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg); 2504 if (is_simm(RTMAbortThreshold, 16)) { // cmpdi can handle 16bit immediate only. 2505 cmpdi(CCR0, R0, RTMAbortThreshold); 2506 blt(CCR0, L_check_always_rtm2); // reload of rtm_counters_Reg not necessary 2507 } else { 2508 load_const_optimized(rtm_counters_Reg, RTMAbortThreshold); 2509 cmpd(CCR0, R0, rtm_counters_Reg); 2510 blt(CCR0, L_check_always_rtm1); // reload of rtm_counters_Reg required 2511 } 2512 mulli(R0, R0, 100); 2513 2514 const Register tmpReg = rtm_counters_Reg; 2515 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2516 mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16 2517 mulli(tmpReg, tmpReg, RTMAbortRatio); // allowable range: int16 2518 cmpd(CCR0, R0, tmpReg); 2519 blt(CCR0, L_check_always_rtm1); // jump to reload 2520 if (method_data != NULL) { 2521 // Set rtm_state to "no rtm" in MDO. 
2522 // Not using a metadata relocation. Method and Class Loader are kept alive anyway. 2523 // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.) 2524 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2525 atomic_ori_int(R0, tmpReg, NoRTM); 2526 } 2527 b(L_done); 2528 2529 bind(L_check_always_rtm1); 2530 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2531 bind(L_check_always_rtm2); 2532 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2533 int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate; 2534 if (is_simm(thresholdValue, 16)) { // cmpdi can handle 16bit immediate only. 2535 cmpdi(CCR0, tmpReg, thresholdValue); 2536 } else { 2537 load_const_optimized(R0, thresholdValue); 2538 cmpd(CCR0, tmpReg, R0); 2539 } 2540 blt(CCR0, L_done); 2541 if (method_data != NULL) { 2542 // Set rtm_state to "always rtm" in MDO. 2543 // Not using a metadata relocation. See above. 2544 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2545 atomic_ori_int(R0, tmpReg, UseRTM); 2546 } 2547 bind(L_done); 2548 } 2549 2550 // Update counters and perform abort ratio calculation. 2551 // input: abort_status_Reg 2552 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg, 2553 RTMLockingCounters* rtm_counters, 2554 Metadata* method_data, 2555 bool profile_rtm) { 2556 2557 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2558 // Update rtm counters based on state at abort. 2559 // Reads abort_status_Reg, updates flags. 2560 assert_different_registers(abort_status_Reg, temp_Reg); 2561 load_const_optimized(temp_Reg, (address)rtm_counters, R0); 2562 rtm_counters_update(abort_status_Reg, temp_Reg); 2563 if (profile_rtm) { 2564 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2565 rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data); 2566 } 2567 } 2568 2569 // Retry on abort if abort's status indicates non-persistent failure. 2570 // inputs: retry_count_Reg 2571 // : abort_status_Reg 2572 // output: retry_count_Reg decremented by 1 2573 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, 2574 Label& retryLabel, Label* checkRetry) { 2575 Label doneRetry; 2576 rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0); 2577 bne(CCR0, doneRetry); 2578 if (checkRetry) { bind(*checkRetry); } 2579 addic_(retry_count_Reg, retry_count_Reg, -1); 2580 blt(CCR0, doneRetry); 2581 b(retryLabel); 2582 bind(doneRetry); 2583 } 2584 2585 // Spin and retry if lock is busy. 
2586 // inputs: owner_addr_Reg (monitor address) 2587 // : retry_count_Reg 2588 // output: retry_count_Reg decremented by 1 2589 // CTR is killed 2590 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) { 2591 Label SpinLoop, doneRetry, doRetry; 2592 addic_(retry_count_Reg, retry_count_Reg, -1); 2593 blt(CCR0, doneRetry); 2594 2595 if (RTMSpinLoopCount > 1) { 2596 li(R0, RTMSpinLoopCount); 2597 mtctr(R0); 2598 } 2599 2600 // low thread priority 2601 smt_prio_low(); 2602 bind(SpinLoop); 2603 2604 if (RTMSpinLoopCount > 1) { 2605 bdz(doRetry); 2606 ld(R0, 0, owner_addr_Reg); 2607 cmpdi(CCR0, R0, 0); 2608 bne(CCR0, SpinLoop); 2609 } 2610 2611 bind(doRetry); 2612 2613 // restore thread priority to default in userspace 2614 #ifdef LINUX 2615 smt_prio_medium_low(); 2616 #else 2617 smt_prio_medium(); 2618 #endif 2619 2620 b(retryLabel); 2621 2622 bind(doneRetry); 2623 } 2624 2625 // Use RTM for normal stack locks. 2626 // Input: objReg (object to lock) 2627 void MacroAssembler::rtm_stack_locking(ConditionRegister flag, 2628 Register obj, Register mark_word, Register tmp, 2629 Register retry_on_abort_count_Reg, 2630 RTMLockingCounters* stack_rtm_counters, 2631 Metadata* method_data, bool profile_rtm, 2632 Label& DONE_LABEL, Label& IsInflated) { 2633 assert(UseRTMForStackLocks, "why call this otherwise?"); 2634 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); 2635 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2636 2637 if (RTMRetryCount > 0) { 2638 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 2639 bind(L_rtm_retry); 2640 } 2641 andi_(R0, mark_word, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased 2642 bne(CCR0, IsInflated); 2643 2644 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2645 Label L_noincrement; 2646 if (RTMTotalCountIncrRate > 1) { 2647 branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement); 2648 } 2649 assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM"); 2650 load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0); 2651 //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically 2652 ldx(mark_word, tmp); 2653 addi(mark_word, mark_word, 1); 2654 stdx(mark_word, tmp); 2655 bind(L_noincrement); 2656 } 2657 tbegin_(); 2658 beq(CCR0, L_on_abort); 2659 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // Reload in transaction, conflicts need to be tracked. 2660 andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits 2661 cmpwi(flag, R0, markOopDesc::unlocked_value); // bits = 001 unlocked 2662 beq(flag, DONE_LABEL); // all done if unlocked 2663 2664 if (UseRTMXendForLockBusy) { 2665 tend_(); 2666 b(L_decrement_retry); 2667 } else { 2668 tabort_(); 2669 } 2670 bind(L_on_abort); 2671 const Register abort_status_Reg = tmp; 2672 mftexasr(abort_status_Reg); 2673 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2674 rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm); 2675 } 2676 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload 2677 if (RTMRetryCount > 0) { 2678 // Retry on lock abort if abort status is not permanent. 
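// In effect (illustrative sketch): if (!texasr.persistent && --retry_count >= 0) goto L_rtm_retry;
// L_decrement_retry enters this sequence after the persistence check.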
2679 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry); 2680 } else { 2681 bind(L_decrement_retry); 2682 } 2683 } 2684 2685 // Use RTM for inflating locks 2686 // inputs: obj (object to lock) 2687 // mark_word (current header - KILLED) 2688 // boxReg (on-stack box address (displaced header location) - KILLED) 2689 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag, 2690 Register obj, Register mark_word, Register boxReg, 2691 Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg, 2692 RTMLockingCounters* rtm_counters, 2693 Metadata* method_data, bool profile_rtm, 2694 Label& DONE_LABEL) { 2695 assert(UseRTMLocking, "why call this otherwise?"); 2696 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2697 // Clean monitor_value bit to get valid pointer. 2698 int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value; 2699 2700 // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark(). 2701 std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg); 2702 const Register tmpReg = boxReg; 2703 const Register owner_addr_Reg = mark_word; 2704 addi(owner_addr_Reg, mark_word, owner_offset); 2705 2706 if (RTMRetryCount > 0) { 2707 load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy. 2708 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort. 2709 bind(L_rtm_retry); 2710 } 2711 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2712 Label L_noincrement; 2713 if (RTMTotalCountIncrRate > 1) { 2714 branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement); 2715 } 2716 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2717 load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg); 2718 //atomic_inc_ptr(R0, tmpReg); We don't increment atomically 2719 ldx(tmpReg, R0); 2720 addi(tmpReg, tmpReg, 1); 2721 stdx(tmpReg, R0); 2722 bind(L_noincrement); 2723 } 2724 tbegin_(); 2725 beq(CCR0, L_on_abort); 2726 // We don't reload mark word. Will only be reset at safepoint. 2727 ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked. 2728 cmpdi(flag, R0, 0); 2729 beq(flag, DONE_LABEL); 2730 2731 if (UseRTMXendForLockBusy) { 2732 tend_(); 2733 b(L_decrement_retry); 2734 } else { 2735 tabort_(); 2736 } 2737 bind(L_on_abort); 2738 const Register abort_status_Reg = tmpReg; 2739 mftexasr(abort_status_Reg); 2740 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2741 rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm); 2742 // Restore owner_addr_Reg 2743 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); 2744 #ifdef ASSERT 2745 andi_(R0, mark_word, markOopDesc::monitor_value); 2746 asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint. 2747 #endif 2748 addi(owner_addr_Reg, mark_word, owner_offset); 2749 } 2750 if (RTMRetryCount > 0) { 2751 // Retry on lock abort if abort status is not permanent. 2752 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); 2753 } 2754 2755 // Appears unlocked - try to swing _owner from null to non-null. 
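// Sketch (illustrative only): if (!CAS(&monitor->_owner, NULL, current_thread)) goto L_decrement_retry;
// (a weak cmpxchg is used, so spurious failures also take the retry path).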
2756 cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg, 2757 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2758 MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true); 2759 2760 if (RTMRetryCount > 0) { 2761 // success done else retry 2762 b(DONE_LABEL); 2763 bind(L_decrement_retry); 2764 // Spin and retry if lock is busy. 2765 rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry); 2766 } else { 2767 bind(L_decrement_retry); 2768 } 2769 } 2770 2771 #endif // INCLUDE_RTM_OPT 2772 2773 // "The box" is the space on the stack where we copy the object mark. 2774 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box, 2775 Register temp, Register displaced_header, Register current_header, 2776 bool try_bias, 2777 RTMLockingCounters* rtm_counters, 2778 RTMLockingCounters* stack_rtm_counters, 2779 Metadata* method_data, 2780 bool use_rtm, bool profile_rtm) { 2781 assert_different_registers(oop, box, temp, displaced_header, current_header); 2782 assert(flag != CCR0, "bad condition register"); 2783 Label cont; 2784 Label object_has_monitor; 2785 Label cas_failed; 2786 2787 // Load markOop from object into displaced_header. 2788 ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop); 2789 2790 2791 // Always do locking in runtime. 2792 if (EmitSync & 0x01) { 2793 cmpdi(flag, oop, 0); // Oop can't be 0 here => always false. 2794 return; 2795 } 2796 2797 if (try_bias) { 2798 biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont); 2799 } 2800 2801 #if INCLUDE_RTM_OPT 2802 if (UseRTMForStackLocks && use_rtm) { 2803 rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header, 2804 stack_rtm_counters, method_data, profile_rtm, 2805 cont, object_has_monitor); 2806 } 2807 #endif // INCLUDE_RTM_OPT 2808 2809 // Handle existing monitor. 2810 if ((EmitSync & 0x02) == 0) { 2811 // The object has an existing monitor iff (mark & monitor_value) != 0. 2812 andi_(temp, displaced_header, markOopDesc::monitor_value); 2813 bne(CCR0, object_has_monitor); 2814 } 2815 2816 // Set displaced_header to be (markOop of object | UNLOCK_VALUE). 2817 ori(displaced_header, displaced_header, markOopDesc::unlocked_value); 2818 2819 // Load Compare Value application register. 2820 2821 // Initialize the box. (Must happen before we update the object mark!) 2822 std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2823 2824 // Must fence, otherwise, preceding store(s) may float below cmpxchg. 2825 // Compare object markOop with mark and if equal exchange scratch1 with object markOop. 2826 cmpxchgd(/*flag=*/flag, 2827 /*current_value=*/current_header, 2828 /*compare_value=*/displaced_header, 2829 /*exchange_value=*/box, 2830 /*where=*/oop, 2831 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2832 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2833 noreg, 2834 &cas_failed, 2835 /*check without membar and ldarx first*/true); 2836 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2837 2838 // If the compare-and-exchange succeeded, then we found an unlocked 2839 // object and we have now locked it. 2840 b(cont); 2841 2842 bind(cas_failed); 2843 // We did not see an unlocked object so try the fast recursive case. 2844 2845 // Check if the owner is self by comparing the value in the markOop of object 2846 // (current_header) with the stack pointer. 
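// Sketch of the recursion check emitted below (illustrative only):
//   recursive iff ((mark - SP) & (~(page_size - 1) | lock_mask)) == 0,
// i.e. the displaced header is within one page above SP and has no lock bits set.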
2847 sub(current_header, current_header, R1_SP);
2848 load_const_optimized(temp, ~(os::vm_page_size()-1) | markOopDesc::lock_mask_in_place);
2849
2850 and_(R0/*==0?*/, current_header, temp);
2851 // If the condition is true we are done (cont), and hence we can store 0 as the
2852 // displaced header in the box, which indicates that it is a recursive lock.
2853 mcrf(flag, CCR0);
2854 std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2855
2856 // Handle existing monitor.
2857 if ((EmitSync & 0x02) == 0) {
2858 b(cont);
2859
2860 bind(object_has_monitor);
2861 // The object's monitor m is unlocked iff m->owner == NULL,
2862 // otherwise m->owner may contain a thread or a stack address.
2863
2864 #if INCLUDE_RTM_OPT
2865 // Use the same RTM locking code in 32- and 64-bit VM.
2866 if (use_rtm) {
2867 rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
2868 rtm_counters, method_data, profile_rtm, cont);
2869 } else {
2870 #endif // INCLUDE_RTM_OPT
2871
2872 // Try to CAS m->owner from NULL to current thread.
2873 addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value);
2874 cmpxchgd(/*flag=*/flag,
2875 /*current_value=*/current_header,
2876 /*compare_value=*/(intptr_t)0,
2877 /*exchange_value=*/R16_thread,
2878 /*where=*/temp,
2879 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2880 MacroAssembler::cmpxchgx_hint_acquire_lock());
2881
2882 // Store a non-null value into the box.
2883 std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2884
2885 # ifdef ASSERT
2886 bne(flag, cont);
2887 // We have acquired the monitor, check some invariants.
2888 addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
2889 // Invariant 1: _recursions should be 0.
2890 //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
2891 asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
2892 "monitor->_recursions should be 0", -1);
2893 # endif
2894
2895 #if INCLUDE_RTM_OPT
2896 } // use_rtm()
2897 #endif
2898 }
2899
2900 bind(cont);
2901 // flag == EQ indicates success
2902 // flag == NE indicates failure
2903 }
2904
2905 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2906 Register temp, Register displaced_header, Register current_header,
2907 bool try_bias, bool use_rtm) {
2908 assert_different_registers(oop, box, temp, displaced_header, current_header);
2909 assert(flag != CCR0, "bad condition register");
2910 Label cont;
2911 Label object_has_monitor;
2912
2913 // Always do locking in runtime.
2914 if (EmitSync & 0x01) {
2915 cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2916 return;
2917 }
2918
2919 if (try_bias) {
2920 biased_locking_exit(flag, oop, current_header, cont);
2921 }
2922
2923 #if INCLUDE_RTM_OPT
2924 if (UseRTMForStackLocks && use_rtm) {
2925 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2926 Label L_regular_unlock;
2927 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); // fetch markword
2928 andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2929 cmpwi(flag, R0, markOopDesc::unlocked_value); // bits = 001 unlocked
2930 bne(flag, L_regular_unlock); // else RegularLock
2931 tend_(); // otherwise end...
2932 b(cont); // ... and we're done
2933 bind(L_regular_unlock);
2934 }
2935 #endif
2936
2937 // Find the lock address and load the displaced header from the stack.
2938 ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2939
2940 // If the displaced header is 0, we have a recursive unlock.
2941 cmpdi(flag, displaced_header, 0);
2942 beq(flag, cont);
2943
2944 // Handle existing monitor.
2945 if ((EmitSync & 0x02) == 0) {
2946 // The object has an existing monitor iff (mark & monitor_value) != 0.
2947 RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
2948 ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2949 andi_(R0, current_header, markOopDesc::monitor_value);
2950 bne(CCR0, object_has_monitor);
2951 }
2952
2953 // Check if it is still a lightweight lock; this is true if we see
2954 // the stack address of the basicLock in the markOop of the object.
2955 // Cmpxchg sets flag to cmpd(current_header, box).
2956 cmpxchgd(/*flag=*/flag,
2957 /*current_value=*/current_header,
2958 /*compare_value=*/box,
2959 /*exchange_value=*/displaced_header,
2960 /*where=*/oop,
2961 MacroAssembler::MemBarRel,
2962 MacroAssembler::cmpxchgx_hint_release_lock(),
2963 noreg,
2964 &cont);
2965
2966 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2967
2968 // Handle existing monitor.
2969 if ((EmitSync & 0x02) == 0) {
2970 b(cont);
2971
2972 bind(object_has_monitor);
2973 addi(current_header, current_header, -markOopDesc::monitor_value); // monitor
2974 ld(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
2975
2976 // It's inflated.
2977 #if INCLUDE_RTM_OPT
2978 if (use_rtm) {
2979 Label L_regular_inflated_unlock;
2980 // Clean monitor_value bit to get valid pointer
2981 cmpdi(flag, temp, 0);
2982 bne(flag, L_regular_inflated_unlock);
2983 tend_();
2984 b(cont);
2985 bind(L_regular_inflated_unlock);
2986 }
2987 #endif
2988
2989 ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
2990 xorr(temp, R16_thread, temp); // Will be 0 if we are the owner.
2991 orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions.
2992 cmpdi(flag, temp, 0);
2993 bne(flag, cont);
2994
2995 ld(temp, ObjectMonitor::EntryList_offset_in_bytes(), current_header);
2996 ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
2997 orr(temp, temp, displaced_header); // Will be 0 if both are 0.
2998 cmpdi(flag, temp, 0);
2999 bne(flag, cont);
3000 release();
3001 std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
3002 }
3003
3004 bind(cont);
3005 // flag == EQ indicates success
3006 // flag == NE indicates failure
3007 }
3008
3009 // Write serialization page so VM thread can do a pseudo remote membar.
3010 // We use the current thread pointer to calculate a thread specific
3011 // offset to write to within the page. This minimizes bus traffic
3012 // due to cache line collision.
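// A sketch of the emitted sequence (illustrative only):
//   offset = (thread >> serialize_page_shift) & (page_size - sizeof(int));
//   release_store((int*)(serialize_page + offset), <R0>);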
3013 void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) { 3014 srdi(tmp2, thread, os::get_serialize_page_shift_count()); 3015 3016 int mask = os::vm_page_size() - sizeof(int); 3017 if (Assembler::is_simm(mask, 16)) { 3018 andi(tmp2, tmp2, mask); 3019 } else { 3020 lis(tmp1, (int)((signed short) (mask >> 16))); 3021 ori(tmp1, tmp1, mask & 0x0000ffff); 3022 andr(tmp2, tmp2, tmp1); 3023 } 3024 3025 load_const(tmp1, (long) os::get_memory_serialize_page()); 3026 release(); 3027 stwx(R0, tmp1, tmp2); 3028 } 3029 3030 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp_reg) { 3031 if (SafepointMechanism::uses_thread_local_poll()) { 3032 ld(temp_reg, in_bytes(Thread::polling_page_offset()), R16_thread); 3033 // Armed page has poll_bit set. 3034 andi_(temp_reg, temp_reg, SafepointMechanism::poll_bit()); 3035 } else { 3036 lwz(temp_reg, (RegisterOrConstant)(intptr_t)SafepointSynchronize::address_of_state()); 3037 cmpwi(CCR0, temp_reg, SafepointSynchronize::_not_synchronized); 3038 } 3039 bne(CCR0, slow_path); 3040 } 3041 3042 3043 // GC barrier helper macros 3044 3045 // Write the card table byte if needed. 3046 void MacroAssembler::card_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp) { 3047 CardTableBarrierSet* bs = 3048 barrier_set_cast<CardTableBarrierSet>(Universe::heap()->barrier_set()); 3049 assert(bs->kind() == BarrierSet::CardTableBarrierSet, "wrong barrier"); 3050 CardTable* ct = bs->card_table(); 3051 #ifdef ASSERT 3052 cmpdi(CCR0, Rnew_val, 0); 3053 asm_assert_ne("null oop not allowed", 0x321); 3054 #endif 3055 card_table_write(ct->byte_map_base(), Rtmp, Rstore_addr); 3056 } 3057 3058 // Write the card table byte. 3059 void MacroAssembler::card_table_write(jbyte* byte_map_base, Register Rtmp, Register Robj) { 3060 assert_different_registers(Robj, Rtmp, R0); 3061 load_const_optimized(Rtmp, (address)byte_map_base, R0); 3062 srdi(Robj, Robj, CardTable::card_shift); 3063 li(R0, 0); // dirty 3064 if (UseConcMarkSweepGC) membar(Assembler::StoreStore); 3065 stbx(R0, Rtmp, Robj); 3066 } 3067 3068 // Kills R31 if value is a volatile register. 3069 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, bool needs_frame) { 3070 Label done; 3071 cmpdi(CCR0, value, 0); 3072 beq(CCR0, done); // Use NULL as-is. 3073 3074 clrrdi(tmp1, value, JNIHandles::weak_tag_size); 3075 #if INCLUDE_ALL_GCS 3076 if (UseG1GC) { andi_(tmp2, value, JNIHandles::weak_tag_mask); } 3077 #endif 3078 ld(value, 0, tmp1); // Resolve (untagged) jobject. 3079 3080 #if INCLUDE_ALL_GCS 3081 if (UseG1GC) { 3082 Label not_weak; 3083 beq(CCR0, not_weak); // Test for jweak tag. 3084 verify_oop(value); 3085 g1_write_barrier_pre(noreg, // obj 3086 noreg, // offset 3087 value, // pre_val 3088 tmp1, tmp2, needs_frame); 3089 bind(not_weak); 3090 } 3091 #endif // INCLUDE_ALL_GCS 3092 verify_oop(value); 3093 bind(done); 3094 } 3095 3096 #if INCLUDE_ALL_GCS 3097 // General G1 pre-barrier generator. 3098 // Goal: record the previous value if it is not null. 3099 void MacroAssembler::g1_write_barrier_pre(Register Robj, RegisterOrConstant offset, Register Rpre_val, 3100 Register Rtmp1, Register Rtmp2, bool needs_frame) { 3101 Label runtime, filtered; 3102 3103 // Is marking active? 
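// (The pre-barrier filters in order: is marking active, is the previous value
// null; then it tries the thread-local SATB buffer and only calls the runtime
// when the buffer index is 0, i.e. the buffer is full.)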
3104 if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) {
3105 lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
3106 } else {
3107 guarantee(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption");
3108 lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
3109 }
3110 cmpdi(CCR0, Rtmp1, 0);
3111 beq(CCR0, filtered);
3112
3113 // Do we need to load the previous value?
3114 if (Robj != noreg) {
3115 // Load the previous value...
3116 if (UseCompressedOops) {
3117 lwz(Rpre_val, offset, Robj);
3118 } else {
3119 ld(Rpre_val, offset, Robj);
3120 }
3121 // Previous value has been loaded into Rpre_val.
3122 }
3123 assert(Rpre_val != noreg, "must have a real register");
3124
3125 // Is the previous value null?
3126 cmpdi(CCR0, Rpre_val, 0);
3127 beq(CCR0, filtered);
3128
3129 if (Robj != noreg && UseCompressedOops) {
3130 decode_heap_oop_not_null(Rpre_val);
3131 }
3132
3133 // OK, it's not filtered, so we'll need to enqueue the previous value.
3134 // Try to record it in the thread-local SATB buffer below; if the
3135 // buffer is full (index == 0), fall back to a runtime call
3136 // (SharedRuntime::g1_wb_pre).
3137
3138 // Can we store original value in the thread's buffer?
3139 // Is index == 0?
3140 // (The index field is typed as size_t.)
3141 const Register Rbuffer = Rtmp1, Rindex = Rtmp2;
3142
3143 ld(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread);
3144 cmpdi(CCR0, Rindex, 0);
3145 beq(CCR0, runtime); // If index == 0, goto runtime.
3146 ld(Rbuffer, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_buf()), R16_thread);
3147
3148 addi(Rindex, Rindex, -wordSize); // Decrement index.
3149 std(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread);
3150
3151 // Record the previous value.
3152 stdx(Rpre_val, Rbuffer, Rindex);
3153 b(filtered);
3154
3155 bind(runtime);
3156
3157 // May need to preserve LR. Also needed if current frame is not compatible with C calling convention.
3158 if (needs_frame) {
3159 save_LR_CR(Rtmp1);
3160 push_frame_reg_args(0, Rtmp2);
3161 }
3162
3163 if (Rpre_val->is_volatile() && Robj == noreg) mr(R31, Rpre_val); // Save pre_val across C call if it was preloaded.
3164 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), Rpre_val, R16_thread);
3165 if (Rpre_val->is_volatile() && Robj == noreg) mr(Rpre_val, R31); // Restore.
3166
3167 if (needs_frame) {
3168 pop_frame();
3169 restore_LR_CR(Rtmp1);
3170 }
3171
3172 bind(filtered);
3173 }
3174
3175 // General G1 post-barrier generator.
3176 // Store cross-region card.
3177 void MacroAssembler::g1_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp1, Register Rtmp2, Register Rtmp3, Label *filtered_ext) {
3178 Label runtime, filtered_int;
3179 Label& filtered = (filtered_ext != NULL) ? *filtered_ext : filtered_int;
3180 assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2);
3181
3182 G1BarrierSet* bs =
3183 barrier_set_cast<G1BarrierSet>(Universe::heap()->barrier_set());
3184 CardTable* ct = bs->card_table();
3185
3186 // Does the store cross heap regions?
3187 if (G1RSBarrierRegionFilter) {
3188 xorr(Rtmp1, Rstore_addr, Rnew_val);
3189 srdi_(Rtmp1, Rtmp1, HeapRegion::LogOfHRGrainBytes);
3190 beq(CCR0, filtered);
3191 }
3192
3193 // Crosses regions, storing NULL?
3194 #ifdef ASSERT
3195 cmpdi(CCR0, Rnew_val, 0);
3196 asm_assert_ne("null oop not allowed (G1)", 0x322); // Checked by caller on PPC64, so the following branch is obsolete:
3197 //beq(CCR0, filtered);
3198 #endif
3199
3200 // Storing region crossing non-NULL, is card already dirty?
3201 assert(sizeof(*ct->byte_map_base()) == sizeof(jbyte), "adjust this code");
3202 const Register Rcard_addr = Rtmp1;
3203 Register Rbase = Rtmp2;
3204 load_const_optimized(Rbase, (address)ct->byte_map_base(), /*temp*/ Rtmp3);
3205
3206 srdi(Rcard_addr, Rstore_addr, CardTable::card_shift);
3207
3208 // Get the address of the card.
3209 lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);
3210 cmpwi(CCR0, Rtmp3, (int)G1CardTable::g1_young_card_val());
3211 beq(CCR0, filtered);
3212
3213 membar(Assembler::StoreLoad);
3214 lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr); // Reload after membar.
3215 cmpwi(CCR0, Rtmp3 /* card value */, CardTable::dirty_card_val());
3216 beq(CCR0, filtered);
3217
3218 // Storing a region crossing, non-NULL oop, card is clean.
3219 // Dirty card and log.
3220 li(Rtmp3, CardTable::dirty_card_val());
3221 //release(); // G1: oops are allowed to get visible after dirty marking.
3222 stbx(Rtmp3, Rbase, Rcard_addr);
3223
3224 add(Rcard_addr, Rbase, Rcard_addr); // This is the address which needs to get enqueued.
3225 Rbase = noreg; // End of lifetime.
3226
3227 const Register Rqueue_index = Rtmp2,
3228 Rqueue_buf = Rtmp3;
3229 ld(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);
3230 cmpdi(CCR0, Rqueue_index, 0);
3231 beq(CCR0, runtime); // If index == 0, jump to runtime.
3232 ld(Rqueue_buf, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_buf()), R16_thread);
3233
3234 addi(Rqueue_index, Rqueue_index, -wordSize); // Decrement index.
3235 std(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);
3236
3237 stdx(Rcard_addr, Rqueue_buf, Rqueue_index); // Store card.
3238 b(filtered);
3239
3240 bind(runtime);
3241
3242 // Save the live input values.
3243 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), Rcard_addr, R16_thread);
3244
3245 bind(filtered_int);
3246 }
3247 #endif // INCLUDE_ALL_GCS
3248
3249 // Values for last_Java_pc and last_Java_sp must comply with the rules
3250 // in frame_ppc.hpp.
3251 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
3252 // Always set last_Java_pc and flags first because once last_Java_sp
3253 // is visible, has_last_Java_frame is true and users will look at the
3254 // rest of the fields. (Note: flags should always be zero before we
3255 // get here, so they don't need to be set.)
3256
3257 // Verify that last_Java_pc was zeroed on return to Java.
3258 asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
3259 "last_Java_pc not zeroed before leaving Java", 0x200);
3260
3261 // When returning from calling out of Java mode, the frame anchor's
3262 // last_Java_pc will always be set to NULL. It is set here so that,
3263 // if we are doing a call to native (not VM) code, we capture the
3264 // known pc and don't have to rely on the native call having a
3265 // standard frame linkage where we can find the pc.
3266 if (last_Java_pc != noreg)
3267 std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3268
3269 // Set last_Java_sp last.
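// In effect (a sketch of the required store ordering):
//   thread->_anchor._last_Java_pc = pc;  // first
//   thread->_anchor._last_Java_sp = sp;  // last; this store publishes the frame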
3270 std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 3271 } 3272 3273 void MacroAssembler::reset_last_Java_frame(void) { 3274 asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()), 3275 R16_thread, "SP was not set, still zero", 0x202); 3276 3277 BLOCK_COMMENT("reset_last_Java_frame {"); 3278 li(R0, 0); 3279 3280 // _last_Java_sp = 0 3281 std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 3282 3283 // _last_Java_pc = 0 3284 std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 3285 BLOCK_COMMENT("} reset_last_Java_frame"); 3286 } 3287 3288 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) { 3289 assert_different_registers(sp, tmp1); 3290 3291 // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via 3292 // TOP_IJAVA_FRAME_ABI. 3293 // FIXME: assert that we really have a TOP_IJAVA_FRAME here! 3294 address entry = pc(); 3295 load_const_optimized(tmp1, entry); 3296 3297 set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1); 3298 } 3299 3300 void MacroAssembler::get_vm_result(Register oop_result) { 3301 // Read: 3302 // R16_thread 3303 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3304 // 3305 // Updated: 3306 // oop_result 3307 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3308 3309 verify_thread(); 3310 3311 ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3312 li(R0, 0); 3313 std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3314 3315 verify_oop(oop_result); 3316 } 3317 3318 void MacroAssembler::get_vm_result_2(Register metadata_result) { 3319 // Read: 3320 // R16_thread 3321 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3322 // 3323 // Updated: 3324 // metadata_result 3325 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3326 3327 ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3328 li(R0, 0); 3329 std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3330 } 3331 3332 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) { 3333 Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided. 3334 if (Universe::narrow_klass_base() != 0) { 3335 // Use dst as temp if it is free. 
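// Conceptually (sketch): compressed = (klass - narrow_klass_base()) >> narrow_klass_shift().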
3336 sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0); 3337 current = dst; 3338 } 3339 if (Universe::narrow_klass_shift() != 0) { 3340 srdi(dst, current, Universe::narrow_klass_shift()); 3341 current = dst; 3342 } 3343 return current; 3344 } 3345 3346 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) { 3347 if (UseCompressedClassPointers) { 3348 Register compressedKlass = encode_klass_not_null(ck, klass); 3349 stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop); 3350 } else { 3351 std(klass, oopDesc::klass_offset_in_bytes(), dst_oop); 3352 } 3353 } 3354 3355 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) { 3356 if (UseCompressedClassPointers) { 3357 if (val == noreg) { 3358 val = R0; 3359 li(val, 0); 3360 } 3361 stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed 3362 } 3363 } 3364 3365 int MacroAssembler::instr_size_for_decode_klass_not_null() { 3366 if (!UseCompressedClassPointers) return 0; 3367 int num_instrs = 1; // shift or move 3368 if (Universe::narrow_klass_base() != 0) num_instrs = 7; // shift + load const + add 3369 return num_instrs * BytesPerInstWord; 3370 } 3371 3372 void MacroAssembler::decode_klass_not_null(Register dst, Register src) { 3373 assert(dst != R0, "Dst reg may not be R0, as R0 is used here."); 3374 if (src == noreg) src = dst; 3375 Register shifted_src = src; 3376 if (Universe::narrow_klass_shift() != 0 || 3377 Universe::narrow_klass_base() == 0 && src != dst) { // Move required. 3378 shifted_src = dst; 3379 sldi(shifted_src, src, Universe::narrow_klass_shift()); 3380 } 3381 if (Universe::narrow_klass_base() != 0) { 3382 add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0); 3383 } 3384 } 3385 3386 void MacroAssembler::load_klass(Register dst, Register src) { 3387 if (UseCompressedClassPointers) { 3388 lwz(dst, oopDesc::klass_offset_in_bytes(), src); 3389 // Attention: no null check here! 3390 decode_klass_not_null(dst, dst); 3391 } else { 3392 ld(dst, oopDesc::klass_offset_in_bytes(), src); 3393 } 3394 } 3395 3396 // ((OopHandle)result).resolve(); 3397 void MacroAssembler::resolve_oop_handle(Register result) { 3398 // OopHandle::resolve is an indirection. 3399 ld(result, 0, result); 3400 } 3401 3402 void MacroAssembler::load_mirror_from_const_method(Register mirror, Register const_method) { 3403 ld(mirror, in_bytes(ConstMethod::constants_offset()), const_method); 3404 ld(mirror, ConstantPool::pool_holder_offset_in_bytes(), mirror); 3405 ld(mirror, in_bytes(Klass::java_mirror_offset()), mirror); 3406 resolve_oop_handle(mirror); 3407 } 3408 3409 // Clear Array 3410 // For very short arrays. tmp == R0 is allowed. 3411 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) { 3412 if (cnt_dwords > 0) { li(tmp, 0); } 3413 for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); } 3414 } 3415 3416 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed. 
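// Strategy, in effect (an illustrative sketch): clear two dwords per loop
// iteration and handle an odd trailing dword separately:
//
//   for (long i = 0; i < cnt_dwords / 2; i++) { p[0] = 0; p[1] = 0; p += 2; }
//   if (cnt_dwords & 1) *p = 0;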
3417 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) { 3418 if (cnt_dwords < 8) { 3419 clear_memory_unrolled(base_ptr, cnt_dwords, tmp); 3420 return; 3421 } 3422 3423 Label loop; 3424 const long loopcnt = cnt_dwords >> 1, 3425 remainder = cnt_dwords & 1; 3426 3427 li(tmp, loopcnt); 3428 mtctr(tmp); 3429 li(tmp, 0); 3430 bind(loop); 3431 std(tmp, 0, base_ptr); 3432 std(tmp, 8, base_ptr); 3433 addi(base_ptr, base_ptr, 16); 3434 bdnz(loop); 3435 if (remainder) { std(tmp, 0, base_ptr); } 3436 } 3437 3438 // Kills both input registers. tmp == R0 is allowed. 3439 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) { 3440 // Procedure for large arrays (uses data cache block zero instruction). 3441 Label startloop, fast, fastloop, small_rest, restloop, done; 3442 const int cl_size = VM_Version::L1_data_cache_line_size(), 3443 cl_dwords = cl_size >> 3, 3444 cl_dw_addr_bits = exact_log2(cl_dwords), 3445 dcbz_min = 1, // Min count of dcbz executions, needs to be >0. 3446 min_cnt = ((dcbz_min + 1) << cl_dw_addr_bits) - 1; 3447 3448 if (const_cnt >= 0) { 3449 // Constant case. 3450 if (const_cnt < min_cnt) { 3451 clear_memory_constlen(base_ptr, const_cnt, tmp); 3452 return; 3453 } 3454 load_const_optimized(cnt_dwords, const_cnt, tmp); 3455 } else { 3456 // cnt_dwords already loaded in register. Need to check size. 3457 cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included). 3458 blt(CCR1, small_rest); 3459 } 3460 rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line. 3461 beq(CCR0, fast); // Already 128byte aligned. 3462 3463 subfic(tmp, tmp, cl_dwords); 3464 mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords). 3465 subf(cnt_dwords, tmp, cnt_dwords); // rest. 3466 li(tmp, 0); 3467 3468 bind(startloop); // Clear at the beginning to reach 128byte boundary. 3469 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3470 addi(base_ptr, base_ptr, 8); 3471 bdnz(startloop); 3472 3473 bind(fast); // Clear 128byte blocks. 3474 srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0). 3475 andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords. 3476 mtctr(tmp); // Load counter. 3477 3478 bind(fastloop); 3479 dcbz(base_ptr); // Clear 128byte aligned block. 3480 addi(base_ptr, base_ptr, cl_size); 3481 bdnz(fastloop); 3482 3483 bind(small_rest); 3484 cmpdi(CCR0, cnt_dwords, 0); // size 0? 3485 beq(CCR0, done); // rest == 0 3486 li(tmp, 0); 3487 mtctr(cnt_dwords); // Load counter. 3488 3489 bind(restloop); // Clear rest. 3490 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3491 addi(base_ptr, base_ptr, 8); 3492 bdnz(restloop); 3493 3494 bind(done); 3495 } 3496 3497 /////////////////////////////////////////// String intrinsics //////////////////////////////////////////// 3498 3499 #ifdef COMPILER2 3500 // Intrinsics for CompactStrings 3501 3502 // Compress char[] to byte[] by compressing 16 bytes at once. 
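// What the fast path below computes, in effect (an illustrative sketch):
//
//   for (int i = 0; i < cnt / 8; i++) {   // 8 chars = 16 bytes per iteration
//     for (int j = 0; j < 8; j++) {
//       jchar c = src[8 * i + j];
//       if (c > 0xFF) goto Lfailure;      // not latin1
//       dst[8 * i + j] = (jbyte)c;
//     }
//   }
//   // the remaining cnt % 8 chars fall through to the slow path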
3503 void MacroAssembler::string_compress_16(Register src, Register dst, Register cnt, 3504 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, 3505 Label& Lfailure) { 3506 3507 const Register tmp0 = R0; 3508 assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5); 3509 Label Lloop, Lslow; 3510 3511 // Check if cnt >= 8 (= 16 bytes) 3512 lis(tmp1, 0xFF); // tmp1 = 0x00FF00FF00FF00FF 3513 srwi_(tmp2, cnt, 3); 3514 beq(CCR0, Lslow); 3515 ori(tmp1, tmp1, 0xFF); 3516 rldimi(tmp1, tmp1, 32, 0); 3517 mtctr(tmp2); 3518 3519 // 2x unrolled loop 3520 bind(Lloop); 3521 ld(tmp2, 0, src); // _0_1_2_3 (Big Endian) 3522 ld(tmp4, 8, src); // _4_5_6_7 3523 3524 orr(tmp0, tmp2, tmp4); 3525 rldicl(tmp3, tmp2, 6*8, 64-24); // _____1_2 3526 rldimi(tmp2, tmp2, 2*8, 2*8); // _0_2_3_3 3527 rldicl(tmp5, tmp4, 6*8, 64-24); // _____5_6 3528 rldimi(tmp4, tmp4, 2*8, 2*8); // _4_6_7_7 3529 3530 andc_(tmp0, tmp0, tmp1); 3531 bne(CCR0, Lfailure); // Not latin1. 3532 addi(src, src, 16); 3533 3534 rlwimi(tmp3, tmp2, 0*8, 24, 31);// _____1_3 3535 srdi(tmp2, tmp2, 3*8); // ____0_2_ 3536 rlwimi(tmp5, tmp4, 0*8, 24, 31);// _____5_7 3537 srdi(tmp4, tmp4, 3*8); // ____4_6_ 3538 3539 orr(tmp2, tmp2, tmp3); // ____0123 3540 orr(tmp4, tmp4, tmp5); // ____4567 3541 3542 stw(tmp2, 0, dst); 3543 stw(tmp4, 4, dst); 3544 addi(dst, dst, 8); 3545 bdnz(Lloop); 3546 3547 bind(Lslow); // Fallback to slow version 3548 } 3549 3550 // Compress char[] to byte[]. cnt must be positive int. 3551 void MacroAssembler::string_compress(Register src, Register dst, Register cnt, Register tmp, Label& Lfailure) { 3552 Label Lloop; 3553 mtctr(cnt); 3554 3555 bind(Lloop); 3556 lhz(tmp, 0, src); 3557 cmplwi(CCR0, tmp, 0xff); 3558 bgt(CCR0, Lfailure); // Not latin1. 3559 addi(src, src, 2); 3560 stb(tmp, 0, dst); 3561 addi(dst, dst, 1); 3562 bdnz(Lloop); 3563 } 3564 3565 // Inflate byte[] to char[] by inflating 16 bytes at once. 3566 void MacroAssembler::string_inflate_16(Register src, Register dst, Register cnt, 3567 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) { 3568 const Register tmp0 = R0; 3569 assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5); 3570 Label Lloop, Lslow; 3571 3572 // Check if cnt >= 8 3573 srwi_(tmp2, cnt, 3); 3574 beq(CCR0, Lslow); 3575 lis(tmp1, 0xFF); // tmp1 = 0x00FF00FF 3576 ori(tmp1, tmp1, 0xFF); 3577 mtctr(tmp2); 3578 3579 // 2x unrolled loop 3580 bind(Lloop); 3581 lwz(tmp2, 0, src); // ____0123 (Big Endian) 3582 lwz(tmp4, 4, src); // ____4567 3583 addi(src, src, 8); 3584 3585 rldicl(tmp3, tmp2, 7*8, 64-8); // _______2 3586 rlwimi(tmp2, tmp2, 3*8, 16, 23);// ____0113 3587 rldicl(tmp5, tmp4, 7*8, 64-8); // _______6 3588 rlwimi(tmp4, tmp4, 3*8, 16, 23);// ____4557 3589 3590 andc(tmp0, tmp2, tmp1); // ____0_1_ 3591 rlwimi(tmp2, tmp3, 2*8, 0, 23); // _____2_3 3592 andc(tmp3, tmp4, tmp1); // ____4_5_ 3593 rlwimi(tmp4, tmp5, 2*8, 0, 23); // _____6_7 3594 3595 rldimi(tmp2, tmp0, 3*8, 0*8); // _0_1_2_3 3596 rldimi(tmp4, tmp3, 3*8, 0*8); // _4_5_6_7 3597 3598 std(tmp2, 0, dst); 3599 std(tmp4, 8, dst); 3600 addi(dst, dst, 16); 3601 bdnz(Lloop); 3602 3603 bind(Lslow); // Fallback to slow version 3604 } 3605 3606 // Inflate byte[] to char[]. cnt must be positive int. 
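// Equivalent semantics (sketch): dst[i] = (jchar)(src[i] & 0xFF) for i in [0, cnt).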
3607 void MacroAssembler::string_inflate(Register src, Register dst, Register cnt, Register tmp) {
3608 Label Lloop;
3609 mtctr(cnt);
3610
3611 bind(Lloop);
3612 lbz(tmp, 0, src);
3613 addi(src, src, 1);
3614 sth(tmp, 0, dst);
3615 addi(dst, dst, 2);
3616 bdnz(Lloop);
3617 }
3618
3619 void MacroAssembler::string_compare(Register str1, Register str2,
3620 Register cnt1, Register cnt2,
3621 Register tmp1, Register result, int ae) {
3622 const Register tmp0 = R0,
3623 diff = tmp1;
3624
3625 assert_different_registers(str1, str2, cnt1, cnt2, tmp0, tmp1, result);
3626 Label Ldone, Lslow, Lloop, Lreturn_diff;
3627
3628 // Note: Making use of the fact that compareTo(a, b) == -compareTo(b, a)
3629 // we interchange str1 and str2 in the UL case and negate the result.
3630 // This way, str1 is always latin1 encoded, except for the UU case.
3631 // In addition, we need to zero-extend the counts (sign extension would be equivalent here, since the sign bit is 0).
3632
3633 if (ae == StrIntrinsicNode::UU) {
3634 srwi(cnt1, cnt1, 1);
3635 } else {
3636 clrldi(cnt1, cnt1, 32);
3637 }
3638
3639 if (ae != StrIntrinsicNode::LL) {
3640 srwi(cnt2, cnt2, 1);
3641 } else {
3642 clrldi(cnt2, cnt2, 32);
3643 }
3644
3645 // See if the lengths are different, and calculate min in cnt1.
3646 // Save diff in case we need it for a tie-breaker.
3647 subf_(diff, cnt2, cnt1); // diff = cnt1 - cnt2
3648 // if (diff > 0) { cnt1 = cnt2; }
3649 if (VM_Version::has_isel()) {
3650 isel(cnt1, CCR0, Assembler::greater, /*invert*/ false, cnt2);
3651 } else {
3652 Label Lskip;
3653 blt(CCR0, Lskip);
3654 mr(cnt1, cnt2);
3655 bind(Lskip);
3656 }
3657
3658 // Rename registers
3659 Register chr1 = result;
3660 Register chr2 = tmp0;
3661
3662 // Compare multiple characters in fast loop (only implemented for same encoding).
3663 int stride1 = 8, stride2 = 8;
3664 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3665 int log2_chars_per_iter = (ae == StrIntrinsicNode::LL) ? 3 : 2;
3666 Label Lfastloop, Lskipfast;
3667
3668 srwi_(tmp0, cnt1, log2_chars_per_iter);
3669 beq(CCR0, Lskipfast);
3670 rldicl(cnt2, cnt1, 0, 64 - log2_chars_per_iter); // Remaining characters.
3671 li(cnt1, 1 << log2_chars_per_iter); // Initialize for failure case: Rescan characters from current iteration.
3672 mtctr(tmp0);
3673
3674 bind(Lfastloop);
3675 ld(chr1, 0, str1);
3676 ld(chr2, 0, str2);
3677 cmpd(CCR0, chr1, chr2);
3678 bne(CCR0, Lslow);
3679 addi(str1, str1, stride1);
3680 addi(str2, str2, stride2);
3681 bdnz(Lfastloop);
3682 mr(cnt1, cnt2); // Remaining characters.
3683 bind(Lskipfast);
3684 }
3685
3686 // Loop which searches the first difference character by character.
3687 cmpwi(CCR0, cnt1, 0);
3688 beq(CCR0, Lreturn_diff);
3689 bind(Lslow);
3690 mtctr(cnt1);
3691
3692 switch (ae) {
3693 case StrIntrinsicNode::LL: stride1 = 1; stride2 = 1; break;
3694 case StrIntrinsicNode::UL: // fallthru (see comment above)
3695 case StrIntrinsicNode::LU: stride1 = 1; stride2 = 2; break;
3696 case StrIntrinsicNode::UU: stride1 = 2; stride2 = 2; break;
3697 default: ShouldNotReachHere(); break;
3698 }
3699
3700 bind(Lloop);
3701 if (stride1 == 1) { lbz(chr1, 0, str1); } else { lhz(chr1, 0, str1); }
3702 if (stride2 == 1) { lbz(chr2, 0, str2); } else { lhz(chr2, 0, str2); }
3703 subf_(result, chr2, chr1); // result = chr1 - chr2
3704 bne(CCR0, Ldone);
3705 addi(str1, str1, stride1);
3706 addi(str2, str2, stride2);
3707 bdnz(Lloop);
3708
3709 // If strings are equal up to min length, return the length difference.
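// (diff still holds cnt1 - cnt2 from the length comparison above.)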
3710 bind(Lreturn_diff); 3711 mr(result, diff); 3712 3713 // Otherwise, return the difference between the first mismatched chars. 3714 bind(Ldone); 3715 if (ae == StrIntrinsicNode::UL) { 3716 neg(result, result); // Negate result (see note above). 3717 } 3718 } 3719 3720 void MacroAssembler::array_equals(bool is_array_equ, Register ary1, Register ary2, 3721 Register limit, Register tmp1, Register result, bool is_byte) { 3722 const Register tmp0 = R0; 3723 assert_different_registers(ary1, ary2, limit, tmp0, tmp1, result); 3724 Label Ldone, Lskiploop, Lloop, Lfastloop, Lskipfast; 3725 bool limit_needs_shift = false; 3726 3727 if (is_array_equ) { 3728 const int length_offset = arrayOopDesc::length_offset_in_bytes(); 3729 const int base_offset = arrayOopDesc::base_offset_in_bytes(is_byte ? T_BYTE : T_CHAR); 3730 3731 // Return true if the same array. 3732 cmpd(CCR0, ary1, ary2); 3733 beq(CCR0, Lskiploop); 3734 3735 // Return false if one of them is NULL. 3736 cmpdi(CCR0, ary1, 0); 3737 cmpdi(CCR1, ary2, 0); 3738 li(result, 0); 3739 cror(CCR0, Assembler::equal, CCR1, Assembler::equal); 3740 beq(CCR0, Ldone); 3741 3742 // Load the lengths of arrays. 3743 lwz(limit, length_offset, ary1); 3744 lwz(tmp0, length_offset, ary2); 3745 3746 // Return false if the two arrays are not equal length. 3747 cmpw(CCR0, limit, tmp0); 3748 bne(CCR0, Ldone); 3749 3750 // Load array addresses. 3751 addi(ary1, ary1, base_offset); 3752 addi(ary2, ary2, base_offset); 3753 } else { 3754 limit_needs_shift = !is_byte; 3755 li(result, 0); // Assume not equal. 3756 } 3757 3758 // Rename registers 3759 Register chr1 = tmp0; 3760 Register chr2 = tmp1; 3761 3762 // Compare 8 bytes per iteration in fast loop. 3763 const int log2_chars_per_iter = is_byte ? 3 : 2; 3764 3765 srwi_(tmp0, limit, log2_chars_per_iter + (limit_needs_shift ? 1 : 0)); 3766 beq(CCR0, Lskipfast); 3767 mtctr(tmp0); 3768 3769 bind(Lfastloop); 3770 ld(chr1, 0, ary1); 3771 ld(chr2, 0, ary2); 3772 addi(ary1, ary1, 8); 3773 addi(ary2, ary2, 8); 3774 cmpd(CCR0, chr1, chr2); 3775 bne(CCR0, Ldone); 3776 bdnz(Lfastloop); 3777 3778 bind(Lskipfast); 3779 rldicl_(limit, limit, limit_needs_shift ? 64 - 1 : 0, 64 - log2_chars_per_iter); // Remaining characters. 3780 beq(CCR0, Lskiploop); 3781 mtctr(limit); 3782 3783 // Character by character. 3784 bind(Lloop); 3785 if (is_byte) { 3786 lbz(chr1, 0, ary1); 3787 lbz(chr2, 0, ary2); 3788 addi(ary1, ary1, 1); 3789 addi(ary2, ary2, 1); 3790 } else { 3791 lhz(chr1, 0, ary1); 3792 lhz(chr2, 0, ary2); 3793 addi(ary1, ary1, 2); 3794 addi(ary2, ary2, 2); 3795 } 3796 cmpw(CCR0, chr1, chr2); 3797 bne(CCR0, Ldone); 3798 bdnz(Lloop); 3799 3800 bind(Lskiploop); 3801 li(result, 1); // All characters are equal. 3802 bind(Ldone); 3803 } 3804 3805 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt, 3806 Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval, 3807 Register tmp1, Register tmp2, Register tmp3, Register tmp4, int ae) { 3808 3809 // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite! 3810 Label L_TooShort, L_Found, L_NotFound, L_End; 3811 Register last_addr = haycnt, // Kill haycnt at the beginning. 3812 addr = tmp1, 3813 n_start = tmp2, 3814 ch1 = tmp3, 3815 ch2 = R0; 3816 3817 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 3818 const int h_csize = (ae == StrIntrinsicNode::LL) ? 1 : 2; 3819 const int n_csize = (ae == StrIntrinsicNode::UU) ? 
2 : 1; 3820 3821 // ************************************************************************************************** 3822 // Prepare for main loop: optimized for needle count >=2, bail out otherwise. 3823 // ************************************************************************************************** 3824 3825 // Compute last haystack addr to use if no match gets found. 3826 clrldi(haycnt, haycnt, 32); // Ensure positive int is valid as 64 bit value. 3827 addi(addr, haystack, -h_csize); // Accesses use pre-increment. 3828 if (needlecntval == 0) { // variable needlecnt 3829 cmpwi(CCR6, needlecnt, 2); 3830 clrldi(needlecnt, needlecnt, 32); // Ensure positive int is valid as 64 bit value. 3831 blt(CCR6, L_TooShort); // Variable needlecnt: handle short needle separately. 3832 } 3833 3834 if (n_csize == 2) { lwz(n_start, 0, needle); } else { lhz(n_start, 0, needle); } // Load first 2 characters of needle. 3835 3836 if (needlecntval == 0) { // variable needlecnt 3837 subf(ch1, needlecnt, haycnt); // Last character index to compare is haycnt-needlecnt. 3838 addi(needlecnt, needlecnt, -2); // Rest of needle. 3839 } else { // constant needlecnt 3840 guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately"); 3841 assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate"); 3842 addi(ch1, haycnt, -needlecntval); // Last character index to compare is haycnt-needlecnt. 3843 if (needlecntval > 3) { li(needlecnt, needlecntval - 2); } // Rest of needle. 3844 } 3845 3846 if (h_csize == 2) { slwi(ch1, ch1, 1); } // Scale to number of bytes. 3847 3848 if (ae ==StrIntrinsicNode::UL) { 3849 srwi(tmp4, n_start, 1*8); // ___0 3850 rlwimi(n_start, tmp4, 2*8, 0, 23); // _0_1 3851 } 3852 3853 add(last_addr, haystack, ch1); // Point to last address to compare (haystack+2*(haycnt-needlecnt)). 3854 3855 // Main Loop (now we have at least 2 characters). 3856 Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2; 3857 bind(L_OuterLoop); // Search for 1st 2 characters. 3858 Register addr_diff = tmp4; 3859 subf(addr_diff, addr, last_addr); // Difference between already checked address and last address to check. 3860 addi(addr, addr, h_csize); // This is the new address we want to use for comparing. 3861 srdi_(ch2, addr_diff, h_csize); 3862 beq(CCR0, L_FinalCheck); // 2 characters left? 3863 mtctr(ch2); // num of characters / 2 3864 bind(L_InnerLoop); // Main work horse (2x unrolled search loop) 3865 if (h_csize == 2) { // Load 2 characters of haystack (ignore alignment). 3866 lwz(ch1, 0, addr); 3867 lwz(ch2, 2, addr); 3868 } else { 3869 lhz(ch1, 0, addr); 3870 lhz(ch2, 1, addr); 3871 } 3872 cmpw(CCR0, ch1, n_start); // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop). 3873 cmpw(CCR1, ch2, n_start); 3874 beq(CCR0, L_Comp1); // Did we find the needle start? 3875 beq(CCR1, L_Comp2); 3876 addi(addr, addr, 2 * h_csize); 3877 bdnz(L_InnerLoop); 3878 bind(L_FinalCheck); 3879 andi_(addr_diff, addr_diff, h_csize); // Remaining characters not covered by InnerLoop: (num of characters) & 1. 3880 beq(CCR0, L_NotFound); 3881 if (h_csize == 2) { lwz(ch1, 0, addr); } else { lhz(ch1, 0, addr); } // One position left at which we have to compare. 
3882 cmpw(CCR1, ch1, n_start); 3883 beq(CCR1, L_Comp1); 3884 bind(L_NotFound); 3885 li(result, -1); // not found 3886 b(L_End); 3887 3888 // ************************************************************************************************** 3889 // Special Case: unfortunately, the variable needle case can be called with needlecnt<2 3890 // ************************************************************************************************** 3891 if (needlecntval == 0) { // We have to handle these cases separately. 3892 Label L_OneCharLoop; 3893 bind(L_TooShort); 3894 mtctr(haycnt); 3895 if (n_csize == 2) { lhz(n_start, 0, needle); } else { lbz(n_start, 0, needle); } // First character of needle 3896 bind(L_OneCharLoop); 3897 if (h_csize == 2) { lhzu(ch1, 2, addr); } else { lbzu(ch1, 1, addr); } 3898 cmpw(CCR1, ch1, n_start); 3899 beq(CCR1, L_Found); // Did we find the one character needle? 3900 bdnz(L_OneCharLoop); 3901 li(result, -1); // Not found. 3902 b(L_End); 3903 } 3904 3905 // ************************************************************************************************** 3906 // Regular Case Part II: compare rest of needle (first 2 characters have been compared already) 3907 // ************************************************************************************************** 3908 3909 // Compare the rest 3910 bind(L_Comp2); 3911 addi(addr, addr, h_csize); // First comparison has failed, 2nd one hit. 3912 bind(L_Comp1); // Addr points to possible needle start. 3913 if (needlecntval != 2) { // Const needlecnt==2? 3914 if (needlecntval != 3) { 3915 if (needlecntval == 0) { beq(CCR6, L_Found); } // Variable needlecnt==2? 3916 Register n_ind = tmp4, 3917 h_ind = n_ind; 3918 li(n_ind, 2 * n_csize); // First 2 characters are already compared, use index 2. 3919 mtctr(needlecnt); // Decremented by 2, still > 0. 3920 Label L_CompLoop; 3921 bind(L_CompLoop); 3922 if (ae ==StrIntrinsicNode::UL) { 3923 h_ind = ch1; 3924 sldi(h_ind, n_ind, 1); 3925 } 3926 if (n_csize == 2) { lhzx(ch2, needle, n_ind); } else { lbzx(ch2, needle, n_ind); } 3927 if (h_csize == 2) { lhzx(ch1, addr, h_ind); } else { lbzx(ch1, addr, h_ind); } 3928 cmpw(CCR1, ch1, ch2); 3929 bne(CCR1, L_OuterLoop); 3930 addi(n_ind, n_ind, n_csize); 3931 bdnz(L_CompLoop); 3932 } else { // No loop required if there's only one needle character left. 3933 if (n_csize == 2) { lhz(ch2, 2 * 2, needle); } else { lbz(ch2, 2 * 1, needle); } 3934 if (h_csize == 2) { lhz(ch1, 2 * 2, addr); } else { lbz(ch1, 2 * 1, addr); } 3935 cmpw(CCR1, ch1, ch2); 3936 bne(CCR1, L_OuterLoop); 3937 } 3938 } 3939 // Return index ... 3940 bind(L_Found); 3941 subf(result, haystack, addr); // relative to haystack, ... 3942 if (h_csize == 2) { srdi(result, result, 1); } // in characters. 3943 bind(L_End); 3944 } // string_indexof 3945 3946 void MacroAssembler::string_indexof_char(Register result, Register haystack, Register haycnt, 3947 Register needle, jchar needleChar, Register tmp1, Register tmp2, bool is_byte) { 3948 assert_different_registers(haystack, haycnt, needle, tmp1, tmp2); 3949 3950 Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_NotFound, L_End; 3951 Register addr = tmp1, 3952 ch1 = tmp2, 3953 ch2 = R0; 3954 3955 const int h_csize = is_byte ? 1 : 2; 3956 3957 //4: 3958 srwi_(tmp2, haycnt, 1); // Shift right by exact_log2(UNROLL_FACTOR). 3959 mr(addr, haystack); 3960 beq(CCR0, L_FinalCheck); 3961 mtctr(tmp2); // Move to count register. 3962 //8: 3963 bind(L_InnerLoop); // Main work horse (2x unrolled search loop). 
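// Each iteration examines two consecutive character positions (sketch):
//   if (a[i] == needle)        found at i;
//   else if (a[i+1] == needle) found at i+1;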
3964 if (!is_byte) { 3965 lhz(ch1, 0, addr); 3966 lhz(ch2, 2, addr); 3967 } else { 3968 lbz(ch1, 0, addr); 3969 lbz(ch2, 1, addr); 3970 } 3971 (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, (unsigned int)needleChar); 3972 (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, (unsigned int)needleChar); 3973 beq(CCR0, L_Found1); // Did we find the needle? 3974 beq(CCR1, L_Found2); 3975 addi(addr, addr, 2 * h_csize); 3976 bdnz(L_InnerLoop); 3977 //16: 3978 bind(L_FinalCheck); 3979 andi_(R0, haycnt, 1); 3980 beq(CCR0, L_NotFound); 3981 if (!is_byte) { lhz(ch1, 0, addr); } else { lbz(ch1, 0, addr); } // One position left at which we have to compare. 3982 (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, (unsigned int)needleChar); 3983 beq(CCR1, L_Found1); 3984 //21: 3985 bind(L_NotFound); 3986 li(result, -1); // Not found. 3987 b(L_End); 3988 3989 bind(L_Found2); 3990 addi(addr, addr, h_csize); 3991 //24: 3992 bind(L_Found1); // Return index ... 3993 subf(result, haystack, addr); // relative to haystack, ... 3994 if (!is_byte) { srdi(result, result, 1); } // in characters. 3995 bind(L_End); 3996 } // string_indexof_char 3997 3998 3999 void MacroAssembler::has_negatives(Register src, Register cnt, Register result, 4000 Register tmp1, Register tmp2) { 4001 const Register tmp0 = R0; 4002 assert_different_registers(src, result, cnt, tmp0, tmp1, tmp2); 4003 Label Lfastloop, Lslow, Lloop, Lnoneg, Ldone; 4004 4005 // Check if cnt >= 8 (= 16 bytes) 4006 lis(tmp1, (int)(short)0x8080); // tmp1 = 0x8080808080808080 4007 srwi_(tmp2, cnt, 4); 4008 li(result, 1); // Assume there's a negative byte. 4009 beq(CCR0, Lslow); 4010 ori(tmp1, tmp1, 0x8080); 4011 rldimi(tmp1, tmp1, 32, 0); 4012 mtctr(tmp2); 4013 4014 // 2x unrolled loop 4015 bind(Lfastloop); 4016 ld(tmp2, 0, src); 4017 ld(tmp0, 8, src); 4018 4019 orr(tmp0, tmp2, tmp0); 4020 4021 and_(tmp0, tmp0, tmp1); 4022 bne(CCR0, Ldone); // Found negative byte. 4023 addi(src, src, 16); 4024 4025 bdnz(Lfastloop); 4026 4027 bind(Lslow); // Fallback to slow version 4028 rldicl_(tmp0, cnt, 0, 64-4); 4029 beq(CCR0, Lnoneg); 4030 mtctr(tmp0); 4031 bind(Lloop); 4032 lbz(tmp0, 0, src); 4033 addi(src, src, 1); 4034 andi_(tmp0, tmp0, 0x80); 4035 bne(CCR0, Ldone); // Found negative byte. 4036 bdnz(Lloop); 4037 bind(Lnoneg); 4038 li(result, 0); 4039 4040 bind(Ldone); 4041 } 4042 4043 #endif // Compiler2 4044 4045 // Helpers for Intrinsic Emitters 4046 // 4047 // Revert the byte order of a 32bit value in a register 4048 // src: 0x44556677 4049 // dst: 0x77665544 4050 // Three steps to obtain the result: 4051 // 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word 4052 // into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared. 4053 // This value initializes dst. 4054 // 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost 4055 // byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go. 4056 // This value is mask inserted into dst with a [0..23] mask of 1s. 4057 // 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position. 4058 // This value is mask inserted into dst with a [8..15] mask of 1s. 4059 void MacroAssembler::load_reverse_32(Register dst, Register src) { 4060 assert_different_registers(dst, src); 4061 4062 rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left. 
4063 rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone. 4064 rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone. 4065 } 4066 4067 // Calculate the column addresses of the crc32 lookup table into distinct registers. 4068 // This loop-invariant calculation is moved out of the loop body, reducing the loop 4069 // body size from 20 to 16 instructions. 4070 // Returns the offset that was used to calculate the address of column tc3. 4071 // Due to register shortage, setting tc3 may overwrite table. With the return offset 4072 // at hand, the original table address can be easily reconstructed. 4073 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) { 4074 4075 #ifdef VM_LITTLE_ENDIAN 4076 // This is what we implement (the DOLIT4 part): 4077 // ========================================================================= */ 4078 // #define DOLIT4 c ^= *buf4++; \ 4079 // c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \ 4080 // crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24] 4081 // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4 4082 // ========================================================================= */ 4083 const int ix0 = 3*(4*CRC32_COLUMN_SIZE); 4084 const int ix1 = 2*(4*CRC32_COLUMN_SIZE); 4085 const int ix2 = 1*(4*CRC32_COLUMN_SIZE); 4086 const int ix3 = 0*(4*CRC32_COLUMN_SIZE); 4087 #else 4088 // This is what we implement (the DOBIG4 part): 4089 // ========================================================================= 4090 // #define DOBIG4 c ^= *++buf4; \ 4091 // c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \ 4092 // crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24] 4093 // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4 4094 // ========================================================================= 4095 const int ix0 = 4*(4*CRC32_COLUMN_SIZE); 4096 const int ix1 = 5*(4*CRC32_COLUMN_SIZE); 4097 const int ix2 = 6*(4*CRC32_COLUMN_SIZE); 4098 const int ix3 = 7*(4*CRC32_COLUMN_SIZE); 4099 #endif 4100 assert_different_registers(table, tc0, tc1, tc2); 4101 assert(table == tc3, "must be!"); 4102 4103 addi(tc0, table, ix0); 4104 addi(tc1, table, ix1); 4105 addi(tc2, table, ix2); 4106 if (ix3 != 0) addi(tc3, table, ix3); 4107 4108 return ix3; 4109 } 4110 4111 /** 4112 * uint32_t crc; 4113 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8); 4114 */ 4115 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) { 4116 assert_different_registers(crc, table, tmp); 4117 assert_different_registers(val, table); 4118 4119 if (crc == val) { // Must rotate first to use the unmodified value. 4120 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 4121 // As we use a word (4-byte) instruction, we have to adapt the mask bit positions. 4122 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 4123 } else { 4124 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 4125 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 
4126 }
4127 lwzx(tmp, table, tmp);
4128 xorr(crc, crc, tmp);
4129 }
4130
4131 /**
4132 * uint32_t crc;
4133 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
4134 */
4135 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
4136 fold_byte_crc32(crc, crc, table, tmp);
4137 }
4138
4139 /**
4140 * Emits code to update CRC-32 with a byte value according to constants in table.
4141 *
4142 * @param [in,out]crc Register containing the crc.
4143 * @param [in]val Register containing the byte to fold into the CRC.
4144 * @param [in]table Register containing the table of crc constants.
4145 *
4146 * uint32_t crc;
4147 * val = crc_table[(val ^ crc) & 0xFF];
4148 * crc = val ^ (crc >> 8);
4149 */
4150 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
4151 BLOCK_COMMENT("update_byte_crc32:");
4152 xorr(val, val, crc);
4153 fold_byte_crc32(crc, val, table, val);
4154 }
4155
4156 /**
4157 * @param crc register containing existing CRC (32-bit)
4158 * @param buf register pointing to input byte buffer (byte*)
4159 * @param len register containing number of bytes
4160 * @param table register pointing to CRC table
4161 */
4162 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
4163 Register data, bool loopAlignment) {
4164 assert_different_registers(crc, buf, len, table, data);
4165
4166 Label L_mainLoop, L_done;
4167 const int mainLoop_stepping = 1;
4168 const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
4169
4170 // Process all bytes in a single-byte loop.
4171 clrldi_(len, len, 32); // Enforce 32 bit. Anything to do?
4172 beq(CCR0, L_done);
4173
4174 mtctr(len);
4175 align(mainLoop_alignment);
4176 BIND(L_mainLoop);
4177 lbz(data, 0, buf); // Byte from buffer, zero-extended.
4178 addi(buf, buf, mainLoop_stepping); // Advance buffer position.
4179 update_byte_crc32(crc, data, table);
4180 bdnz(L_mainLoop); // Iterate.
4181
4182 bind(L_done);
4183 }
4184
4185 /**
4186 * Emits code to update CRC-32 with a 4-byte value according to constants in table.
4187 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
4188 */
4189 // A note on the lookup table address(es):
4190 // The lookup table consists of two sets of four columns each.
4191 // The columns {0..3} are used for little-endian machines.
4192 // The columns {4..7} are used for big-endian machines.
4193 // To save the effort of adding the column offset to the table address each time
4194 // a table element is looked up, it is possible to pass the pre-calculated
4195 // column addresses.
4196 // Uses R9..R12 as work registers. Must be saved/restored by caller, if necessary.
4197 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
4198 Register t0, Register t1, Register t2, Register t3,
4199 Register tc0, Register tc1, Register tc2, Register tc3) {
4200 assert_different_registers(crc, t3);
4201
4202 // XOR crc with next four bytes of buffer.
4203 lwz(t3, bufDisp, buf);
4204 if (bufInc != 0) {
4205 addi(buf, buf, bufInc);
4206 }
4207 xorr(t3, t3, crc);
4208
4209 // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
4210 rlwinm(t0, t3, 2, 24-2, 31-2); // ((t3 >> 0) & 0xff) << 2
4211 rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t3 >> 8) & 0xff) << 2
4212 rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t3 >> 16) & 0xff) << 2
4213 rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t3 >> 24) & 0xff) << 2
4214
4215 // Use the pre-calculated column addresses.
4216 // Load pre-calculated table values.
4217 lwzx(t0, tc0, t0);
4218 lwzx(t1, tc1, t1);
4219 lwzx(t2, tc2, t2);
4220 lwzx(t3, tc3, t3);
4221
4222 // Calculate new crc from table values.
4223 xorr(t0, t0, t1);
4224 xorr(t2, t2, t3);
4225 xorr(crc, t0, t2); // Now crc contains the final checksum value.
4226 }
4227
4228 /**
4229 * @param crc register containing existing CRC (32-bit)
4230 * @param buf register pointing to input byte buffer (byte*)
4231 * @param len register containing number of bytes
4232 * @param table register pointing to CRC table
4233 *
4234 * Uses R9..R12 as work registers. Must be saved/restored by caller!
4235 */
4236 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
4237 Register t0, Register t1, Register t2, Register t3,
4238 Register tc0, Register tc1, Register tc2, Register tc3,
4239 bool invertCRC) {
4240 assert_different_registers(crc, buf, len, table);
4241
4242 Label L_mainLoop, L_tail;
4243 Register tmp = t0;
4244 Register data = t0;
4245 Register tmp2 = t1;
4246 const int mainLoop_stepping = 8;
4247 const int tailLoop_stepping = 1;
4248 const int log_stepping = exact_log2(mainLoop_stepping);
4249 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4250 const int complexThreshold = 2*mainLoop_stepping;
4251
4252 // Don't test for len <= 0 here. This pathological case should not occur anyway.
4253 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
4254 // for all well-behaved cases. The situation itself is detected and handled correctly
4255 // within update_byteLoop_crc32.
4256 assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4257
4258 BLOCK_COMMENT("kernel_crc32_2word {");
4259
4260 if (invertCRC) {
4261 nand(crc, crc, crc); // 1s complement of crc
4262 }
4263
4264 // Check for short (<complexThreshold) buffer.
4265 cmpdi(CCR0, len, complexThreshold);
4266 blt(CCR0, L_tail);
4267
4268 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4269 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4270 {
4271 // Align buf addr to mainLoop_stepping boundary.
4272 neg(tmp2, buf); // Calculate # preLoop iterations for alignment.
4273 rldicl(tmp2, tmp2, 0, 64-log_stepping); // Keep only the low log_stepping bits (a mask of 1s in bits 61..63).
4274
4275 if (complexThreshold > mainLoop_stepping) {
4276 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4277 } else {
4278 sub(tmp, len, tmp2); // Remaining bytes for main loop.
4279 cmpdi(CCR0, tmp, mainLoop_stepping);
4280 blt(CCR0, L_tail); // If less than one mainLoop_stepping is left, do only tail processing.
4281 mr(len, tmp); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4282 }
4283 update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
4284 }
4285
4286 srdi(tmp2, len, log_stepping); // #iterations for mainLoop
4287 andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop
4288 mtctr(tmp2);
4289
4290 #ifdef VM_LITTLE_ENDIAN
4291 Register crc_rv = crc;
4292 #else
4293 Register crc_rv = tmp; // Load_reverse needs separate registers to work on.
4294 // Occupies tmp, but frees up crc.
4295 load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data.
4296 tmp = crc;
4297 #endif
4298
4299 int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4300
4301 align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement.
4302 BIND(L_mainLoop);
4303 update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4304 update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4305 bdnz(L_mainLoop);
4306
4307 #ifndef VM_LITTLE_ENDIAN
4308 load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data.
4309 tmp = crc_rv; // Tmp uses its original register again.
4310 #endif
4311
4312 // Restore original table address for tailLoop.
4313 if (reconstructTableOffset != 0) {
4314 addi(table, table, -reconstructTableOffset);
4315 }
4316
4317 // Process last few (<complexThreshold) bytes of buffer.
4318 BIND(L_tail);
4319 update_byteLoop_crc32(crc, buf, len, table, data, false);
4320
4321 if (invertCRC) {
4322 nand(crc, crc, crc); // 1s complement of crc
4323 }
4324 BLOCK_COMMENT("} kernel_crc32_2word");
4325 }
4326
4327 /**
4328 * @param crc register containing existing CRC (32-bit)
4329 * @param buf register pointing to input byte buffer (byte*)
4330 * @param len register containing number of bytes
4331 * @param table register pointing to CRC table
4332 *
4333 * Uses R9..R12 as work registers. Must be saved/restored by caller!
4334 */
4335 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
4336 Register t0, Register t1, Register t2, Register t3,
4337 Register tc0, Register tc1, Register tc2, Register tc3,
4338 bool invertCRC) {
4339 assert_different_registers(crc, buf, len, table);
4340
4341 Label L_mainLoop, L_tail;
4342 Register tmp = t0;
4343 Register data = t0;
4344 Register tmp2 = t1;
4345 const int mainLoop_stepping = 4;
4346 const int tailLoop_stepping = 1;
4347 const int log_stepping = exact_log2(mainLoop_stepping);
4348 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4349 const int complexThreshold = 2*mainLoop_stepping;
4350
4351 // Don't test for len <= 0 here. This pathological case should not occur anyway.
4352 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
4353 // for all well-behaved cases. The situation itself is detected and handled correctly
4354 // within update_byteLoop_crc32.
4355 assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4356
4357 BLOCK_COMMENT("kernel_crc32_1word {");
4358
4359 if (invertCRC) {
4360 nand(crc, crc, crc); // 1s complement of crc
4361 }
4362
4363 // Check for short (<complexThreshold) buffer.
4364 cmpdi(CCR0, len, complexThreshold);
4365 blt(CCR0, L_tail);
4366
4367 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4368 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4369 {
4370 // Align buf addr to mainLoop_stepping boundary.
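// Number of bytes up to the next mainLoop_stepping boundary, in effect
// (sketch): tmp2 = (-(uintptr_t)buf) & (mainLoop_stepping - 1).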
4371 neg(tmp2, buf); // Calculate # preLoop iterations for alignment.
4372 rldicl(tmp2, tmp2, 0, 64-log_stepping); // Keep only the low log_stepping bits (a mask of 1s in bits 62..63).
4373
4374 if (complexThreshold > mainLoop_stepping) {
4375 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4376 } else {
4377 sub(tmp, len, tmp2); // Remaining bytes for main loop.
4378 cmpdi(CCR0, tmp, mainLoop_stepping);
4379 blt(CCR0, L_tail); // If less than one mainLoop_stepping is left, do only tail processing.
4380 mr(len, tmp); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4381 }
4382 update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
4383 }
4384
4385 srdi(tmp2, len, log_stepping); // #iterations for mainLoop
4386 andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop
4387 mtctr(tmp2);
4388
4389 #ifdef VM_LITTLE_ENDIAN
4390 Register crc_rv = crc;
4391 #else
4392 Register crc_rv = tmp; // Load_reverse needs separate registers to work on.
4393 // Occupies tmp, but frees up crc.
4394 load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data.
4395 tmp = crc;
4396 #endif
4397
4398 int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4399
4400 align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement.
4401 BIND(L_mainLoop);
4402 update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4403 bdnz(L_mainLoop);
4404
4405 #ifndef VM_LITTLE_ENDIAN
4406 load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data.
4407 tmp = crc_rv; // Tmp uses its original register again.
4408 #endif
4409
4410 // Restore original table address for tailLoop.
4411 if (reconstructTableOffset != 0) {
4412 addi(table, table, -reconstructTableOffset);
4413 }
4414
4415 // Process last few (<complexThreshold) bytes of buffer.
4416 BIND(L_tail);
4417 update_byteLoop_crc32(crc, buf, len, table, data, false);
4418
4419 if (invertCRC) {
4420 nand(crc, crc, crc); // 1s complement of crc
4421 }
4422 BLOCK_COMMENT("} kernel_crc32_1word");
4423 }
4424
4425 /**
4426 * @param crc register containing existing CRC (32-bit)
4427 * @param buf register pointing to input byte buffer (byte*)
4428 * @param len register containing number of bytes
4429 * @param table register pointing to CRC table
4430 *
4431 * Uses R7_ARG5, R8_ARG6 as work registers.
4432 */
4433 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
4434 Register t0, Register t1, Register t2, Register t3,
4435 bool invertCRC) {
4436 assert_different_registers(crc, buf, len, table);
4437
4438 Register data = t0; // Holds the current byte to be folded into crc.
4439
4440 BLOCK_COMMENT("kernel_crc32_1byte {");
4441
4442 if (invertCRC) {
4443 nand(crc, crc, crc); // 1s complement of crc
4444 }
4445
4446 // Process all bytes in a single-byte loop.
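// Per-byte update performed by update_byteLoop_crc32, in effect (sketch):
//   while (len--) { crc = table[(crc ^ *buf++) & 0xFF] ^ (crc >> 8); }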
4447 update_byteLoop_crc32(crc, buf, len, table, data, true); 4448 4449 if (invertCRC) { 4450 nand(crc, crc, crc); // 1s complement of crc 4451 } 4452 BLOCK_COMMENT("} kernel_crc32_1byte"); 4453 } 4454 4455 /** 4456 * @param crc register containing existing CRC (32-bit) 4457 * @param buf register pointing to input byte buffer (byte*) 4458 * @param len register containing number of bytes 4459 * @param table register pointing to CRC table 4460 * @param constants register pointing to CRC table for 128-bit aligned memory 4461 * @param barretConstants register pointing to table for barrett reduction 4462 * @param t0-t4 temp registers 4463 */ 4464 void MacroAssembler::kernel_crc32_1word_vpmsum(Register crc, Register buf, Register len, Register table, 4465 Register constants, Register barretConstants, 4466 Register t0, Register t1, Register t2, Register t3, Register t4, 4467 bool invertCRC) { 4468 assert_different_registers(crc, buf, len, table); 4469 4470 Label L_alignedHead, L_tail; 4471 4472 BLOCK_COMMENT("kernel_crc32_1word_vpmsum {"); 4473 4474 // 1. ~c 4475 if (invertCRC) { 4476 nand(crc, crc, crc); // 1s complement of crc 4477 } 4478 4479 // 2. use kernel_crc32_1word for short len 4480 clrldi(len, len, 32); 4481 cmpdi(CCR0, len, 512); 4482 blt(CCR0, L_tail); 4483 4484 // 3. calculate from 0 to first aligned address 4485 const int alignment = 16; 4486 Register prealign = t0; 4487 4488 andi_(prealign, buf, alignment - 1); 4489 beq(CCR0, L_alignedHead); 4490 subfic(prealign, prealign, alignment); 4491 4492 subf(len, prealign, len); 4493 update_byteLoop_crc32(crc, buf, prealign, table, t2, false); 4494 4495 // 4. calculate from first aligned address as far as possible 4496 BIND(L_alignedHead); 4497 kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t0, t1, t2, t3, t4); 4498 4499 // 5. remaining bytes 4500 BIND(L_tail); 4501 Register tc0 = t4; 4502 Register tc1 = constants; 4503 Register tc2 = barretConstants; 4504 kernel_crc32_1word(crc, buf, len, table, t0, t1, t2, t3, tc0, tc1, tc2, table, false); 4505 4506 // 6. ~c 4507 if (invertCRC) { 4508 nand(crc, crc, crc); // 1s complement of crc 4509 } 4510 4511 BLOCK_COMMENT("} kernel_crc32_1word_vpmsum"); 4512 } 4513 4514 /** 4515 * @param crc register containing existing CRC (32-bit) 4516 * @param buf register pointing to input byte buffer (byte*) 4517 * @param len register containing number of bytes (will get updated to remaining bytes) 4518 * @param constants register pointing to CRC table for 128-bit aligned memory 4519 * @param barretConstants register pointing to table for barrett reduction 4520 * @param t0-t4 temp registers 4521 * Precondition: len should be >= 512. Otherwise, nothing will be done. 4522 */ 4523 void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len, 4524 Register constants, Register barretConstants, 4525 Register t0, Register t1, Register t2, Register t3, Register t4) { 4526 4527 // Save non-volatile vector registers (frameless). 
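// The spills below go to the area under SP; no frame is pushed (sketch of
// the layout): VR20 at SP-16, VR21 at SP-32, ..., VR25 at SP-96
// (plus VR26 on big-endian), followed by R14..R17 in 8-byte slots.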
4528 Register offset = t1; 4529 int offsetInt = 0; 4530 offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP); 4531 offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP); 4532 offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP); 4533 offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP); 4534 offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP); 4535 offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP); 4536 #ifndef VM_LITTLE_ENDIAN 4537 offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP); 4538 #endif 4539 offsetInt -= 8; std(R14, offsetInt, R1_SP); 4540 offsetInt -= 8; std(R15, offsetInt, R1_SP); 4541 offsetInt -= 8; std(R16, offsetInt, R1_SP); 4542 offsetInt -= 8; std(R17, offsetInt, R1_SP); 4543 4544 // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor 4545 // bytes per iteration. The basic scheme is: 4546 // lvx: load vector (Big Endian needs reversal) 4547 // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift 4548 // vxor: xor partial results together to get unroll_factor2 vectors 4549 4550 // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors. 4551 4552 // Using 16 * unroll_factor / unroll_factor_2 bytes for constants. 4553 const int unroll_factor = 2048; 4554 const int unroll_factor2 = 8; 4555 4556 // Support registers. 4557 Register offs[] = { noreg, t0, t1, t2, t3, t4, crc /* will live in VCRC */, R14 }; 4558 Register num_bytes = R15, 4559 loop_count = R16, 4560 cur_const = R17; 4561 // Constant array for outer loop: unroll_factor2 - 1 registers, 4562 // Constant array for inner loop: unroll_factor / unroll_factor2 registers. 4563 VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 }, 4564 consts1[] = { VR23, VR24 }; 4565 // Data register arrays: 2 arrays with unroll_factor2 registers. 4566 VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 }, 4567 data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 }; 4568 4569 VectorRegister VCRC = data0[0]; 4570 VectorRegister Vc = VR25; 4571 VectorRegister swap_bytes = VR26; // Only for Big Endian. 4572 4573 // We have at least 1 iteration (ensured by caller). 4574 Label L_outer_loop, L_inner_loop, L_last; 4575 4576 // If supported set DSCR pre-fetch to deepest. 4577 if (VM_Version::has_mfdscr()) { 4578 load_const_optimized(t0, VM_Version::_dscr_val | 7); 4579 mtdscr(t0); 4580 } 4581 4582 mtvrwz(VCRC, crc); // crc lives lives in VCRC, now 4583 4584 for (int i = 1; i < unroll_factor2; ++i) { 4585 li(offs[i], 16 * i); 4586 } 4587 4588 // Load consts for outer loop 4589 lvx(consts0[0], constants); 4590 for (int i = 1; i < unroll_factor2 - 1; ++i) { 4591 lvx(consts0[i], offs[i], constants); 4592 } 4593 addi(constants, constants, (unroll_factor2 - 1) * 16); 4594 4595 load_const_optimized(num_bytes, 16 * unroll_factor); 4596 load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off. 4597 4598 // Reuse data registers outside of the loop. 4599 VectorRegister Vtmp = data1[0]; 4600 VectorRegister Vtmp2 = data1[1]; 4601 VectorRegister zeroes = data1[2]; 4602 4603 vspltisb(Vtmp, 0); 4604 vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC. 

  // Load vector for vpermxor (to xor both 64 bit parts together).
  lvsl(Vtmp, buf);   // 000102030405060708090a0b0c0d0e0f
  vspltisb(Vc, 4);
  vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
  xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
  vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f

#ifdef VM_LITTLE_ENDIAN
#define BE_swap_bytes(x)
#else
  vspltisb(Vtmp2, 0xf);
  vxor(swap_bytes, Vtmp, Vtmp2);
#define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
#endif

  cmpd(CCR0, len, num_bytes);
  blt(CCR0, L_last);

  // ********** Main loop start **********
  align(32);
  bind(L_outer_loop);

  // Beginning of unrolled first iteration (no xor).
  lvx(data1[0], buf);
  mr(cur_const, constants);
  for (int i = 1; i < unroll_factor2 / 2; ++i) {
    lvx(data1[i], offs[i], buf);
  }
  vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
  lvx(consts1[0], cur_const);
  mtctr(loop_count);
  for (int i = 0; i < unroll_factor2 / 2; ++i) {
    BE_swap_bytes(data1[i]);
    if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
    lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
    vpmsumw(data0[i], data1[i], consts1[0]);
  }
  addi(buf, buf, 16 * unroll_factor2);
  subf(len, num_bytes, len);
  lvx(consts1[1], offs[1], cur_const);
  addi(cur_const, cur_const, 32);
  // Beginning of unrolled second iteration (head).
  for (int i = 0; i < unroll_factor2 / 2; ++i) {
    BE_swap_bytes(data1[i + unroll_factor2 / 2]);
    if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
    vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);
  }
  for (int i = 0; i < unroll_factor2 / 2; ++i) {
    BE_swap_bytes(data1[i]);
    lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
    vpmsumw(data1[i], data1[i], consts1[1]);
  }
  addi(buf, buf, 16 * unroll_factor2);

  // Generate the most performance-relevant code. The loads and half of the
  // vpmsumws have already been generated. The double-iteration allows using
  // the 2 constant registers alternately.
  align(32);
  bind(L_inner_loop);
  for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.
    if (j & 1) {
      lvx(consts1[0], cur_const);
    } else {
      lvx(consts1[1], offs[1], cur_const);
      addi(cur_const, cur_const, 32);
    }
    for (int i = 0; i < unroll_factor2; ++i) {
      int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.
      if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }
      BE_swap_bytes(data1[idx]);
      vxor(data0[i], data0[i], data1[i]);
      if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);
      vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);
    }
    addi(buf, buf, 16 * unroll_factor2);
  }
  bdnz(L_inner_loop);

  // Tail of last iteration (no loads).
  for (int i = 0; i < unroll_factor2 / 2; ++i) {
    BE_swap_bytes(data1[i + unroll_factor2 / 2]);
    vxor(data0[i], data0[i], data1[i]);
    vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
  }
  for (int i = 0; i < unroll_factor2 / 2; ++i) {
    vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.
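    // The constant consts0[unroll_factor2 - 2 - i] represents a carry-less
    // "shift" by this lane's byte distance, so the differently positioned
    // partial CRCs can be xor-combined into one vector further below.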
    vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);
  }

  // The last data register needs no fixup shift; the other ones do.
  for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {
    vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);
  }

  // Combine to 128 bit result vector VCRC = data0[0].
  for (int i = 1; i < unroll_factor2; i<<=1) {
    for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) {
      vxor(data0[j], data0[j], data0[j+i]);
    }
  }
  cmpd(CCR0, len, num_bytes);
  bge(CCR0, L_outer_loop);

  // One last round with a smaller num_bytes.
  bind(L_last);
  srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
  add_const_optimized(constants, constants, 16 * (unroll_factor / unroll_factor2)); // Point behind last one.
  sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
  clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
  subf(constants, R0, constants); // Point to constant to be used first.

  addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
  bgt(CCR0, L_outer_loop);
  // ********** Main loop end **********
#undef BE_swap_bytes

  // Restore DSCR pre-fetch value.
  if (VM_Version::has_mfdscr()) {
    load_const_optimized(t0, VM_Version::_dscr_val);
    mtdscr(t0);
  }

  vspltisb(zeroes, 0);

  // Combine to 64 bit result.
  vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.

  // Reduce to 32 bit CRC: Remainder by multiply-high.
  lvx(Vtmp, barretConstants);
  vsldoi(Vtmp2, zeroes, VCRC, 12);  // Extract high 32 bit.
  vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply by inverse long poly.
  vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
  vsldoi(Vtmp, zeroes, Vtmp, 8);
  vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply quotient by long poly.
  vxor(VCRC, VCRC, Vtmp2);          // Remainder fits into 32 bit.

  // Move result. len is already updated.
  vsldoi(VCRC, VCRC, zeroes, 8);
  mfvrd(crc, VCRC);

  // Restore non-volatile vector registers (frameless).
  offsetInt = 0;
  offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
  offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
  offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
  offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
  offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
  offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
#ifndef VM_LITTLE_ENDIAN
  offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
#endif
  offsetInt -= 8; ld(R14, offsetInt, R1_SP);
  offsetInt -= 8; ld(R15, offsetInt, R1_SP);
  offsetInt -= 8; ld(R16, offsetInt, R1_SP);
  offsetInt -= 8; ld(R17, offsetInt, R1_SP);
}

void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp, bool invertCRC) {
  assert_different_registers(crc, buf, /* len, not used!! */ table, tmp);

  BLOCK_COMMENT("kernel_crc32_singleByte:");
  if (invertCRC) {
    nand(crc, crc, crc); // 1s complement of crc
  }

  lbz(tmp, 0, buf); // Byte from buffer, zero-extended.
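  // update_byte_crc32 performs the classic table-driven byte step; roughly,
  // in C: crc = table[(crc ^ byte) & 0xff] ^ (crc >> 8);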
  update_byte_crc32(crc, tmp, table);

  if (invertCRC) {
    nand(crc, crc, crc); // 1s complement of crc
  }
}

void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
  assert_different_registers(crc, val, table);

  BLOCK_COMMENT("kernel_crc32_singleByteReg:");
  if (invertCRC) {
    nand(crc, crc, crc); // 1s complement of crc
  }

  update_byte_crc32(crc, val, table);

  if (invertCRC) {
    nand(crc, crc, crc); // 1s complement of crc
  }
}

// dest_lo += src1 + src2
// dest_hi += carry1 + carry2
void MacroAssembler::add2_with_carry(Register dest_hi,
                                     Register dest_lo,
                                     Register src1, Register src2) {
  li(R0, 0);
  addc(dest_lo, dest_lo, src1);
  adde(dest_hi, dest_hi, R0);
  addc(dest_lo, dest_lo, src2);
  adde(dest_hi, dest_hi, R0);
}

// Multiply 64 bit by 64 bit first loop.
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
                                           Register x_xstart,
                                           Register y, Register y_idx,
                                           Register z,
                                           Register carry,
                                           Register product_high, Register product,
                                           Register idx, Register kdx,
                                           Register tmp) {
  // jlong carry, x[], y[], z[];
  // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //   huge_128 product = y[idx] * x[xstart] + carry;
  //   z[kdx] = (jlong)product;
  //   carry  = (jlong)(product >>> 64);
  // }
  // z[xstart] = carry;

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  addic_(xstart, xstart, -1);
  blt(CCR0, L_one_x); // Special case: length of x is 1.

  // Load next two integers of x.
  sldi(tmp, xstart, LogBytesPerInt);
  ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(x_xstart, x_xstart, 32, 0);
#endif

  align(32, 16);
  bind(L_first_loop);

  cmpdi(CCR0, idx, 1);
  blt(CCR0, L_first_loop_exit);
  addi(idx, idx, -2);
  beq(CCR0, L_one_y);

  // Load next two integers of y.
  sldi(tmp, idx, LogBytesPerInt);
  ldx(y_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(y_idx, y_idx, 32, 0);
#endif


  bind(L_multiply);
  multiply64(product_high, product, x_xstart, y_idx);

  li(tmp, 0);
  addc(product, product, carry);         // Add carry to result.
  adde(product_high, product_high, tmp); // Add carry of the last addition.
  addi(kdx, kdx, -2);

  // Store result.
#ifdef VM_LITTLE_ENDIAN
  rldicl(product, product, 32, 0);
#endif
  sldi(tmp, kdx, LogBytesPerInt);
  stdx(product, z, tmp);
  mr_if_needed(carry, product_high);
  b(L_first_loop);


  bind(L_one_y); // Load one 32 bit portion of y as (0,value).

  lwz(y_idx, 0, y);
  b(L_multiply);


  bind(L_one_x); // Load one 32 bit portion of x as (0,value).

  lwz(x_xstart, 0, x);
  b(L_first_loop);

  bind(L_first_loop_exit);
}

// Multiply 64 bit by 64 bit and add 128 bit.
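// A C-style sketch of the step below (assuming a 128-bit integer type):
//   unsigned __int128 p = (unsigned __int128)x_xstart * y[idx] + z[idx] + carry;
//   z[idx] = (uint64_t)p;           // low half, stored via stdx
//   carry  = (uint64_t)(p >> 64);   // high half, returned in product_high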
void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
                                            Register z, Register yz_idx,
                                            Register idx, Register carry,
                                            Register product_high, Register product,
                                            Register tmp, int offset) {

  // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
  // z[kdx] = (jlong)product;

  sldi(tmp, idx, LogBytesPerInt);
  if (offset) {
    addi(tmp, tmp, offset);
  }
  ldx(yz_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(yz_idx, yz_idx, 32, 0);
#endif

  multiply64(product_high, product, x_xstart, yz_idx);
  ldx(yz_idx, z, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(yz_idx, yz_idx, 32, 0);
#endif

  add2_with_carry(product_high, product, carry, yz_idx);

  sldi(tmp, idx, LogBytesPerInt);
  if (offset) {
    addi(tmp, tmp, offset);
  }
#ifdef VM_LITTLE_ENDIAN
  rldicl(product, product, 32, 0);
#endif
  stdx(product, z, tmp);
}

// Multiply 128 bit by 128 bit. Unrolled inner loop.
void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
                                             Register y, Register z,
                                             Register yz_idx, Register idx, Register carry,
                                             Register product_high, Register product,
                                             Register carry2, Register tmp) {

  // jlong carry, x[], y[], z[];
  // int kdx = ystart+1;
  // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //   huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
  //   z[kdx+idx+1] = (jlong)product;
  //   jlong carry2  = (jlong)(product >>> 64);
  //   product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
  //   z[kdx+idx] = (jlong)product;
  //   carry  = (jlong)(product >>> 64);
  // }
  // idx += 2;
  // if (idx > 0) {
  //   product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
  //   z[kdx+idx] = (jlong)product;
  //   carry  = (jlong)(product >>> 64);
  // }

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
  const Register jdx = R0;

  // Scale the index.
  srdi_(jdx, idx, 2);
  beq(CCR0, L_third_loop_exit);
  mtctr(jdx);

  align(32, 16);
  bind(L_third_loop);

  addi(idx, idx, -4);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
  mr_if_needed(carry2, product_high);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
  mr_if_needed(carry, product_high);
  bdnz(L_third_loop);

  bind(L_third_loop_exit); // Handle any left-over operand parts.
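
  // At this point 0..3 32-bit operand words remain: first handle a possible
  // pair with one more 128-bit step, then a possible single 32-bit word.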
  andi_(idx, idx, 0x3);
  beq(CCR0, L_post_third_loop_done);

  Label L_check_1;

  addic_(idx, idx, -2);
  blt(CCR0, L_check_1);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
  mr_if_needed(carry, product_high);

  bind(L_check_1);

  addi(idx, idx, 0x2);
  andi_(idx, idx, 0x1);
  addic_(idx, idx, -1);
  blt(CCR0, L_post_third_loop_done);

  sldi(tmp, idx, LogBytesPerInt);
  lwzx(yz_idx, y, tmp);
  multiply64(product_high, product, x_xstart, yz_idx);
  lwzx(yz_idx, z, tmp);

  add2_with_carry(product_high, product, yz_idx, carry);

  sldi(tmp, idx, LogBytesPerInt);
  stwx(product, z, tmp);
  srdi(product, product, 32);

  sldi(product_high, product_high, 32);
  orr(product, product, product_high);
  mr_if_needed(carry, product);

  bind(L_post_third_loop_done);
} // multiply_128_x_128_loop

void MacroAssembler::muladd(Register out, Register in,
                            Register offset, Register len, Register k,
                            Register tmp1, Register tmp2, Register carry) {

  // Labels
  Label LOOP, SKIP;

  // Make sure length is positive.
  cmpdi (CCR0, len, 0);

  // Prepare variables
  subi  (offset, offset, 4);
  li    (carry, 0);
  ble   (CCR0, SKIP);

  mtctr (len);
  subi  (len, len, 1);
  sldi  (len, len, 2);

  // Main loop
  bind(LOOP);
  lwzx  (tmp1, len, in);
  lwzx  (tmp2, offset, out);
  mulld (tmp1, tmp1, k);
  add   (tmp2, carry, tmp2);
  add   (tmp2, tmp1, tmp2);
  stwx  (tmp2, offset, out);
  srdi  (carry, tmp2, 32);
  subi  (offset, offset, 4);
  subi  (len, len, 4);
  bdnz  (LOOP);
  bind(SKIP);
}

void MacroAssembler::multiply_to_len(Register x, Register xlen,
                                     Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2,
                                     Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6,
                                     Register tmp7, Register tmp8,
                                     Register tmp9, Register tmp10,
                                     Register tmp11, Register tmp12,
                                     Register tmp13) {

  ShortBranchVerifier sbv(this);

  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product = tmp6;
  const Register product_high = tmp7;
  const Register x_xstart = tmp8;
  const Register tmp = tmp9;

  // First Loop.
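  // (First loop of the BigInteger.multiplyToLen scheme: multiplies x[xstart]
  // by all of y and stores the partial products in z. Java-like sketch:)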
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;

  mr_if_needed(idx, ylen); // idx = ylen
  mr_if_needed(kdx, zlen); // kdx = xlen + ylen
  li(carry, 0);            // carry = 0

  Label L_done;

  addic_(xstart, xlen, -1);
  blt(CCR0, L_done);

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
                        carry, product_high, product, idx, kdx, tmp);

  Label L_second_loop;

  cmpdi(CCR0, kdx, 0);
  beq(CCR0, L_second_loop);

  Label L_carry;

  addic_(kdx, kdx, -1);
  beq(CCR0, L_carry);

  // Store lower 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);
  srdi(carry, carry, 32);
  addi(kdx, kdx, -1);


  bind(L_carry);

  // Store upper 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);

  // Second and third (nested) loops.
  //
  //  for (int i = xstart-1; i >= 0; i--) { // Second loop
  //    carry = 0;
  //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                     (z[k] & LONG_MASK) + carry;
  //      z[k] = (int)product;
  //      carry = product >>> 32;
  //    }
  //    z[i] = (int)carry;
  //  }
  //
  //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = x_xstart

  bind(L_second_loop);

  li(carry, 0); // carry = 0;

  addic_(xstart, xstart, -1); // i = xstart-1;
  blt(CCR0, L_done);

  Register zsave = tmp10;

  mr(zsave, z);


  Label L_last_x;

  sldi(tmp, xstart, LogBytesPerInt);
  add(z, z, tmp); // z = z + k - j
  addi(z, z, 4);
  addic_(xstart, xstart, -1); // i = xstart-1;
  blt(CCR0, L_last_x);

  sldi(tmp, xstart, LogBytesPerInt);
  ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(x_xstart, x_xstart, 32, 0);
#endif


  Label L_third_loop_prologue;

  bind(L_third_loop_prologue);

  Register xsave = tmp11;
  Register xlensave = tmp12;
  Register ylensave = tmp13;

  mr(xsave, x);
  mr(xlensave, xstart);
  mr(ylensave, ylen);


  multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
                          carry, product_high, product, x, tmp);

  mr(z, zsave);
  mr(x, xsave);
  mr(xlen, xlensave); // This is the decrement of the loop counter!
  mr(ylen, ylensave);

  addi(tmp3, xlen, 1);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);
  addic_(tmp3, tmp3, -1);
  blt(CCR0, L_done);

  srdi(carry, carry, 32);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);
  b(L_second_loop);

  // Infrequently executed code is moved outside the loops.
  bind(L_last_x);

  lwz(x_xstart, 0, x);
  b(L_third_loop_prologue);

  bind(L_done);
} // multiply_to_len

void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
#ifdef ASSERT
  Label ok;
  if (check_equal) {
    beq(CCR0, ok);
  } else {
    bne(CCR0, ok);
  }
  stop(msg, id);
  bind(ok);
#endif
}

void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
                                          Register mem_base, const char* msg, int id) {
#ifdef ASSERT
  switch (size) {
    case 4:
      lwz(R0, mem_offset, mem_base);
      cmpwi(CCR0, R0, 0);
      break;
    case 8:
      ld(R0, mem_offset, mem_base);
      cmpdi(CCR0, R0, 0);
      break;
    default:
      ShouldNotReachHere();
  }
  asm_assert(check_equal, msg, id);
#endif // ASSERT
}

void MacroAssembler::verify_thread() {
  if (VerifyThread) {
    unimplemented("'VerifyThread' currently not implemented on PPC");
  }
}

// Reads: oop. Kills: R0 and possibly the volatile float registers.
void MacroAssembler::verify_oop(Register oop, const char* msg) {
  if (!VerifyOops) {
    return;
  }

  address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
  const Register tmp = R11; // Will be preserved.
  const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
  save_volatile_gprs(R1_SP, -nbytes_save); // except R0

  mr_if_needed(R4_ARG2, oop);
  save_LR_CR(tmp); // save in old frame
  push_frame_reg_args(nbytes_save, tmp);
  // load FunctionDescriptor** / entry_address *
  load_const_optimized(tmp, fd, R0);
  // load FunctionDescriptor* / entry_address
  ld(tmp, 0, tmp);
  load_const_optimized(R3_ARG1, (address)msg, R0);
  // Call destination for its side effect.
  call_c(tmp);

  pop_frame();
  restore_LR_CR(tmp);
  restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
}

void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
  if (!VerifyOops) {
    return;
  }

  address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
  const Register tmp = R11; // Will be preserved.
  const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
  save_volatile_gprs(R1_SP, -nbytes_save); // except R0

  ld(R4_ARG2, offs, base);
  save_LR_CR(tmp); // save in old frame
  push_frame_reg_args(nbytes_save, tmp);
  // load FunctionDescriptor** / entry_address *
  load_const_optimized(tmp, fd, R0);
  // load FunctionDescriptor* / entry_address
  ld(tmp, 0, tmp);
  load_const_optimized(R3_ARG1, (address)msg, R0);
  // Call destination for its side effect.
  call_c(tmp);

  pop_frame();
  restore_LR_CR(tmp);
  restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
}

const char* stop_types[] = {
  "stop",
  "untested",
  "unimplemented",
  "shouldnotreachhere"
};

static void stop_on_request(int tp, const char* msg) {
  tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg);
  guarantee(false, "PPC assembly code requires stop: %s", msg);
}

// Call a C-function that prints output.
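// Loads the stop type and message, calls stop_on_request() above, then emits
// an illtrap followed by the 32-bit id so the trapping PC identifies the
// stop site.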
void MacroAssembler::stop(int type, const char* msg, int id) {
#ifndef PRODUCT
  block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));
#else
  block_comment("stop {");
#endif

  // setup arguments
  load_const_optimized(R3_ARG1, type);
  load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
  illtrap();
  emit_int32(id);
  block_comment("} stop;");
}

#ifndef PRODUCT
// Write pattern 0x0101010101010101 in memory region [low-before, high+after].
// Val, addr are temp registers.
// If low == addr, addr is killed.
// High is preserved.
void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
  if (!ZapMemory) return;

  assert_different_registers(low, val);

  BLOCK_COMMENT("zap memory region {");
  load_const_optimized(val, 0x0101010101010101);
  int size = before + after;
  if (low == high && size < 5 && size > 0) {
    int offset = -before*BytesPerWord;
    for (int i = 0; i < size; ++i) {
      std(val, offset, low);
      offset += (1*BytesPerWord);
    }
  } else {
    addi(addr, low, -before*BytesPerWord);
    assert_different_registers(high, val);
    if (after) addi(high, high, after * BytesPerWord);
    Label loop;
    bind(loop);
    std(val, 0, addr);
    addi(addr, addr, 8);
    cmpd(CCR6, addr, high);
    ble(CCR6, loop);
    if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value.
  }
  BLOCK_COMMENT("} zap memory region");
}

#endif // !PRODUCT

void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,
                                                  const bool* flag_addr, Label& label) {
  int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
  assert(sizeof(bool) == 1, "PowerPC ABI");
  masm->lbz(temp, simm16_offset, temp);
  masm->cmpwi(CCR0, temp, 0);
  masm->beq(CCR0, label);
}

SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
  skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);
}

SkipIfEqualZero::~SkipIfEqualZero() {
  _masm->bind(_label);
}
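
// Typical use of SkipIfEqualZero (a hypothetical sketch; SomeBoolFlag is a
// placeholder, not a real flag):
//
//   {
//     SkipIfEqualZero skip(masm, temp, &SomeBoolFlag);
//     // ...code emitted here is branched over at runtime whenever
//     //    SomeBoolFlag is false (zero)...
//   } // Destructor binds the skip target here.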