1 /* 2 * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2012, 2018, SAP SE. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.inline.hpp" 28 #include "compiler/disassembler.hpp" 29 #include "gc/shared/cardTable.hpp" 30 #include "gc/shared/cardTableBarrierSet.hpp" 31 #include "gc/shared/collectedHeap.inline.hpp" 32 #include "interpreter/interpreter.hpp" 33 #include "memory/resourceArea.hpp" 34 #include "nativeInst_ppc.hpp" 35 #include "prims/methodHandles.hpp" 36 #include "runtime/biasedLocking.hpp" 37 #include "runtime/icache.hpp" 38 #include "runtime/interfaceSupport.inline.hpp" 39 #include "runtime/objectMonitor.hpp" 40 #include "runtime/os.hpp" 41 #include "runtime/safepoint.hpp" 42 #include "runtime/safepointMechanism.hpp" 43 #include "runtime/sharedRuntime.hpp" 44 #include "runtime/stubRoutines.hpp" 45 #include "utilities/macros.hpp" 46 #if INCLUDE_ALL_GCS 47 #include "gc/g1/g1BarrierSet.hpp" 48 #include "gc/g1/g1CardTable.hpp" 49 #include "gc/g1/g1ThreadLocalData.hpp" 50 #include "gc/g1/heapRegion.hpp" 51 #endif // INCLUDE_ALL_GCS 52 #ifdef COMPILER2 53 #include "opto/intrinsicnode.hpp" 54 #endif 55 56 #ifdef PRODUCT 57 #define BLOCK_COMMENT(str) // nothing 58 #else 59 #define BLOCK_COMMENT(str) block_comment(str) 60 #endif 61 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 62 63 #ifdef ASSERT 64 // On RISC, there's no benefit to verifying instruction boundaries. 65 bool AbstractAssembler::pd_check_instruction_mark() { return false; } 66 #endif 67 68 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) { 69 assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range"); 70 if (Assembler::is_simm(si31, 16)) { 71 ld(d, si31, a); 72 if (emit_filler_nop) nop(); 73 } else { 74 const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31); 75 const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31); 76 addis(d, a, hi); 77 ld(d, lo, d); 78 } 79 } 80 81 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) { 82 assert_different_registers(d, a); 83 ld_largeoffset_unchecked(d, si31, a, emit_filler_nop); 84 } 85 86 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base, 87 size_t size_in_bytes, bool is_signed) { 88 switch (size_in_bytes) { 89 case 8: ld(dst, offs, base); break; 90 case 4: is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break; 91 case 2: is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break; 92 case 1: lbz(dst, offs, base); if (is_signed) extsb(dst, dst); break; // lba doesn't exist :( 93 default: ShouldNotReachHere(); 94 } 95 } 96 97 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base, 98 size_t size_in_bytes) { 99 switch (size_in_bytes) { 100 case 8: std(dst, offs, base); break; 101 case 4: stw(dst, offs, base); break; 102 case 2: sth(dst, offs, base); break; 103 case 1: stb(dst, offs, base); break; 104 default: ShouldNotReachHere(); 105 } 106 } 107 108 void MacroAssembler::align(int modulus, int max, int rem) { 109 int padding = (rem + modulus - (offset() % modulus)) % modulus; 110 if (padding > max) return; 111 for (int c = (padding >> 2); c > 0; --c) { nop(); } 112 } 113 114 // Issue instructions that calculate given TOC from global TOC. 115 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16, 116 bool add_relocation, bool emit_dummy_addr) { 117 int offset = -1; 118 if (emit_dummy_addr) { 119 offset = -128; // dummy address 120 } else if (addr != (address)(intptr_t)-1) { 121 offset = MacroAssembler::offset_to_global_toc(addr); 122 } 123 124 if (hi16) { 125 addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset)); 126 } 127 if (lo16) { 128 if (add_relocation) { 129 // Relocate at the addi to avoid confusion with a load from the method's TOC. 130 relocate(internal_word_Relocation::spec(addr)); 131 } 132 addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset)); 133 } 134 } 135 136 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) { 137 const int offset = MacroAssembler::offset_to_global_toc(addr); 138 139 const address inst2_addr = a; 140 const int inst2 = *(int *)inst2_addr; 141 142 // The relocation points to the second instruction, the addi, 143 // and the addi reads and writes the same register dst. 144 const int dst = inv_rt_field(inst2); 145 assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst"); 146 147 // Now, find the preceding addis which writes to dst. 148 int inst1 = 0; 149 address inst1_addr = inst2_addr - BytesPerInstWord; 150 while (inst1_addr >= bound) { 151 inst1 = *(int *) inst1_addr; 152 if (is_addis(inst1) && inv_rt_field(inst1) == dst) { 153 // Stop, found the addis which writes dst. 154 break; 155 } 156 inst1_addr -= BytesPerInstWord; 157 } 158 159 assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC"); 160 set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset)); 161 set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset)); 162 return inst1_addr; 163 } 164 165 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) { 166 const address inst2_addr = a; 167 const int inst2 = *(int *)inst2_addr; 168 169 // The relocation points to the second instruction, the addi, 170 // and the addi reads and writes the same register dst. 171 const int dst = inv_rt_field(inst2); 172 assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst"); 173 174 // Now, find the preceding addis which writes to dst. 175 int inst1 = 0; 176 address inst1_addr = inst2_addr - BytesPerInstWord; 177 while (inst1_addr >= bound) { 178 inst1 = *(int *) inst1_addr; 179 if (is_addis(inst1) && inv_rt_field(inst1) == dst) { 180 // stop, found the addis which writes dst 181 break; 182 } 183 inst1_addr -= BytesPerInstWord; 184 } 185 186 assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC"); 187 188 int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0); 189 // -1 is a special case 190 if (offset == -1) { 191 return (address)(intptr_t)-1; 192 } else { 193 return global_toc() + offset; 194 } 195 } 196 197 #ifdef _LP64 198 // Patch compressed oops or klass constants. 199 // Assembler sequence is 200 // 1) compressed oops: 201 // lis rx = const.hi 202 // ori rx = rx | const.lo 203 // 2) compressed klass: 204 // lis rx = const.hi 205 // clrldi rx = rx & 0xFFFFffff // clearMS32b, optional 206 // ori rx = rx | const.lo 207 // Clrldi will be passed by. 208 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) { 209 assert(UseCompressedOops, "Should only patch compressed oops"); 210 211 const address inst2_addr = a; 212 const int inst2 = *(int *)inst2_addr; 213 214 // The relocation points to the second instruction, the ori, 215 // and the ori reads and writes the same register dst. 216 const int dst = inv_rta_field(inst2); 217 assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst"); 218 // Now, find the preceding addis which writes to dst. 219 int inst1 = 0; 220 address inst1_addr = inst2_addr - BytesPerInstWord; 221 bool inst1_found = false; 222 while (inst1_addr >= bound) { 223 inst1 = *(int *)inst1_addr; 224 if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; } 225 inst1_addr -= BytesPerInstWord; 226 } 227 assert(inst1_found, "inst is not lis"); 228 229 int xc = (data >> 16) & 0xffff; 230 int xd = (data >> 0) & 0xffff; 231 232 set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo 233 set_imm((int *)inst2_addr, (xd)); // unsigned int 234 return inst1_addr; 235 } 236 237 // Get compressed oop or klass constant. 238 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) { 239 assert(UseCompressedOops, "Should only patch compressed oops"); 240 241 const address inst2_addr = a; 242 const int inst2 = *(int *)inst2_addr; 243 244 // The relocation points to the second instruction, the ori, 245 // and the ori reads and writes the same register dst. 246 const int dst = inv_rta_field(inst2); 247 assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst"); 248 // Now, find the preceding lis which writes to dst. 249 int inst1 = 0; 250 address inst1_addr = inst2_addr - BytesPerInstWord; 251 bool inst1_found = false; 252 253 while (inst1_addr >= bound) { 254 inst1 = *(int *) inst1_addr; 255 if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;} 256 inst1_addr -= BytesPerInstWord; 257 } 258 assert(inst1_found, "inst is not lis"); 259 260 uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff)); 261 uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16); 262 263 return (int) (xl | xh); 264 } 265 #endif // _LP64 266 267 // Returns true if successful. 268 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a, 269 Register toc, bool fixed_size) { 270 int toc_offset = 0; 271 // Use RelocationHolder::none for the constant pool entry, otherwise 272 // we will end up with a failing NativeCall::verify(x) where x is 273 // the address of the constant pool entry. 274 // FIXME: We should insert relocation information for oops at the constant 275 // pool entries instead of inserting it at the loads; patching of a constant 276 // pool entry should be less expensive. 277 address const_address = address_constant((address)a.value(), RelocationHolder::none); 278 if (const_address == NULL) { return false; } // allocation failure 279 // Relocate at the pc of the load. 280 relocate(a.rspec()); 281 toc_offset = (int)(const_address - code()->consts()->start()); 282 ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size); 283 return true; 284 } 285 286 bool MacroAssembler::is_load_const_from_method_toc_at(address a) { 287 const address inst1_addr = a; 288 const int inst1 = *(int *)inst1_addr; 289 290 // The relocation points to the ld or the addis. 291 return (is_ld(inst1)) || 292 (is_addis(inst1) && inv_ra_field(inst1) != 0); 293 } 294 295 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) { 296 assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc"); 297 298 const address inst1_addr = a; 299 const int inst1 = *(int *)inst1_addr; 300 301 if (is_ld(inst1)) { 302 return inv_d1_field(inst1); 303 } else if (is_addis(inst1)) { 304 const int dst = inv_rt_field(inst1); 305 306 // Now, find the succeeding ld which reads and writes to dst. 307 address inst2_addr = inst1_addr + BytesPerInstWord; 308 int inst2 = 0; 309 while (true) { 310 inst2 = *(int *) inst2_addr; 311 if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) { 312 // Stop, found the ld which reads and writes dst. 313 break; 314 } 315 inst2_addr += BytesPerInstWord; 316 } 317 return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2); 318 } 319 ShouldNotReachHere(); 320 return 0; 321 } 322 323 // Get the constant from a `load_const' sequence. 324 long MacroAssembler::get_const(address a) { 325 assert(is_load_const_at(a), "not a load of a constant"); 326 const int *p = (const int*) a; 327 unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48); 328 if (is_ori(*(p+1))) { 329 x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32); 330 x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16); 331 x |= (((unsigned long) (get_imm(a,4) & 0xffff))); 332 } else if (is_lis(*(p+1))) { 333 x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32); 334 x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16); 335 x |= (((unsigned long) (get_imm(a,3) & 0xffff))); 336 } else { 337 ShouldNotReachHere(); 338 return (long) 0; 339 } 340 return (long) x; 341 } 342 343 // Patch the 64 bit constant of a `load_const' sequence. This is a low 344 // level procedure. It neither flushes the instruction cache nor is it 345 // mt safe. 346 void MacroAssembler::patch_const(address a, long x) { 347 assert(is_load_const_at(a), "not a load of a constant"); 348 int *p = (int*) a; 349 if (is_ori(*(p+1))) { 350 set_imm(0 + p, (x >> 48) & 0xffff); 351 set_imm(1 + p, (x >> 32) & 0xffff); 352 set_imm(3 + p, (x >> 16) & 0xffff); 353 set_imm(4 + p, x & 0xffff); 354 } else if (is_lis(*(p+1))) { 355 set_imm(0 + p, (x >> 48) & 0xffff); 356 set_imm(2 + p, (x >> 32) & 0xffff); 357 set_imm(1 + p, (x >> 16) & 0xffff); 358 set_imm(3 + p, x & 0xffff); 359 } else { 360 ShouldNotReachHere(); 361 } 362 } 363 364 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) { 365 assert(oop_recorder() != NULL, "this assembler needs a Recorder"); 366 int index = oop_recorder()->allocate_metadata_index(obj); 367 RelocationHolder rspec = metadata_Relocation::spec(index); 368 return AddressLiteral((address)obj, rspec); 369 } 370 371 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) { 372 assert(oop_recorder() != NULL, "this assembler needs a Recorder"); 373 int index = oop_recorder()->find_index(obj); 374 RelocationHolder rspec = metadata_Relocation::spec(index); 375 return AddressLiteral((address)obj, rspec); 376 } 377 378 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) { 379 assert(oop_recorder() != NULL, "this assembler needs an OopRecorder"); 380 int oop_index = oop_recorder()->allocate_oop_index(obj); 381 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index)); 382 } 383 384 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) { 385 assert(oop_recorder() != NULL, "this assembler needs an OopRecorder"); 386 int oop_index = oop_recorder()->find_index(obj); 387 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index)); 388 } 389 390 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr, 391 Register tmp, int offset) { 392 intptr_t value = *delayed_value_addr; 393 if (value != 0) { 394 return RegisterOrConstant(value + offset); 395 } 396 397 // Load indirectly to solve generation ordering problem. 398 // static address, no relocation 399 int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true); 400 ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0) 401 402 if (offset != 0) { 403 addi(tmp, tmp, offset); 404 } 405 406 return RegisterOrConstant(tmp); 407 } 408 409 #ifndef PRODUCT 410 void MacroAssembler::pd_print_patched_instruction(address branch) { 411 Unimplemented(); // TODO: PPC port 412 } 413 #endif // ndef PRODUCT 414 415 // Conditional far branch for destinations encodable in 24+2 bits. 416 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) { 417 418 // If requested by flag optimize, relocate the bc_far as a 419 // runtime_call and prepare for optimizing it when the code gets 420 // relocated. 421 if (optimize == bc_far_optimize_on_relocate) { 422 relocate(relocInfo::runtime_call_type); 423 } 424 425 // variant 2: 426 // 427 // b!cxx SKIP 428 // bxx DEST 429 // SKIP: 430 // 431 432 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)), 433 opposite_bcond(inv_boint_bcond(boint))); 434 435 // We emit two branches. 436 // First, a conditional branch which jumps around the far branch. 437 const address not_taken_pc = pc() + 2 * BytesPerInstWord; 438 const address bc_pc = pc(); 439 bc(opposite_boint, biint, not_taken_pc); 440 441 const int bc_instr = *(int*)bc_pc; 442 assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition"); 443 assert(opposite_boint == inv_bo_field(bc_instr), "postcondition"); 444 assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))), 445 opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))), 446 "postcondition"); 447 assert(biint == inv_bi_field(bc_instr), "postcondition"); 448 449 // Second, an unconditional far branch which jumps to dest. 450 // Note: target(dest) remembers the current pc (see CodeSection::target) 451 // and returns the current pc if the label is not bound yet; when 452 // the label gets bound, the unconditional far branch will be patched. 453 const address target_pc = target(dest); 454 const address b_pc = pc(); 455 b(target_pc); 456 457 assert(not_taken_pc == pc(), "postcondition"); 458 assert(dest.is_bound() || target_pc == b_pc, "postcondition"); 459 } 460 461 // 1 or 2 instructions 462 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) { 463 if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) { 464 bc(boint, biint, dest); 465 } else { 466 bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate); 467 } 468 } 469 470 bool MacroAssembler::is_bc_far_at(address instruction_addr) { 471 return is_bc_far_variant1_at(instruction_addr) || 472 is_bc_far_variant2_at(instruction_addr) || 473 is_bc_far_variant3_at(instruction_addr); 474 } 475 476 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) { 477 if (is_bc_far_variant1_at(instruction_addr)) { 478 const address instruction_1_addr = instruction_addr; 479 const int instruction_1 = *(int*)instruction_1_addr; 480 return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr); 481 } else if (is_bc_far_variant2_at(instruction_addr)) { 482 const address instruction_2_addr = instruction_addr + 4; 483 return bxx_destination(instruction_2_addr); 484 } else if (is_bc_far_variant3_at(instruction_addr)) { 485 return instruction_addr + 8; 486 } 487 // variant 4 ??? 488 ShouldNotReachHere(); 489 return NULL; 490 } 491 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) { 492 493 if (is_bc_far_variant3_at(instruction_addr)) { 494 // variant 3, far cond branch to the next instruction, already patched to nops: 495 // 496 // nop 497 // endgroup 498 // SKIP/DEST: 499 // 500 return; 501 } 502 503 // first, extract boint and biint from the current branch 504 int boint = 0; 505 int biint = 0; 506 507 ResourceMark rm; 508 const int code_size = 2 * BytesPerInstWord; 509 CodeBuffer buf(instruction_addr, code_size); 510 MacroAssembler masm(&buf); 511 if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) { 512 // Far branch to next instruction: Optimize it by patching nops (produce variant 3). 513 masm.nop(); 514 masm.endgroup(); 515 } else { 516 if (is_bc_far_variant1_at(instruction_addr)) { 517 // variant 1, the 1st instruction contains the destination address: 518 // 519 // bcxx DEST 520 // nop 521 // 522 const int instruction_1 = *(int*)(instruction_addr); 523 boint = inv_bo_field(instruction_1); 524 biint = inv_bi_field(instruction_1); 525 } else if (is_bc_far_variant2_at(instruction_addr)) { 526 // variant 2, the 2nd instruction contains the destination address: 527 // 528 // b!cxx SKIP 529 // bxx DEST 530 // SKIP: 531 // 532 const int instruction_1 = *(int*)(instruction_addr); 533 boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))), 534 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1)))); 535 biint = inv_bi_field(instruction_1); 536 } else { 537 // variant 4??? 538 ShouldNotReachHere(); 539 } 540 541 // second, set the new branch destination and optimize the code 542 if (dest != instruction_addr + 4 && // the bc_far is still unbound! 543 masm.is_within_range_of_bcxx(dest, instruction_addr)) { 544 // variant 1: 545 // 546 // bcxx DEST 547 // nop 548 // 549 masm.bc(boint, biint, dest); 550 masm.nop(); 551 } else { 552 // variant 2: 553 // 554 // b!cxx SKIP 555 // bxx DEST 556 // SKIP: 557 // 558 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)), 559 opposite_bcond(inv_boint_bcond(boint))); 560 const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord; 561 masm.bc(opposite_boint, biint, not_taken_pc); 562 masm.b(dest); 563 } 564 } 565 ICache::ppc64_flush_icache_bytes(instruction_addr, code_size); 566 } 567 568 // Emit a NOT mt-safe patchable 64 bit absolute call/jump. 569 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) { 570 // get current pc 571 uint64_t start_pc = (uint64_t) pc(); 572 573 const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last 574 const address pc_of_b = (address) (start_pc + (0*BytesPerInstWord)); // b is first 575 576 // relocate here 577 if (rt != relocInfo::none) { 578 relocate(rt); 579 } 580 581 if ( ReoptimizeCallSequences && 582 (( link && is_within_range_of_b(dest, pc_of_bl)) || 583 (!link && is_within_range_of_b(dest, pc_of_b)))) { 584 // variant 2: 585 // Emit an optimized, pc-relative call/jump. 586 587 if (link) { 588 // some padding 589 nop(); 590 nop(); 591 nop(); 592 nop(); 593 nop(); 594 nop(); 595 596 // do the call 597 assert(pc() == pc_of_bl, "just checking"); 598 bl(dest, relocInfo::none); 599 } else { 600 // do the jump 601 assert(pc() == pc_of_b, "just checking"); 602 b(dest, relocInfo::none); 603 604 // some padding 605 nop(); 606 nop(); 607 nop(); 608 nop(); 609 nop(); 610 nop(); 611 } 612 613 // Assert that we can identify the emitted call/jump. 614 assert(is_bxx64_patchable_variant2_at((address)start_pc, link), 615 "can't identify emitted call"); 616 } else { 617 // variant 1: 618 mr(R0, R11); // spill R11 -> R0. 619 620 // Load the destination address into CTR, 621 // calculate destination relative to global toc. 622 calculate_address_from_global_toc(R11, dest, true, true, false); 623 624 mtctr(R11); 625 mr(R11, R0); // spill R11 <- R0. 626 nop(); 627 628 // do the call/jump 629 if (link) { 630 bctrl(); 631 } else{ 632 bctr(); 633 } 634 // Assert that we can identify the emitted call/jump. 635 assert(is_bxx64_patchable_variant1b_at((address)start_pc, link), 636 "can't identify emitted call"); 637 } 638 639 // Assert that we can identify the emitted call/jump. 640 assert(is_bxx64_patchable_at((address)start_pc, link), 641 "can't identify emitted call"); 642 assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest, 643 "wrong encoding of dest address"); 644 } 645 646 // Identify a bxx64_patchable instruction. 647 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) { 648 return is_bxx64_patchable_variant1b_at(instruction_addr, link) 649 //|| is_bxx64_patchable_variant1_at(instruction_addr, link) 650 || is_bxx64_patchable_variant2_at(instruction_addr, link); 651 } 652 653 // Does the call64_patchable instruction use a pc-relative encoding of 654 // the call destination? 655 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) { 656 // variant 2 is pc-relative 657 return is_bxx64_patchable_variant2_at(instruction_addr, link); 658 } 659 660 // Identify variant 1. 661 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) { 662 unsigned int* instr = (unsigned int*) instruction_addr; 663 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l] 664 && is_mtctr(instr[5]) // mtctr 665 && is_load_const_at(instruction_addr); 666 } 667 668 // Identify variant 1b: load destination relative to global toc. 669 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) { 670 unsigned int* instr = (unsigned int*) instruction_addr; 671 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l] 672 && is_mtctr(instr[3]) // mtctr 673 && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr); 674 } 675 676 // Identify variant 2. 677 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) { 678 unsigned int* instr = (unsigned int*) instruction_addr; 679 if (link) { 680 return is_bl (instr[6]) // bl dest is last 681 && is_nop(instr[0]) // nop 682 && is_nop(instr[1]) // nop 683 && is_nop(instr[2]) // nop 684 && is_nop(instr[3]) // nop 685 && is_nop(instr[4]) // nop 686 && is_nop(instr[5]); // nop 687 } else { 688 return is_b (instr[0]) // b dest is first 689 && is_nop(instr[1]) // nop 690 && is_nop(instr[2]) // nop 691 && is_nop(instr[3]) // nop 692 && is_nop(instr[4]) // nop 693 && is_nop(instr[5]) // nop 694 && is_nop(instr[6]); // nop 695 } 696 } 697 698 // Set dest address of a bxx64_patchable instruction. 699 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) { 700 ResourceMark rm; 701 int code_size = MacroAssembler::bxx64_patchable_size; 702 CodeBuffer buf(instruction_addr, code_size); 703 MacroAssembler masm(&buf); 704 masm.bxx64_patchable(dest, relocInfo::none, link); 705 ICache::ppc64_flush_icache_bytes(instruction_addr, code_size); 706 } 707 708 // Get dest address of a bxx64_patchable instruction. 709 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) { 710 if (is_bxx64_patchable_variant1_at(instruction_addr, link)) { 711 return (address) (unsigned long) get_const(instruction_addr); 712 } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) { 713 unsigned int* instr = (unsigned int*) instruction_addr; 714 if (link) { 715 const int instr_idx = 6; // bl is last 716 int branchoffset = branch_destination(instr[instr_idx], 0); 717 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord; 718 } else { 719 const int instr_idx = 0; // b is first 720 int branchoffset = branch_destination(instr[instr_idx], 0); 721 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord; 722 } 723 // Load dest relative to global toc. 724 } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) { 725 return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, 726 instruction_addr); 727 } else { 728 ShouldNotReachHere(); 729 return NULL; 730 } 731 } 732 733 // Uses ordering which corresponds to ABI: 734 // _savegpr0_14: std r14,-144(r1) 735 // _savegpr0_15: std r15,-136(r1) 736 // _savegpr0_16: std r16,-128(r1) 737 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) { 738 std(R14, offset, dst); offset += 8; 739 std(R15, offset, dst); offset += 8; 740 std(R16, offset, dst); offset += 8; 741 std(R17, offset, dst); offset += 8; 742 std(R18, offset, dst); offset += 8; 743 std(R19, offset, dst); offset += 8; 744 std(R20, offset, dst); offset += 8; 745 std(R21, offset, dst); offset += 8; 746 std(R22, offset, dst); offset += 8; 747 std(R23, offset, dst); offset += 8; 748 std(R24, offset, dst); offset += 8; 749 std(R25, offset, dst); offset += 8; 750 std(R26, offset, dst); offset += 8; 751 std(R27, offset, dst); offset += 8; 752 std(R28, offset, dst); offset += 8; 753 std(R29, offset, dst); offset += 8; 754 std(R30, offset, dst); offset += 8; 755 std(R31, offset, dst); offset += 8; 756 757 stfd(F14, offset, dst); offset += 8; 758 stfd(F15, offset, dst); offset += 8; 759 stfd(F16, offset, dst); offset += 8; 760 stfd(F17, offset, dst); offset += 8; 761 stfd(F18, offset, dst); offset += 8; 762 stfd(F19, offset, dst); offset += 8; 763 stfd(F20, offset, dst); offset += 8; 764 stfd(F21, offset, dst); offset += 8; 765 stfd(F22, offset, dst); offset += 8; 766 stfd(F23, offset, dst); offset += 8; 767 stfd(F24, offset, dst); offset += 8; 768 stfd(F25, offset, dst); offset += 8; 769 stfd(F26, offset, dst); offset += 8; 770 stfd(F27, offset, dst); offset += 8; 771 stfd(F28, offset, dst); offset += 8; 772 stfd(F29, offset, dst); offset += 8; 773 stfd(F30, offset, dst); offset += 8; 774 stfd(F31, offset, dst); 775 } 776 777 // Uses ordering which corresponds to ABI: 778 // _restgpr0_14: ld r14,-144(r1) 779 // _restgpr0_15: ld r15,-136(r1) 780 // _restgpr0_16: ld r16,-128(r1) 781 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) { 782 ld(R14, offset, src); offset += 8; 783 ld(R15, offset, src); offset += 8; 784 ld(R16, offset, src); offset += 8; 785 ld(R17, offset, src); offset += 8; 786 ld(R18, offset, src); offset += 8; 787 ld(R19, offset, src); offset += 8; 788 ld(R20, offset, src); offset += 8; 789 ld(R21, offset, src); offset += 8; 790 ld(R22, offset, src); offset += 8; 791 ld(R23, offset, src); offset += 8; 792 ld(R24, offset, src); offset += 8; 793 ld(R25, offset, src); offset += 8; 794 ld(R26, offset, src); offset += 8; 795 ld(R27, offset, src); offset += 8; 796 ld(R28, offset, src); offset += 8; 797 ld(R29, offset, src); offset += 8; 798 ld(R30, offset, src); offset += 8; 799 ld(R31, offset, src); offset += 8; 800 801 // FP registers 802 lfd(F14, offset, src); offset += 8; 803 lfd(F15, offset, src); offset += 8; 804 lfd(F16, offset, src); offset += 8; 805 lfd(F17, offset, src); offset += 8; 806 lfd(F18, offset, src); offset += 8; 807 lfd(F19, offset, src); offset += 8; 808 lfd(F20, offset, src); offset += 8; 809 lfd(F21, offset, src); offset += 8; 810 lfd(F22, offset, src); offset += 8; 811 lfd(F23, offset, src); offset += 8; 812 lfd(F24, offset, src); offset += 8; 813 lfd(F25, offset, src); offset += 8; 814 lfd(F26, offset, src); offset += 8; 815 lfd(F27, offset, src); offset += 8; 816 lfd(F28, offset, src); offset += 8; 817 lfd(F29, offset, src); offset += 8; 818 lfd(F30, offset, src); offset += 8; 819 lfd(F31, offset, src); 820 } 821 822 // For verify_oops. 823 void MacroAssembler::save_volatile_gprs(Register dst, int offset) { 824 std(R2, offset, dst); offset += 8; 825 std(R3, offset, dst); offset += 8; 826 std(R4, offset, dst); offset += 8; 827 std(R5, offset, dst); offset += 8; 828 std(R6, offset, dst); offset += 8; 829 std(R7, offset, dst); offset += 8; 830 std(R8, offset, dst); offset += 8; 831 std(R9, offset, dst); offset += 8; 832 std(R10, offset, dst); offset += 8; 833 std(R11, offset, dst); offset += 8; 834 std(R12, offset, dst); offset += 8; 835 836 stfd(F0, offset, dst); offset += 8; 837 stfd(F1, offset, dst); offset += 8; 838 stfd(F2, offset, dst); offset += 8; 839 stfd(F3, offset, dst); offset += 8; 840 stfd(F4, offset, dst); offset += 8; 841 stfd(F5, offset, dst); offset += 8; 842 stfd(F6, offset, dst); offset += 8; 843 stfd(F7, offset, dst); offset += 8; 844 stfd(F8, offset, dst); offset += 8; 845 stfd(F9, offset, dst); offset += 8; 846 stfd(F10, offset, dst); offset += 8; 847 stfd(F11, offset, dst); offset += 8; 848 stfd(F12, offset, dst); offset += 8; 849 stfd(F13, offset, dst); 850 } 851 852 // For verify_oops. 853 void MacroAssembler::restore_volatile_gprs(Register src, int offset) { 854 ld(R2, offset, src); offset += 8; 855 ld(R3, offset, src); offset += 8; 856 ld(R4, offset, src); offset += 8; 857 ld(R5, offset, src); offset += 8; 858 ld(R6, offset, src); offset += 8; 859 ld(R7, offset, src); offset += 8; 860 ld(R8, offset, src); offset += 8; 861 ld(R9, offset, src); offset += 8; 862 ld(R10, offset, src); offset += 8; 863 ld(R11, offset, src); offset += 8; 864 ld(R12, offset, src); offset += 8; 865 866 lfd(F0, offset, src); offset += 8; 867 lfd(F1, offset, src); offset += 8; 868 lfd(F2, offset, src); offset += 8; 869 lfd(F3, offset, src); offset += 8; 870 lfd(F4, offset, src); offset += 8; 871 lfd(F5, offset, src); offset += 8; 872 lfd(F6, offset, src); offset += 8; 873 lfd(F7, offset, src); offset += 8; 874 lfd(F8, offset, src); offset += 8; 875 lfd(F9, offset, src); offset += 8; 876 lfd(F10, offset, src); offset += 8; 877 lfd(F11, offset, src); offset += 8; 878 lfd(F12, offset, src); offset += 8; 879 lfd(F13, offset, src); 880 } 881 882 void MacroAssembler::save_LR_CR(Register tmp) { 883 mfcr(tmp); 884 std(tmp, _abi(cr), R1_SP); 885 mflr(tmp); 886 std(tmp, _abi(lr), R1_SP); 887 // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad) 888 } 889 890 void MacroAssembler::restore_LR_CR(Register tmp) { 891 assert(tmp != R1_SP, "must be distinct"); 892 ld(tmp, _abi(lr), R1_SP); 893 mtlr(tmp); 894 ld(tmp, _abi(cr), R1_SP); 895 mtcr(tmp); 896 } 897 898 address MacroAssembler::get_PC_trash_LR(Register result) { 899 Label L; 900 bl(L); 901 bind(L); 902 address lr_pc = pc(); 903 mflr(result); 904 return lr_pc; 905 } 906 907 void MacroAssembler::resize_frame(Register offset, Register tmp) { 908 #ifdef ASSERT 909 assert_different_registers(offset, tmp, R1_SP); 910 andi_(tmp, offset, frame::alignment_in_bytes-1); 911 asm_assert_eq("resize_frame: unaligned", 0x204); 912 #endif 913 914 // tmp <- *(SP) 915 ld(tmp, _abi(callers_sp), R1_SP); 916 // addr <- SP + offset; 917 // *(addr) <- tmp; 918 // SP <- addr 919 stdux(tmp, R1_SP, offset); 920 } 921 922 void MacroAssembler::resize_frame(int offset, Register tmp) { 923 assert(is_simm(offset, 16), "too big an offset"); 924 assert_different_registers(tmp, R1_SP); 925 assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned"); 926 // tmp <- *(SP) 927 ld(tmp, _abi(callers_sp), R1_SP); 928 // addr <- SP + offset; 929 // *(addr) <- tmp; 930 // SP <- addr 931 stdu(tmp, offset, R1_SP); 932 } 933 934 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) { 935 // (addr == tmp1) || (addr == tmp2) is allowed here! 936 assert(tmp1 != tmp2, "must be distinct"); 937 938 // compute offset w.r.t. current stack pointer 939 // tmp_1 <- addr - SP (!) 940 subf(tmp1, R1_SP, addr); 941 942 // atomically update SP keeping back link. 943 resize_frame(tmp1/* offset */, tmp2/* tmp */); 944 } 945 946 void MacroAssembler::push_frame(Register bytes, Register tmp) { 947 #ifdef ASSERT 948 assert(bytes != R0, "r0 not allowed here"); 949 andi_(R0, bytes, frame::alignment_in_bytes-1); 950 asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203); 951 #endif 952 neg(tmp, bytes); 953 stdux(R1_SP, R1_SP, tmp); 954 } 955 956 // Push a frame of size `bytes'. 957 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) { 958 long offset = align_addr(bytes, frame::alignment_in_bytes); 959 if (is_simm(-offset, 16)) { 960 stdu(R1_SP, -offset, R1_SP); 961 } else { 962 load_const_optimized(tmp, -offset); 963 stdux(R1_SP, R1_SP, tmp); 964 } 965 } 966 967 // Push a frame of size `bytes' plus abi_reg_args on top. 968 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) { 969 push_frame(bytes + frame::abi_reg_args_size, tmp); 970 } 971 972 // Setup up a new C frame with a spill area for non-volatile GPRs and 973 // additional space for local variables. 974 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes, 975 Register tmp) { 976 push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp); 977 } 978 979 // Pop current C frame. 980 void MacroAssembler::pop_frame() { 981 ld(R1_SP, _abi(callers_sp), R1_SP); 982 } 983 984 #if defined(ABI_ELFv2) 985 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) { 986 // TODO(asmundak): make sure the caller uses R12 as function descriptor 987 // most of the times. 988 if (R12 != r_function_entry) { 989 mr(R12, r_function_entry); 990 } 991 mtctr(R12); 992 // Do a call or a branch. 993 if (and_link) { 994 bctrl(); 995 } else { 996 bctr(); 997 } 998 _last_calls_return_pc = pc(); 999 1000 return _last_calls_return_pc; 1001 } 1002 1003 // Call a C function via a function descriptor and use full C 1004 // calling conventions. Updates and returns _last_calls_return_pc. 1005 address MacroAssembler::call_c(Register r_function_entry) { 1006 return branch_to(r_function_entry, /*and_link=*/true); 1007 } 1008 1009 // For tail calls: only branch, don't link, so callee returns to caller of this function. 1010 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) { 1011 return branch_to(r_function_entry, /*and_link=*/false); 1012 } 1013 1014 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) { 1015 load_const(R12, function_entry, R0); 1016 return branch_to(R12, /*and_link=*/true); 1017 } 1018 1019 #else 1020 // Generic version of a call to C function via a function descriptor 1021 // with variable support for C calling conventions (TOC, ENV, etc.). 1022 // Updates and returns _last_calls_return_pc. 1023 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call, 1024 bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) { 1025 // we emit standard ptrgl glue code here 1026 assert((function_descriptor != R0), "function_descriptor cannot be R0"); 1027 1028 // retrieve necessary entries from the function descriptor 1029 ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor); 1030 mtctr(R0); 1031 1032 if (load_toc_of_callee) { 1033 ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor); 1034 } 1035 if (load_env_of_callee) { 1036 ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor); 1037 } else if (load_toc_of_callee) { 1038 li(R11, 0); 1039 } 1040 1041 // do a call or a branch 1042 if (and_link) { 1043 bctrl(); 1044 } else { 1045 bctr(); 1046 } 1047 _last_calls_return_pc = pc(); 1048 1049 return _last_calls_return_pc; 1050 } 1051 1052 // Call a C function via a function descriptor and use full C calling 1053 // conventions. 1054 // We don't use the TOC in generated code, so there is no need to save 1055 // and restore its value. 1056 address MacroAssembler::call_c(Register fd) { 1057 return branch_to(fd, /*and_link=*/true, 1058 /*save toc=*/false, 1059 /*restore toc=*/false, 1060 /*load toc=*/true, 1061 /*load env=*/true); 1062 } 1063 1064 address MacroAssembler::call_c_and_return_to_caller(Register fd) { 1065 return branch_to(fd, /*and_link=*/false, 1066 /*save toc=*/false, 1067 /*restore toc=*/false, 1068 /*load toc=*/true, 1069 /*load env=*/true); 1070 } 1071 1072 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) { 1073 if (rt != relocInfo::none) { 1074 // this call needs to be relocatable 1075 if (!ReoptimizeCallSequences 1076 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none) 1077 || fd == NULL // support code-size estimation 1078 || !fd->is_friend_function() 1079 || fd->entry() == NULL) { 1080 // it's not a friend function as defined by class FunctionDescriptor, 1081 // so do a full call-c here. 1082 load_const(R11, (address)fd, R0); 1083 1084 bool has_env = (fd != NULL && fd->env() != NULL); 1085 return branch_to(R11, /*and_link=*/true, 1086 /*save toc=*/false, 1087 /*restore toc=*/false, 1088 /*load toc=*/true, 1089 /*load env=*/has_env); 1090 } else { 1091 // It's a friend function. Load the entry point and don't care about 1092 // toc and env. Use an optimizable call instruction, but ensure the 1093 // same code-size as in the case of a non-friend function. 1094 nop(); 1095 nop(); 1096 nop(); 1097 bl64_patchable(fd->entry(), rt); 1098 _last_calls_return_pc = pc(); 1099 return _last_calls_return_pc; 1100 } 1101 } else { 1102 // This call does not need to be relocatable, do more aggressive 1103 // optimizations. 1104 if (!ReoptimizeCallSequences 1105 || !fd->is_friend_function()) { 1106 // It's not a friend function as defined by class FunctionDescriptor, 1107 // so do a full call-c here. 1108 load_const(R11, (address)fd, R0); 1109 return branch_to(R11, /*and_link=*/true, 1110 /*save toc=*/false, 1111 /*restore toc=*/false, 1112 /*load toc=*/true, 1113 /*load env=*/true); 1114 } else { 1115 // it's a friend function, load the entry point and don't care about 1116 // toc and env. 1117 address dest = fd->entry(); 1118 if (is_within_range_of_b(dest, pc())) { 1119 bl(dest); 1120 } else { 1121 bl64_patchable(dest, rt); 1122 } 1123 _last_calls_return_pc = pc(); 1124 return _last_calls_return_pc; 1125 } 1126 } 1127 } 1128 1129 // Call a C function. All constants needed reside in TOC. 1130 // 1131 // Read the address to call from the TOC. 1132 // Read env from TOC, if fd specifies an env. 1133 // Read new TOC from TOC. 1134 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd, 1135 relocInfo::relocType rt, Register toc) { 1136 if (!ReoptimizeCallSequences 1137 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none) 1138 || !fd->is_friend_function()) { 1139 // It's not a friend function as defined by class FunctionDescriptor, 1140 // so do a full call-c here. 1141 assert(fd->entry() != NULL, "function must be linked"); 1142 1143 AddressLiteral fd_entry(fd->entry()); 1144 bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true); 1145 mtctr(R11); 1146 if (fd->env() == NULL) { 1147 li(R11, 0); 1148 nop(); 1149 } else { 1150 AddressLiteral fd_env(fd->env()); 1151 success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true); 1152 } 1153 AddressLiteral fd_toc(fd->toc()); 1154 // Set R2_TOC (load from toc) 1155 success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true); 1156 bctrl(); 1157 _last_calls_return_pc = pc(); 1158 if (!success) { return NULL; } 1159 } else { 1160 // It's a friend function, load the entry point and don't care about 1161 // toc and env. Use an optimizable call instruction, but ensure the 1162 // same code-size as in the case of a non-friend function. 1163 nop(); 1164 bl64_patchable(fd->entry(), rt); 1165 _last_calls_return_pc = pc(); 1166 } 1167 return _last_calls_return_pc; 1168 } 1169 #endif // ABI_ELFv2 1170 1171 void MacroAssembler::call_VM_base(Register oop_result, 1172 Register last_java_sp, 1173 address entry_point, 1174 bool check_exceptions) { 1175 BLOCK_COMMENT("call_VM {"); 1176 // Determine last_java_sp register. 1177 if (!last_java_sp->is_valid()) { 1178 last_java_sp = R1_SP; 1179 } 1180 set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1); 1181 1182 // ARG1 must hold thread address. 1183 mr(R3_ARG1, R16_thread); 1184 #if defined(ABI_ELFv2) 1185 address return_pc = call_c(entry_point, relocInfo::none); 1186 #else 1187 address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none); 1188 #endif 1189 1190 reset_last_Java_frame(); 1191 1192 // Check for pending exceptions. 1193 if (check_exceptions) { 1194 // We don't check for exceptions here. 1195 ShouldNotReachHere(); 1196 } 1197 1198 // Get oop result if there is one and reset the value in the thread. 1199 if (oop_result->is_valid()) { 1200 get_vm_result(oop_result); 1201 } 1202 1203 _last_calls_return_pc = return_pc; 1204 BLOCK_COMMENT("} call_VM"); 1205 } 1206 1207 void MacroAssembler::call_VM_leaf_base(address entry_point) { 1208 BLOCK_COMMENT("call_VM_leaf {"); 1209 #if defined(ABI_ELFv2) 1210 call_c(entry_point, relocInfo::none); 1211 #else 1212 call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none); 1213 #endif 1214 BLOCK_COMMENT("} call_VM_leaf"); 1215 } 1216 1217 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) { 1218 call_VM_base(oop_result, noreg, entry_point, check_exceptions); 1219 } 1220 1221 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, 1222 bool check_exceptions) { 1223 // R3_ARG1 is reserved for the thread. 1224 mr_if_needed(R4_ARG2, arg_1); 1225 call_VM(oop_result, entry_point, check_exceptions); 1226 } 1227 1228 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, 1229 bool check_exceptions) { 1230 // R3_ARG1 is reserved for the thread 1231 mr_if_needed(R4_ARG2, arg_1); 1232 assert(arg_2 != R4_ARG2, "smashed argument"); 1233 mr_if_needed(R5_ARG3, arg_2); 1234 call_VM(oop_result, entry_point, check_exceptions); 1235 } 1236 1237 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3, 1238 bool check_exceptions) { 1239 // R3_ARG1 is reserved for the thread 1240 mr_if_needed(R4_ARG2, arg_1); 1241 assert(arg_2 != R4_ARG2, "smashed argument"); 1242 mr_if_needed(R5_ARG3, arg_2); 1243 mr_if_needed(R6_ARG4, arg_3); 1244 call_VM(oop_result, entry_point, check_exceptions); 1245 } 1246 1247 void MacroAssembler::call_VM_leaf(address entry_point) { 1248 call_VM_leaf_base(entry_point); 1249 } 1250 1251 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) { 1252 mr_if_needed(R3_ARG1, arg_1); 1253 call_VM_leaf(entry_point); 1254 } 1255 1256 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) { 1257 mr_if_needed(R3_ARG1, arg_1); 1258 assert(arg_2 != R3_ARG1, "smashed argument"); 1259 mr_if_needed(R4_ARG2, arg_2); 1260 call_VM_leaf(entry_point); 1261 } 1262 1263 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) { 1264 mr_if_needed(R3_ARG1, arg_1); 1265 assert(arg_2 != R3_ARG1, "smashed argument"); 1266 mr_if_needed(R4_ARG2, arg_2); 1267 assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument"); 1268 mr_if_needed(R5_ARG3, arg_3); 1269 call_VM_leaf(entry_point); 1270 } 1271 1272 // Check whether instruction is a read access to the polling page 1273 // which was emitted by load_from_polling_page(..). 1274 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext, 1275 address* polling_address_ptr) { 1276 if (!is_ld(instruction)) 1277 return false; // It's not a ld. Fail. 1278 1279 int rt = inv_rt_field(instruction); 1280 int ra = inv_ra_field(instruction); 1281 int ds = inv_ds_field(instruction); 1282 if (!(ds == 0 && ra != 0 && rt == 0)) { 1283 return false; // It's not a ld(r0, X, ra). Fail. 1284 } 1285 1286 if (!ucontext) { 1287 // Set polling address. 1288 if (polling_address_ptr != NULL) { 1289 *polling_address_ptr = NULL; 1290 } 1291 return true; // No ucontext given. Can't check value of ra. Assume true. 1292 } 1293 1294 #ifdef LINUX 1295 // Ucontext given. Check that register ra contains the address of 1296 // the safepoing polling page. 1297 ucontext_t* uc = (ucontext_t*) ucontext; 1298 // Set polling address. 1299 address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds; 1300 if (polling_address_ptr != NULL) { 1301 *polling_address_ptr = addr; 1302 } 1303 return os::is_poll_address(addr); 1304 #else 1305 // Not on Linux, ucontext must be NULL. 1306 ShouldNotReachHere(); 1307 return false; 1308 #endif 1309 } 1310 1311 bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) { 1312 #ifdef LINUX 1313 ucontext_t* uc = (ucontext_t*) ucontext; 1314 1315 if (is_stwx(instruction) || is_stwux(instruction)) { 1316 int ra = inv_ra_field(instruction); 1317 int rb = inv_rb_field(instruction); 1318 1319 // look up content of ra and rb in ucontext 1320 address ra_val=(address)uc->uc_mcontext.regs->gpr[ra]; 1321 long rb_val=(long)uc->uc_mcontext.regs->gpr[rb]; 1322 return os::is_memory_serialize_page(thread, ra_val+rb_val); 1323 } else if (is_stw(instruction) || is_stwu(instruction)) { 1324 int ra = inv_ra_field(instruction); 1325 int d1 = inv_d1_field(instruction); 1326 1327 // look up content of ra in ucontext 1328 address ra_val=(address)uc->uc_mcontext.regs->gpr[ra]; 1329 return os::is_memory_serialize_page(thread, ra_val+d1); 1330 } else { 1331 return false; 1332 } 1333 #else 1334 // workaround not needed on !LINUX :-) 1335 ShouldNotCallThis(); 1336 return false; 1337 #endif 1338 } 1339 1340 void MacroAssembler::bang_stack_with_offset(int offset) { 1341 // When increasing the stack, the old stack pointer will be written 1342 // to the new top of stack according to the PPC64 abi. 1343 // Therefore, stack banging is not necessary when increasing 1344 // the stack by <= os::vm_page_size() bytes. 1345 // When increasing the stack by a larger amount, this method is 1346 // called repeatedly to bang the intermediate pages. 1347 1348 // Stack grows down, caller passes positive offset. 1349 assert(offset > 0, "must bang with positive offset"); 1350 1351 long stdoffset = -offset; 1352 1353 if (is_simm(stdoffset, 16)) { 1354 // Signed 16 bit offset, a simple std is ok. 1355 if (UseLoadInstructionsForStackBangingPPC64) { 1356 ld(R0, (int)(signed short)stdoffset, R1_SP); 1357 } else { 1358 std(R0,(int)(signed short)stdoffset, R1_SP); 1359 } 1360 } else if (is_simm(stdoffset, 31)) { 1361 const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset); 1362 const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset); 1363 1364 Register tmp = R11; 1365 addis(tmp, R1_SP, hi); 1366 if (UseLoadInstructionsForStackBangingPPC64) { 1367 ld(R0, lo, tmp); 1368 } else { 1369 std(R0, lo, tmp); 1370 } 1371 } else { 1372 ShouldNotReachHere(); 1373 } 1374 } 1375 1376 // If instruction is a stack bang of the form 1377 // std R0, x(Ry), (see bang_stack_with_offset()) 1378 // stdu R1_SP, x(R1_SP), (see push_frame(), resize_frame()) 1379 // or stdux R1_SP, Rx, R1_SP (see push_frame(), resize_frame()) 1380 // return the banged address. Otherwise, return 0. 1381 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) { 1382 #ifdef LINUX 1383 ucontext_t* uc = (ucontext_t*) ucontext; 1384 int rs = inv_rs_field(instruction); 1385 int ra = inv_ra_field(instruction); 1386 if ( (is_ld(instruction) && rs == 0 && UseLoadInstructionsForStackBangingPPC64) 1387 || (is_std(instruction) && rs == 0 && !UseLoadInstructionsForStackBangingPPC64) 1388 || (is_stdu(instruction) && rs == 1)) { 1389 int ds = inv_ds_field(instruction); 1390 // return banged address 1391 return ds+(address)uc->uc_mcontext.regs->gpr[ra]; 1392 } else if (is_stdux(instruction) && rs == 1) { 1393 int rb = inv_rb_field(instruction); 1394 address sp = (address)uc->uc_mcontext.regs->gpr[1]; 1395 long rb_val = (long)uc->uc_mcontext.regs->gpr[rb]; 1396 return ra != 1 || rb_val >= 0 ? NULL // not a stack bang 1397 : sp + rb_val; // banged address 1398 } 1399 return NULL; // not a stack bang 1400 #else 1401 // workaround not needed on !LINUX :-) 1402 ShouldNotCallThis(); 1403 return NULL; 1404 #endif 1405 } 1406 1407 void MacroAssembler::reserved_stack_check(Register return_pc) { 1408 // Test if reserved zone needs to be enabled. 1409 Label no_reserved_zone_enabling; 1410 1411 ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread); 1412 cmpld(CCR0, R1_SP, R0); 1413 blt_predict_taken(CCR0, no_reserved_zone_enabling); 1414 1415 // Enable reserved zone again, throw stack overflow exception. 1416 push_frame_reg_args(0, R0); 1417 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread); 1418 pop_frame(); 1419 mtlr(return_pc); 1420 load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry()); 1421 mtctr(R0); 1422 bctr(); 1423 1424 should_not_reach_here(); 1425 1426 bind(no_reserved_zone_enabling); 1427 } 1428 1429 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base, 1430 bool cmpxchgx_hint) { 1431 Label retry; 1432 bind(retry); 1433 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1434 stdcx_(exchange_value, addr_base); 1435 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1436 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1437 } else { 1438 bne( CCR0, retry); // StXcx_ sets CCR0. 1439 } 1440 } 1441 1442 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base, 1443 Register tmp, bool cmpxchgx_hint) { 1444 Label retry; 1445 bind(retry); 1446 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1447 add(tmp, dest_current_value, inc_value); 1448 stdcx_(tmp, addr_base); 1449 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1450 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1451 } else { 1452 bne( CCR0, retry); // StXcx_ sets CCR0. 1453 } 1454 } 1455 1456 // Word/sub-word atomic helper functions 1457 1458 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions. 1459 // Only signed types are supported with size < 4. 1460 // Atomic add always kills tmp1. 1461 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value, 1462 Register addr_base, Register tmp1, Register tmp2, Register tmp3, 1463 bool cmpxchgx_hint, bool is_add, int size) { 1464 // Sub-word instructions are available since Power 8. 1465 // For older processors, instruction_type != size holds, and we 1466 // emulate the sub-word instructions by constructing a 4-byte value 1467 // that leaves the other bytes unchanged. 1468 const int instruction_type = VM_Version::has_lqarx() ? size : 4; 1469 1470 Label retry; 1471 Register shift_amount = noreg, 1472 val32 = dest_current_value, 1473 modval = is_add ? tmp1 : exchange_value; 1474 1475 if (instruction_type != size) { 1476 assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base); 1477 modval = tmp1; 1478 shift_amount = tmp2; 1479 val32 = tmp3; 1480 // Need some preperation: Compute shift amount, align address. Note: shorts must be 2 byte aligned. 1481 #ifdef VM_LITTLE_ENDIAN 1482 rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8; 1483 clrrdi(addr_base, addr_base, 2); 1484 #else 1485 xori(shift_amount, addr_base, (size == 1) ? 3 : 2); 1486 clrrdi(addr_base, addr_base, 2); 1487 rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16; 1488 #endif 1489 } 1490 1491 // atomic emulation loop 1492 bind(retry); 1493 1494 switch (instruction_type) { 1495 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break; 1496 case 2: lharx(val32, addr_base, cmpxchgx_hint); break; 1497 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break; 1498 default: ShouldNotReachHere(); 1499 } 1500 1501 if (instruction_type != size) { 1502 srw(dest_current_value, val32, shift_amount); 1503 } 1504 1505 if (is_add) { add(modval, dest_current_value, exchange_value); } 1506 1507 if (instruction_type != size) { 1508 // Transform exchange value such that the replacement can be done by one xor instruction. 1509 xorr(modval, dest_current_value, is_add ? modval : exchange_value); 1510 clrldi(modval, modval, (size == 1) ? 56 : 48); 1511 slw(modval, modval, shift_amount); 1512 xorr(modval, val32, modval); 1513 } 1514 1515 switch (instruction_type) { 1516 case 4: stwcx_(modval, addr_base); break; 1517 case 2: sthcx_(modval, addr_base); break; 1518 case 1: stbcx_(modval, addr_base); break; 1519 default: ShouldNotReachHere(); 1520 } 1521 1522 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1523 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1524 } else { 1525 bne( CCR0, retry); // StXcx_ sets CCR0. 1526 } 1527 1528 // l?arx zero-extends, but Java wants byte/short values sign-extended. 1529 if (size == 1) { 1530 extsb(dest_current_value, dest_current_value); 1531 } else if (size == 2) { 1532 extsh(dest_current_value, dest_current_value); 1533 }; 1534 } 1535 1536 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions. 1537 // Only signed types are supported with size < 4. 1538 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value, 1539 Register compare_value, Register exchange_value, 1540 Register addr_base, Register tmp1, Register tmp2, 1541 Label &retry, Label &failed, bool cmpxchgx_hint, int size) { 1542 // Sub-word instructions are available since Power 8. 1543 // For older processors, instruction_type != size holds, and we 1544 // emulate the sub-word instructions by constructing a 4-byte value 1545 // that leaves the other bytes unchanged. 1546 const int instruction_type = VM_Version::has_lqarx() ? size : 4; 1547 1548 Register shift_amount = noreg, 1549 val32 = dest_current_value, 1550 modval = exchange_value; 1551 1552 if (instruction_type != size) { 1553 assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base); 1554 shift_amount = tmp1; 1555 val32 = tmp2; 1556 modval = tmp2; 1557 // Need some preperation: Compute shift amount, align address. Note: shorts must be 2 byte aligned. 1558 #ifdef VM_LITTLE_ENDIAN 1559 rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8; 1560 clrrdi(addr_base, addr_base, 2); 1561 #else 1562 xori(shift_amount, addr_base, (size == 1) ? 3 : 2); 1563 clrrdi(addr_base, addr_base, 2); 1564 rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16; 1565 #endif 1566 // Transform exchange value such that the replacement can be done by one xor instruction. 1567 xorr(exchange_value, compare_value, exchange_value); 1568 clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48); 1569 slw(exchange_value, exchange_value, shift_amount); 1570 } 1571 1572 // atomic emulation loop 1573 bind(retry); 1574 1575 switch (instruction_type) { 1576 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break; 1577 case 2: lharx(val32, addr_base, cmpxchgx_hint); break; 1578 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break; 1579 default: ShouldNotReachHere(); 1580 } 1581 1582 if (instruction_type != size) { 1583 srw(dest_current_value, val32, shift_amount); 1584 } 1585 if (size == 1) { 1586 extsb(dest_current_value, dest_current_value); 1587 } else if (size == 2) { 1588 extsh(dest_current_value, dest_current_value); 1589 }; 1590 1591 cmpw(flag, dest_current_value, compare_value); 1592 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1593 bne_predict_not_taken(flag, failed); 1594 } else { 1595 bne( flag, failed); 1596 } 1597 // branch to done => (flag == ne), (dest_current_value != compare_value) 1598 // fall through => (flag == eq), (dest_current_value == compare_value) 1599 1600 if (instruction_type != size) { 1601 xorr(modval, val32, exchange_value); 1602 } 1603 1604 switch (instruction_type) { 1605 case 4: stwcx_(modval, addr_base); break; 1606 case 2: sthcx_(modval, addr_base); break; 1607 case 1: stbcx_(modval, addr_base); break; 1608 default: ShouldNotReachHere(); 1609 } 1610 } 1611 1612 // CmpxchgX sets condition register to cmpX(current, compare). 1613 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value, 1614 Register compare_value, Register exchange_value, 1615 Register addr_base, Register tmp1, Register tmp2, 1616 int semantics, bool cmpxchgx_hint, 1617 Register int_flag_success, bool contention_hint, bool weak, int size) { 1618 Label retry; 1619 Label failed; 1620 Label done; 1621 1622 // Save one branch if result is returned via register and 1623 // result register is different from the other ones. 1624 bool use_result_reg = (int_flag_success != noreg); 1625 bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value && 1626 int_flag_success != exchange_value && int_flag_success != addr_base && 1627 int_flag_success != tmp1 && int_flag_success != tmp2); 1628 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1629 assert(size == 1 || size == 2 || size == 4, "unsupported"); 1630 1631 if (use_result_reg && preset_result_reg) { 1632 li(int_flag_success, 0); // preset (assume cas failed) 1633 } 1634 1635 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1636 if (contention_hint) { // Don't try to reserve if cmp fails. 1637 switch (size) { 1638 case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break; 1639 case 2: lha(dest_current_value, 0, addr_base); break; 1640 case 4: lwz(dest_current_value, 0, addr_base); break; 1641 default: ShouldNotReachHere(); 1642 } 1643 cmpw(flag, dest_current_value, compare_value); 1644 bne(flag, failed); 1645 } 1646 1647 // release/fence semantics 1648 if (semantics & MemBarRel) { 1649 release(); 1650 } 1651 1652 cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2, 1653 retry, failed, cmpxchgx_hint, size); 1654 if (!weak || use_result_reg) { 1655 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1656 bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 1657 } else { 1658 bne( CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 1659 } 1660 } 1661 // fall through => (flag == eq), (dest_current_value == compare_value), (swapped) 1662 1663 // Result in register (must do this at the end because int_flag_success can be the 1664 // same register as one above). 1665 if (use_result_reg) { 1666 li(int_flag_success, 1); 1667 } 1668 1669 if (semantics & MemBarFenceAfter) { 1670 fence(); 1671 } else if (semantics & MemBarAcq) { 1672 isync(); 1673 } 1674 1675 if (use_result_reg && !preset_result_reg) { 1676 b(done); 1677 } 1678 1679 bind(failed); 1680 if (use_result_reg && !preset_result_reg) { 1681 li(int_flag_success, 0); 1682 } 1683 1684 bind(done); 1685 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1686 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1687 } 1688 1689 // Preforms atomic compare exchange: 1690 // if (compare_value == *addr_base) 1691 // *addr_base = exchange_value 1692 // int_flag_success = 1; 1693 // else 1694 // int_flag_success = 0; 1695 // 1696 // ConditionRegister flag = cmp(compare_value, *addr_base) 1697 // Register dest_current_value = *addr_base 1698 // Register compare_value Used to compare with value in memory 1699 // Register exchange_value Written to memory if compare_value == *addr_base 1700 // Register addr_base The memory location to compareXChange 1701 // Register int_flag_success Set to 1 if exchange_value was written to *addr_base 1702 // 1703 // To avoid the costly compare exchange the value is tested beforehand. 1704 // Several special cases exist to avoid that unnecessary information is generated. 1705 // 1706 void MacroAssembler::cmpxchgd(ConditionRegister flag, 1707 Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value, 1708 Register addr_base, int semantics, bool cmpxchgx_hint, 1709 Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) { 1710 Label retry; 1711 Label failed_int; 1712 Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int; 1713 Label done; 1714 1715 // Save one branch if result is returned via register and result register is different from the other ones. 1716 bool use_result_reg = (int_flag_success!=noreg); 1717 bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() && 1718 int_flag_success!=exchange_value && int_flag_success!=addr_base); 1719 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1720 assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both"); 1721 1722 if (use_result_reg && preset_result_reg) { 1723 li(int_flag_success, 0); // preset (assume cas failed) 1724 } 1725 1726 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1727 if (contention_hint) { // Don't try to reserve if cmp fails. 1728 ld(dest_current_value, 0, addr_base); 1729 cmpd(flag, compare_value, dest_current_value); 1730 bne(flag, failed); 1731 } 1732 1733 // release/fence semantics 1734 if (semantics & MemBarRel) { 1735 release(); 1736 } 1737 1738 // atomic emulation loop 1739 bind(retry); 1740 1741 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1742 cmpd(flag, compare_value, dest_current_value); 1743 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1744 bne_predict_not_taken(flag, failed); 1745 } else { 1746 bne( flag, failed); 1747 } 1748 1749 stdcx_(exchange_value, addr_base); 1750 if (!weak || use_result_reg || failed_ext) { 1751 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1752 bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1753 } else { 1754 bne( CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1755 } 1756 } 1757 1758 // result in register (must do this at the end because int_flag_success can be the same register as one above) 1759 if (use_result_reg) { 1760 li(int_flag_success, 1); 1761 } 1762 1763 if (semantics & MemBarFenceAfter) { 1764 fence(); 1765 } else if (semantics & MemBarAcq) { 1766 isync(); 1767 } 1768 1769 if (use_result_reg && !preset_result_reg) { 1770 b(done); 1771 } 1772 1773 bind(failed_int); 1774 if (use_result_reg && !preset_result_reg) { 1775 li(int_flag_success, 0); 1776 } 1777 1778 bind(done); 1779 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1780 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1781 } 1782 1783 // Look up the method for a megamorphic invokeinterface call. 1784 // The target method is determined by <intf_klass, itable_index>. 1785 // The receiver klass is in recv_klass. 1786 // On success, the result will be in method_result, and execution falls through. 1787 // On failure, execution transfers to the given label. 1788 void MacroAssembler::lookup_interface_method(Register recv_klass, 1789 Register intf_klass, 1790 RegisterOrConstant itable_index, 1791 Register method_result, 1792 Register scan_temp, 1793 Register temp2, 1794 Label& L_no_such_interface, 1795 bool return_method) { 1796 assert_different_registers(recv_klass, intf_klass, method_result, scan_temp); 1797 1798 // Compute start of first itableOffsetEntry (which is at the end of the vtable). 1799 int vtable_base = in_bytes(Klass::vtable_start_offset()); 1800 int itentry_off = itableMethodEntry::method_offset_in_bytes(); 1801 int logMEsize = exact_log2(itableMethodEntry::size() * wordSize); 1802 int scan_step = itableOffsetEntry::size() * wordSize; 1803 int log_vte_size= exact_log2(vtableEntry::size_in_bytes()); 1804 1805 lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass); 1806 // %%% We should store the aligned, prescaled offset in the klassoop. 1807 // Then the next several instructions would fold away. 1808 1809 sldi(scan_temp, scan_temp, log_vte_size); 1810 addi(scan_temp, scan_temp, vtable_base); 1811 add(scan_temp, recv_klass, scan_temp); 1812 1813 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 1814 if (return_method) { 1815 if (itable_index.is_register()) { 1816 Register itable_offset = itable_index.as_register(); 1817 sldi(method_result, itable_offset, logMEsize); 1818 if (itentry_off) { addi(method_result, method_result, itentry_off); } 1819 add(method_result, method_result, recv_klass); 1820 } else { 1821 long itable_offset = (long)itable_index.as_constant(); 1822 // static address, no relocation 1823 add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2); 1824 } 1825 } 1826 1827 // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) { 1828 // if (scan->interface() == intf) { 1829 // result = (klass + scan->offset() + itable_index); 1830 // } 1831 // } 1832 Label search, found_method; 1833 1834 for (int peel = 1; peel >= 0; peel--) { 1835 // %%%% Could load both offset and interface in one ldx, if they were 1836 // in the opposite order. This would save a load. 1837 ld(temp2, itableOffsetEntry::interface_offset_in_bytes(), scan_temp); 1838 1839 // Check that this entry is non-null. A null entry means that 1840 // the receiver class doesn't implement the interface, and wasn't the 1841 // same as when the caller was compiled. 1842 cmpd(CCR0, temp2, intf_klass); 1843 1844 if (peel) { 1845 beq(CCR0, found_method); 1846 } else { 1847 bne(CCR0, search); 1848 // (invert the test to fall through to found_method...) 1849 } 1850 1851 if (!peel) break; 1852 1853 bind(search); 1854 1855 cmpdi(CCR0, temp2, 0); 1856 beq(CCR0, L_no_such_interface); 1857 addi(scan_temp, scan_temp, scan_step); 1858 } 1859 1860 bind(found_method); 1861 1862 // Got a hit. 1863 if (return_method) { 1864 int ito_offset = itableOffsetEntry::offset_offset_in_bytes(); 1865 lwz(scan_temp, ito_offset, scan_temp); 1866 ldx(method_result, scan_temp, method_result); 1867 } 1868 } 1869 1870 // virtual method calling 1871 void MacroAssembler::lookup_virtual_method(Register recv_klass, 1872 RegisterOrConstant vtable_index, 1873 Register method_result) { 1874 1875 assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg()); 1876 1877 const int base = in_bytes(Klass::vtable_start_offset()); 1878 assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 1879 1880 if (vtable_index.is_register()) { 1881 sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord); 1882 add(recv_klass, vtable_index.as_register(), recv_klass); 1883 } else { 1884 addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord); 1885 } 1886 ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass); 1887 } 1888 1889 /////////////////////////////////////////// subtype checking //////////////////////////////////////////// 1890 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 1891 Register super_klass, 1892 Register temp1_reg, 1893 Register temp2_reg, 1894 Label* L_success, 1895 Label* L_failure, 1896 Label* L_slow_path, 1897 RegisterOrConstant super_check_offset) { 1898 1899 const Register check_cache_offset = temp1_reg; 1900 const Register cached_super = temp2_reg; 1901 1902 assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super); 1903 1904 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1905 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1906 1907 bool must_load_sco = (super_check_offset.constant_or_zero() == -1); 1908 bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset); 1909 1910 Label L_fallthrough; 1911 int label_nulls = 0; 1912 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 1913 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 1914 if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; } 1915 assert(label_nulls <= 1 || 1916 (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path), 1917 "at most one NULL in the batch, usually"); 1918 1919 // If the pointers are equal, we are done (e.g., String[] elements). 1920 // This self-check enables sharing of secondary supertype arrays among 1921 // non-primary types such as array-of-interface. Otherwise, each such 1922 // type would need its own customized SSA. 1923 // We move this check to the front of the fast path because many 1924 // type checks are in fact trivially successful in this manner, 1925 // so we get a nicely predicted branch right at the start of the check. 1926 cmpd(CCR0, sub_klass, super_klass); 1927 beq(CCR0, *L_success); 1928 1929 // Check the supertype display: 1930 if (must_load_sco) { 1931 // The super check offset is always positive... 1932 lwz(check_cache_offset, sco_offset, super_klass); 1933 super_check_offset = RegisterOrConstant(check_cache_offset); 1934 // super_check_offset is register. 1935 assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register()); 1936 } 1937 // The loaded value is the offset from KlassOopDesc. 1938 1939 ld(cached_super, super_check_offset, sub_klass); 1940 cmpd(CCR0, cached_super, super_klass); 1941 1942 // This check has worked decisively for primary supers. 1943 // Secondary supers are sought in the super_cache ('super_cache_addr'). 1944 // (Secondary supers are interfaces and very deeply nested subtypes.) 1945 // This works in the same check above because of a tricky aliasing 1946 // between the super_cache and the primary super display elements. 1947 // (The 'super_check_addr' can address either, as the case requires.) 1948 // Note that the cache is updated below if it does not help us find 1949 // what we need immediately. 1950 // So if it was a primary super, we can just fail immediately. 1951 // Otherwise, it's the slow path for us (no success at this point). 1952 1953 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); } 1954 1955 if (super_check_offset.is_register()) { 1956 beq(CCR0, *L_success); 1957 cmpwi(CCR0, super_check_offset.as_register(), sc_offset); 1958 if (L_failure == &L_fallthrough) { 1959 beq(CCR0, *L_slow_path); 1960 } else { 1961 bne(CCR0, *L_failure); 1962 FINAL_JUMP(*L_slow_path); 1963 } 1964 } else { 1965 if (super_check_offset.as_constant() == sc_offset) { 1966 // Need a slow path; fast failure is impossible. 1967 if (L_slow_path == &L_fallthrough) { 1968 beq(CCR0, *L_success); 1969 } else { 1970 bne(CCR0, *L_slow_path); 1971 FINAL_JUMP(*L_success); 1972 } 1973 } else { 1974 // No slow path; it's a fast decision. 1975 if (L_failure == &L_fallthrough) { 1976 beq(CCR0, *L_success); 1977 } else { 1978 bne(CCR0, *L_failure); 1979 FINAL_JUMP(*L_success); 1980 } 1981 } 1982 } 1983 1984 bind(L_fallthrough); 1985 #undef FINAL_JUMP 1986 } 1987 1988 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 1989 Register super_klass, 1990 Register temp1_reg, 1991 Register temp2_reg, 1992 Label* L_success, 1993 Register result_reg) { 1994 const Register array_ptr = temp1_reg; // current value from cache array 1995 const Register temp = temp2_reg; 1996 1997 assert_different_registers(sub_klass, super_klass, array_ptr, temp); 1998 1999 int source_offset = in_bytes(Klass::secondary_supers_offset()); 2000 int target_offset = in_bytes(Klass::secondary_super_cache_offset()); 2001 2002 int length_offset = Array<Klass*>::length_offset_in_bytes(); 2003 int base_offset = Array<Klass*>::base_offset_in_bytes(); 2004 2005 Label hit, loop, failure, fallthru; 2006 2007 ld(array_ptr, source_offset, sub_klass); 2008 2009 // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated."); 2010 lwz(temp, length_offset, array_ptr); 2011 cmpwi(CCR0, temp, 0); 2012 beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0 2013 2014 mtctr(temp); // load ctr 2015 2016 bind(loop); 2017 // Oops in table are NO MORE compressed. 2018 ld(temp, base_offset, array_ptr); 2019 cmpd(CCR0, temp, super_klass); 2020 beq(CCR0, hit); 2021 addi(array_ptr, array_ptr, BytesPerWord); 2022 bdnz(loop); 2023 2024 bind(failure); 2025 if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss) 2026 b(fallthru); 2027 2028 bind(hit); 2029 std(super_klass, target_offset, sub_klass); // save result to cache 2030 if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit) 2031 if (L_success != NULL) { b(*L_success); } 2032 else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided 2033 2034 bind(fallthru); 2035 } 2036 2037 // Try fast path, then go to slow one if not successful 2038 void MacroAssembler::check_klass_subtype(Register sub_klass, 2039 Register super_klass, 2040 Register temp1_reg, 2041 Register temp2_reg, 2042 Label& L_success) { 2043 Label L_failure; 2044 check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure); 2045 check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success); 2046 bind(L_failure); // Fallthru if not successful. 2047 } 2048 2049 void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg, 2050 Register temp_reg, 2051 Label& wrong_method_type) { 2052 assert_different_registers(mtype_reg, mh_reg, temp_reg); 2053 // Compare method type against that of the receiver. 2054 load_heap_oop_not_null(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg); 2055 cmpd(CCR0, temp_reg, mtype_reg); 2056 bne(CCR0, wrong_method_type); 2057 } 2058 2059 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot, 2060 Register temp_reg, 2061 int extra_slot_offset) { 2062 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 2063 int stackElementSize = Interpreter::stackElementSize; 2064 int offset = extra_slot_offset * stackElementSize; 2065 if (arg_slot.is_constant()) { 2066 offset += arg_slot.as_constant() * stackElementSize; 2067 return offset; 2068 } else { 2069 assert(temp_reg != noreg, "must specify"); 2070 sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize)); 2071 if (offset != 0) 2072 addi(temp_reg, temp_reg, offset); 2073 return temp_reg; 2074 } 2075 } 2076 2077 // Supports temp2_reg = R0. 2078 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg, 2079 Register mark_reg, Register temp_reg, 2080 Register temp2_reg, Label& done, Label* slow_case) { 2081 assert(UseBiasedLocking, "why call this otherwise?"); 2082 2083 #ifdef ASSERT 2084 assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg); 2085 #endif 2086 2087 Label cas_label; 2088 2089 // Branch to done if fast path fails and no slow_case provided. 2090 Label *slow_case_int = (slow_case != NULL) ? slow_case : &done; 2091 2092 // Biased locking 2093 // See whether the lock is currently biased toward our thread and 2094 // whether the epoch is still valid 2095 // Note that the runtime guarantees sufficient alignment of JavaThread 2096 // pointers to allow age to be placed into low bits 2097 assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, 2098 "biased locking makes assumptions about bit layout"); 2099 2100 if (PrintBiasedLockingStatistics) { 2101 load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg); 2102 lwzx(temp_reg, temp2_reg); 2103 addi(temp_reg, temp_reg, 1); 2104 stwx(temp_reg, temp2_reg); 2105 } 2106 2107 andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place); 2108 cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern); 2109 bne(cr_reg, cas_label); 2110 2111 load_klass(temp_reg, obj_reg); 2112 2113 load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place)); 2114 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 2115 orr(temp_reg, R16_thread, temp_reg); 2116 xorr(temp_reg, mark_reg, temp_reg); 2117 andr(temp_reg, temp_reg, temp2_reg); 2118 cmpdi(cr_reg, temp_reg, 0); 2119 if (PrintBiasedLockingStatistics) { 2120 Label l; 2121 bne(cr_reg, l); 2122 load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr()); 2123 lwzx(mark_reg, temp2_reg); 2124 addi(mark_reg, mark_reg, 1); 2125 stwx(mark_reg, temp2_reg); 2126 // restore mark_reg 2127 ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg); 2128 bind(l); 2129 } 2130 beq(cr_reg, done); 2131 2132 Label try_revoke_bias; 2133 Label try_rebias; 2134 2135 // At this point we know that the header has the bias pattern and 2136 // that we are not the bias owner in the current epoch. We need to 2137 // figure out more details about the state of the header in order to 2138 // know what operations can be legally performed on the object's 2139 // header. 2140 2141 // If the low three bits in the xor result aren't clear, that means 2142 // the prototype header is no longer biased and we have to revoke 2143 // the bias on this object. 2144 andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 2145 cmpwi(cr_reg, temp2_reg, 0); 2146 bne(cr_reg, try_revoke_bias); 2147 2148 // Biasing is still enabled for this data type. See whether the 2149 // epoch of the current bias is still valid, meaning that the epoch 2150 // bits of the mark word are equal to the epoch bits of the 2151 // prototype header. (Note that the prototype header's epoch bits 2152 // only change at a safepoint.) If not, attempt to rebias the object 2153 // toward the current thread. Note that we must be absolutely sure 2154 // that the current epoch is invalid in order to do this because 2155 // otherwise the manipulations it performs on the mark word are 2156 // illegal. 2157 2158 int shift_amount = 64 - markOopDesc::epoch_shift; 2159 // rotate epoch bits to right (little) end and set other bits to 0 2160 // [ big part | epoch | little part ] -> [ 0..0 | epoch ] 2161 rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits); 2162 // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented 2163 bne(CCR0, try_rebias); 2164 2165 // The epoch of the current bias is still valid but we know nothing 2166 // about the owner; it might be set or it might be clear. Try to 2167 // acquire the bias of the object using an atomic operation. If this 2168 // fails we will go in to the runtime to revoke the object's bias. 2169 // Note that we first construct the presumed unbiased header so we 2170 // don't accidentally blow away another thread's valid bias. 2171 andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place | 2172 markOopDesc::age_mask_in_place | 2173 markOopDesc::epoch_mask_in_place)); 2174 orr(temp_reg, R16_thread, mark_reg); 2175 2176 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2177 2178 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg). 2179 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 2180 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 2181 /*where=*/obj_reg, 2182 MacroAssembler::MemBarAcq, 2183 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2184 noreg, slow_case_int); // bail out if failed 2185 2186 // If the biasing toward our thread failed, this means that 2187 // another thread succeeded in biasing it toward itself and we 2188 // need to revoke that bias. The revocation will occur in the 2189 // interpreter runtime in the slow case. 2190 if (PrintBiasedLockingStatistics) { 2191 load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg); 2192 lwzx(temp_reg, temp2_reg); 2193 addi(temp_reg, temp_reg, 1); 2194 stwx(temp_reg, temp2_reg); 2195 } 2196 b(done); 2197 2198 bind(try_rebias); 2199 // At this point we know the epoch has expired, meaning that the 2200 // current "bias owner", if any, is actually invalid. Under these 2201 // circumstances _only_, we are allowed to use the current header's 2202 // value as the comparison value when doing the cas to acquire the 2203 // bias in the current epoch. In other words, we allow transfer of 2204 // the bias from one thread to another directly in this situation. 2205 load_klass(temp_reg, obj_reg); 2206 andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place); 2207 orr(temp2_reg, R16_thread, temp2_reg); 2208 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 2209 orr(temp_reg, temp2_reg, temp_reg); 2210 2211 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2212 2213 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 2214 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 2215 /*where=*/obj_reg, 2216 MacroAssembler::MemBarAcq, 2217 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2218 noreg, slow_case_int); // bail out if failed 2219 2220 // If the biasing toward our thread failed, this means that 2221 // another thread succeeded in biasing it toward itself and we 2222 // need to revoke that bias. The revocation will occur in the 2223 // interpreter runtime in the slow case. 2224 if (PrintBiasedLockingStatistics) { 2225 load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg); 2226 lwzx(temp_reg, temp2_reg); 2227 addi(temp_reg, temp_reg, 1); 2228 stwx(temp_reg, temp2_reg); 2229 } 2230 b(done); 2231 2232 bind(try_revoke_bias); 2233 // The prototype mark in the klass doesn't have the bias bit set any 2234 // more, indicating that objects of this data type are not supposed 2235 // to be biased any more. We are going to try to reset the mark of 2236 // this object to the prototype value and fall through to the 2237 // CAS-based locking scheme. Note that if our CAS fails, it means 2238 // that another thread raced us for the privilege of revoking the 2239 // bias of this particular object, so it's okay to continue in the 2240 // normal locking code. 2241 load_klass(temp_reg, obj_reg); 2242 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 2243 andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place); 2244 orr(temp_reg, temp_reg, temp2_reg); 2245 2246 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2247 2248 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg). 2249 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 2250 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 2251 /*where=*/obj_reg, 2252 MacroAssembler::MemBarAcq, 2253 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2254 2255 // reload markOop in mark_reg before continuing with lightweight locking 2256 ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg); 2257 2258 // Fall through to the normal CAS-based lock, because no matter what 2259 // the result of the above CAS, some thread must have succeeded in 2260 // removing the bias bit from the object's header. 2261 if (PrintBiasedLockingStatistics) { 2262 Label l; 2263 bne(cr_reg, l); 2264 load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg); 2265 lwzx(temp_reg, temp2_reg); 2266 addi(temp_reg, temp_reg, 1); 2267 stwx(temp_reg, temp2_reg); 2268 bind(l); 2269 } 2270 2271 bind(cas_label); 2272 } 2273 2274 void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) { 2275 // Check for biased locking unlock case, which is a no-op 2276 // Note: we do not have to check the thread ID for two reasons. 2277 // First, the interpreter checks for IllegalMonitorStateException at 2278 // a higher level. Second, if the bias was revoked while we held the 2279 // lock, the object could not be rebiased toward another thread, so 2280 // the bias bit would be clear. 2281 2282 ld(temp_reg, 0, mark_addr); 2283 andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 2284 2285 cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern); 2286 beq(cr_reg, done); 2287 } 2288 2289 // allocation (for C1) 2290 void MacroAssembler::eden_allocate( 2291 Register obj, // result: pointer to object after successful allocation 2292 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2293 int con_size_in_bytes, // object size in bytes if known at compile time 2294 Register t1, // temp register 2295 Register t2, // temp register 2296 Label& slow_case // continuation point if fast allocation fails 2297 ) { 2298 b(slow_case); 2299 } 2300 2301 void MacroAssembler::tlab_allocate( 2302 Register obj, // result: pointer to object after successful allocation 2303 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2304 int con_size_in_bytes, // object size in bytes if known at compile time 2305 Register t1, // temp register 2306 Label& slow_case // continuation point if fast allocation fails 2307 ) { 2308 // make sure arguments make sense 2309 assert_different_registers(obj, var_size_in_bytes, t1); 2310 assert(0 <= con_size_in_bytes && is_simm13(con_size_in_bytes), "illegal object size"); 2311 assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment"); 2312 2313 const Register new_top = t1; 2314 //verify_tlab(); not implemented 2315 2316 ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2317 ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread); 2318 if (var_size_in_bytes == noreg) { 2319 addi(new_top, obj, con_size_in_bytes); 2320 } else { 2321 add(new_top, obj, var_size_in_bytes); 2322 } 2323 cmpld(CCR0, new_top, R0); 2324 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case); 2325 2326 #ifdef ASSERT 2327 // make sure new free pointer is properly aligned 2328 { 2329 Label L; 2330 andi_(R0, new_top, MinObjAlignmentInBytesMask); 2331 beq(CCR0, L); 2332 stop("updated TLAB free is not properly aligned", 0x934); 2333 bind(L); 2334 } 2335 #endif // ASSERT 2336 2337 // update the tlab top pointer 2338 std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2339 //verify_tlab(); not implemented 2340 } 2341 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) { 2342 unimplemented("incr_allocated_bytes"); 2343 } 2344 2345 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset, 2346 int insts_call_instruction_offset, Register Rtoc) { 2347 // Start the stub. 2348 address stub = start_a_stub(64); 2349 if (stub == NULL) { return NULL; } // CodeCache full: bail out 2350 2351 // Create a trampoline stub relocation which relates this trampoline stub 2352 // with the call instruction at insts_call_instruction_offset in the 2353 // instructions code-section. 2354 relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset)); 2355 const int stub_start_offset = offset(); 2356 2357 // For java_to_interp stubs we use R11_scratch1 as scratch register 2358 // and in call trampoline stubs we use R12_scratch2. This way we 2359 // can distinguish them (see is_NativeCallTrampolineStub_at()). 2360 Register reg_scratch = R12_scratch2; 2361 2362 // Now, create the trampoline stub's code: 2363 // - load the TOC 2364 // - load the call target from the constant pool 2365 // - call 2366 if (Rtoc == noreg) { 2367 calculate_address_from_global_toc(reg_scratch, method_toc()); 2368 Rtoc = reg_scratch; 2369 } 2370 2371 ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false); 2372 mtctr(reg_scratch); 2373 bctr(); 2374 2375 const address stub_start_addr = addr_at(stub_start_offset); 2376 2377 // Assert that the encoded destination_toc_offset can be identified and that it is correct. 2378 assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(), 2379 "encoded offset into the constant pool must match"); 2380 // Trampoline_stub_size should be good. 2381 assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size"); 2382 assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline"); 2383 2384 // End the stub. 2385 end_a_stub(); 2386 return stub; 2387 } 2388 2389 // TM on PPC64. 2390 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) { 2391 Label retry; 2392 bind(retry); 2393 ldarx(result, addr, /*hint*/ false); 2394 addi(result, result, simm16); 2395 stdcx_(result, addr); 2396 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2397 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2398 } else { 2399 bne( CCR0, retry); // stXcx_ sets CCR0 2400 } 2401 } 2402 2403 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) { 2404 Label retry; 2405 bind(retry); 2406 lwarx(result, addr, /*hint*/ false); 2407 ori(result, result, uimm16); 2408 stwcx_(result, addr); 2409 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2410 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2411 } else { 2412 bne( CCR0, retry); // stXcx_ sets CCR0 2413 } 2414 } 2415 2416 #if INCLUDE_RTM_OPT 2417 2418 // Update rtm_counters based on abort status 2419 // input: abort_status 2420 // rtm_counters (RTMLockingCounters*) 2421 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) { 2422 // Mapping to keep PreciseRTMLockingStatistics similar to x86. 2423 // x86 ppc (! means inverted, ? means not the same) 2424 // 0 31 Set if abort caused by XABORT instruction. 2425 // 1 ! 7 If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set. 2426 // 2 13 Set if another logical processor conflicted with a memory address that was part of the transaction that aborted. 2427 // 3 10 Set if an internal buffer overflowed. 2428 // 4 ?12 Set if a debug breakpoint was hit. 2429 // 5 ?32 Set if an abort occurred during execution of a nested transaction. 2430 const int tm_failure_bit[] = {Assembler::tm_tabort, // Note: Seems like signal handler sets this, too. 2431 Assembler::tm_failure_persistent, // inverted: transient 2432 Assembler::tm_trans_cf, 2433 Assembler::tm_footprint_of, 2434 Assembler::tm_non_trans_cf, 2435 Assembler::tm_suspended}; 2436 const bool tm_failure_inv[] = {false, true, false, false, false, false}; 2437 assert(sizeof(tm_failure_bit)/sizeof(int) == RTMLockingCounters::ABORT_STATUS_LIMIT, "adapt mapping!"); 2438 2439 const Register addr_Reg = R0; 2440 // Keep track of offset to where rtm_counters_Reg had pointed to. 2441 int counters_offs = RTMLockingCounters::abort_count_offset(); 2442 addi(addr_Reg, rtm_counters_Reg, counters_offs); 2443 const Register temp_Reg = rtm_counters_Reg; 2444 2445 //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically 2446 ldx(temp_Reg, addr_Reg); 2447 addi(temp_Reg, temp_Reg, 1); 2448 stdx(temp_Reg, addr_Reg); 2449 2450 if (PrintPreciseRTMLockingStatistics) { 2451 int counters_offs_delta = RTMLockingCounters::abortX_count_offset() - counters_offs; 2452 2453 //mftexasr(abort_status); done by caller 2454 for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) { 2455 counters_offs += counters_offs_delta; 2456 li(temp_Reg, counters_offs_delta); // can't use addi with R0 2457 add(addr_Reg, addr_Reg, temp_Reg); // point to next counter 2458 counters_offs_delta = sizeof(uintx); 2459 2460 Label check_abort; 2461 rldicr_(temp_Reg, abort_status, tm_failure_bit[i], 0); 2462 if (tm_failure_inv[i]) { 2463 bne(CCR0, check_abort); 2464 } else { 2465 beq(CCR0, check_abort); 2466 } 2467 //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically 2468 ldx(temp_Reg, addr_Reg); 2469 addi(temp_Reg, temp_Reg, 1); 2470 stdx(temp_Reg, addr_Reg); 2471 bind(check_abort); 2472 } 2473 } 2474 li(temp_Reg, -counters_offs); // can't use addi with R0 2475 add(rtm_counters_Reg, addr_Reg, temp_Reg); // restore 2476 } 2477 2478 // Branch if (random & (count-1) != 0), count is 2^n 2479 // tmp and CR0 are killed 2480 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) { 2481 mftb(tmp); 2482 andi_(tmp, tmp, count-1); 2483 bne(CCR0, brLabel); 2484 } 2485 2486 // Perform abort ratio calculation, set no_rtm bit if high ratio. 2487 // input: rtm_counters_Reg (RTMLockingCounters* address) - KILLED 2488 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg, 2489 RTMLockingCounters* rtm_counters, 2490 Metadata* method_data) { 2491 Label L_done, L_check_always_rtm1, L_check_always_rtm2; 2492 2493 if (RTMLockingCalculationDelay > 0) { 2494 // Delay calculation. 2495 ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr()); 2496 cmpdi(CCR0, rtm_counters_Reg, 0); 2497 beq(CCR0, L_done); 2498 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2499 } 2500 // Abort ratio calculation only if abort_count > RTMAbortThreshold. 2501 // Aborted transactions = abort_count * 100 2502 // All transactions = total_count * RTMTotalCountIncrRate 2503 // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio) 2504 ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg); 2505 if (is_simm(RTMAbortThreshold, 16)) { // cmpdi can handle 16bit immediate only. 2506 cmpdi(CCR0, R0, RTMAbortThreshold); 2507 blt(CCR0, L_check_always_rtm2); // reload of rtm_counters_Reg not necessary 2508 } else { 2509 load_const_optimized(rtm_counters_Reg, RTMAbortThreshold); 2510 cmpd(CCR0, R0, rtm_counters_Reg); 2511 blt(CCR0, L_check_always_rtm1); // reload of rtm_counters_Reg required 2512 } 2513 mulli(R0, R0, 100); 2514 2515 const Register tmpReg = rtm_counters_Reg; 2516 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2517 mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16 2518 mulli(tmpReg, tmpReg, RTMAbortRatio); // allowable range: int16 2519 cmpd(CCR0, R0, tmpReg); 2520 blt(CCR0, L_check_always_rtm1); // jump to reload 2521 if (method_data != NULL) { 2522 // Set rtm_state to "no rtm" in MDO. 2523 // Not using a metadata relocation. Method and Class Loader are kept alive anyway. 2524 // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.) 2525 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2526 atomic_ori_int(R0, tmpReg, NoRTM); 2527 } 2528 b(L_done); 2529 2530 bind(L_check_always_rtm1); 2531 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2532 bind(L_check_always_rtm2); 2533 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2534 int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate; 2535 if (is_simm(thresholdValue, 16)) { // cmpdi can handle 16bit immediate only. 2536 cmpdi(CCR0, tmpReg, thresholdValue); 2537 } else { 2538 load_const_optimized(R0, thresholdValue); 2539 cmpd(CCR0, tmpReg, R0); 2540 } 2541 blt(CCR0, L_done); 2542 if (method_data != NULL) { 2543 // Set rtm_state to "always rtm" in MDO. 2544 // Not using a metadata relocation. See above. 2545 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2546 atomic_ori_int(R0, tmpReg, UseRTM); 2547 } 2548 bind(L_done); 2549 } 2550 2551 // Update counters and perform abort ratio calculation. 2552 // input: abort_status_Reg 2553 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg, 2554 RTMLockingCounters* rtm_counters, 2555 Metadata* method_data, 2556 bool profile_rtm) { 2557 2558 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2559 // Update rtm counters based on state at abort. 2560 // Reads abort_status_Reg, updates flags. 2561 assert_different_registers(abort_status_Reg, temp_Reg); 2562 load_const_optimized(temp_Reg, (address)rtm_counters, R0); 2563 rtm_counters_update(abort_status_Reg, temp_Reg); 2564 if (profile_rtm) { 2565 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2566 rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data); 2567 } 2568 } 2569 2570 // Retry on abort if abort's status indicates non-persistent failure. 2571 // inputs: retry_count_Reg 2572 // : abort_status_Reg 2573 // output: retry_count_Reg decremented by 1 2574 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, 2575 Label& retryLabel, Label* checkRetry) { 2576 Label doneRetry; 2577 rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0); 2578 bne(CCR0, doneRetry); 2579 if (checkRetry) { bind(*checkRetry); } 2580 addic_(retry_count_Reg, retry_count_Reg, -1); 2581 blt(CCR0, doneRetry); 2582 b(retryLabel); 2583 bind(doneRetry); 2584 } 2585 2586 // Spin and retry if lock is busy. 2587 // inputs: owner_addr_Reg (monitor address) 2588 // : retry_count_Reg 2589 // output: retry_count_Reg decremented by 1 2590 // CTR is killed 2591 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) { 2592 Label SpinLoop, doneRetry, doRetry; 2593 addic_(retry_count_Reg, retry_count_Reg, -1); 2594 blt(CCR0, doneRetry); 2595 2596 if (RTMSpinLoopCount > 1) { 2597 li(R0, RTMSpinLoopCount); 2598 mtctr(R0); 2599 } 2600 2601 // low thread priority 2602 smt_prio_low(); 2603 bind(SpinLoop); 2604 2605 if (RTMSpinLoopCount > 1) { 2606 bdz(doRetry); 2607 ld(R0, 0, owner_addr_Reg); 2608 cmpdi(CCR0, R0, 0); 2609 bne(CCR0, SpinLoop); 2610 } 2611 2612 bind(doRetry); 2613 2614 // restore thread priority to default in userspace 2615 #ifdef LINUX 2616 smt_prio_medium_low(); 2617 #else 2618 smt_prio_medium(); 2619 #endif 2620 2621 b(retryLabel); 2622 2623 bind(doneRetry); 2624 } 2625 2626 // Use RTM for normal stack locks. 2627 // Input: objReg (object to lock) 2628 void MacroAssembler::rtm_stack_locking(ConditionRegister flag, 2629 Register obj, Register mark_word, Register tmp, 2630 Register retry_on_abort_count_Reg, 2631 RTMLockingCounters* stack_rtm_counters, 2632 Metadata* method_data, bool profile_rtm, 2633 Label& DONE_LABEL, Label& IsInflated) { 2634 assert(UseRTMForStackLocks, "why call this otherwise?"); 2635 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); 2636 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2637 2638 if (RTMRetryCount > 0) { 2639 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 2640 bind(L_rtm_retry); 2641 } 2642 andi_(R0, mark_word, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased 2643 bne(CCR0, IsInflated); 2644 2645 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2646 Label L_noincrement; 2647 if (RTMTotalCountIncrRate > 1) { 2648 branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement); 2649 } 2650 assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM"); 2651 load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0); 2652 //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically 2653 ldx(mark_word, tmp); 2654 addi(mark_word, mark_word, 1); 2655 stdx(mark_word, tmp); 2656 bind(L_noincrement); 2657 } 2658 tbegin_(); 2659 beq(CCR0, L_on_abort); 2660 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // Reload in transaction, conflicts need to be tracked. 2661 andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits 2662 cmpwi(flag, R0, markOopDesc::unlocked_value); // bits = 001 unlocked 2663 beq(flag, DONE_LABEL); // all done if unlocked 2664 2665 if (UseRTMXendForLockBusy) { 2666 tend_(); 2667 b(L_decrement_retry); 2668 } else { 2669 tabort_(); 2670 } 2671 bind(L_on_abort); 2672 const Register abort_status_Reg = tmp; 2673 mftexasr(abort_status_Reg); 2674 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2675 rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm); 2676 } 2677 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload 2678 if (RTMRetryCount > 0) { 2679 // Retry on lock abort if abort status is not permanent. 2680 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry); 2681 } else { 2682 bind(L_decrement_retry); 2683 } 2684 } 2685 2686 // Use RTM for inflating locks 2687 // inputs: obj (object to lock) 2688 // mark_word (current header - KILLED) 2689 // boxReg (on-stack box address (displaced header location) - KILLED) 2690 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag, 2691 Register obj, Register mark_word, Register boxReg, 2692 Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg, 2693 RTMLockingCounters* rtm_counters, 2694 Metadata* method_data, bool profile_rtm, 2695 Label& DONE_LABEL) { 2696 assert(UseRTMLocking, "why call this otherwise?"); 2697 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2698 // Clean monitor_value bit to get valid pointer. 2699 int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value; 2700 2701 // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark(). 2702 std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg); 2703 const Register tmpReg = boxReg; 2704 const Register owner_addr_Reg = mark_word; 2705 addi(owner_addr_Reg, mark_word, owner_offset); 2706 2707 if (RTMRetryCount > 0) { 2708 load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy. 2709 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort. 2710 bind(L_rtm_retry); 2711 } 2712 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2713 Label L_noincrement; 2714 if (RTMTotalCountIncrRate > 1) { 2715 branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement); 2716 } 2717 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2718 load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg); 2719 //atomic_inc_ptr(R0, tmpReg); We don't increment atomically 2720 ldx(tmpReg, R0); 2721 addi(tmpReg, tmpReg, 1); 2722 stdx(tmpReg, R0); 2723 bind(L_noincrement); 2724 } 2725 tbegin_(); 2726 beq(CCR0, L_on_abort); 2727 // We don't reload mark word. Will only be reset at safepoint. 2728 ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked. 2729 cmpdi(flag, R0, 0); 2730 beq(flag, DONE_LABEL); 2731 2732 if (UseRTMXendForLockBusy) { 2733 tend_(); 2734 b(L_decrement_retry); 2735 } else { 2736 tabort_(); 2737 } 2738 bind(L_on_abort); 2739 const Register abort_status_Reg = tmpReg; 2740 mftexasr(abort_status_Reg); 2741 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2742 rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm); 2743 // Restore owner_addr_Reg 2744 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); 2745 #ifdef ASSERT 2746 andi_(R0, mark_word, markOopDesc::monitor_value); 2747 asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint. 2748 #endif 2749 addi(owner_addr_Reg, mark_word, owner_offset); 2750 } 2751 if (RTMRetryCount > 0) { 2752 // Retry on lock abort if abort status is not permanent. 2753 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); 2754 } 2755 2756 // Appears unlocked - try to swing _owner from null to non-null. 2757 cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg, 2758 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2759 MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true); 2760 2761 if (RTMRetryCount > 0) { 2762 // success done else retry 2763 b(DONE_LABEL); 2764 bind(L_decrement_retry); 2765 // Spin and retry if lock is busy. 2766 rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry); 2767 } else { 2768 bind(L_decrement_retry); 2769 } 2770 } 2771 2772 #endif // INCLUDE_RTM_OPT 2773 2774 // "The box" is the space on the stack where we copy the object mark. 2775 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box, 2776 Register temp, Register displaced_header, Register current_header, 2777 bool try_bias, 2778 RTMLockingCounters* rtm_counters, 2779 RTMLockingCounters* stack_rtm_counters, 2780 Metadata* method_data, 2781 bool use_rtm, bool profile_rtm) { 2782 assert_different_registers(oop, box, temp, displaced_header, current_header); 2783 assert(flag != CCR0, "bad condition register"); 2784 Label cont; 2785 Label object_has_monitor; 2786 Label cas_failed; 2787 2788 // Load markOop from object into displaced_header. 2789 ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop); 2790 2791 2792 // Always do locking in runtime. 2793 if (EmitSync & 0x01) { 2794 cmpdi(flag, oop, 0); // Oop can't be 0 here => always false. 2795 return; 2796 } 2797 2798 if (try_bias) { 2799 biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont); 2800 } 2801 2802 #if INCLUDE_RTM_OPT 2803 if (UseRTMForStackLocks && use_rtm) { 2804 rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header, 2805 stack_rtm_counters, method_data, profile_rtm, 2806 cont, object_has_monitor); 2807 } 2808 #endif // INCLUDE_RTM_OPT 2809 2810 // Handle existing monitor. 2811 if ((EmitSync & 0x02) == 0) { 2812 // The object has an existing monitor iff (mark & monitor_value) != 0. 2813 andi_(temp, displaced_header, markOopDesc::monitor_value); 2814 bne(CCR0, object_has_monitor); 2815 } 2816 2817 // Set displaced_header to be (markOop of object | UNLOCK_VALUE). 2818 ori(displaced_header, displaced_header, markOopDesc::unlocked_value); 2819 2820 // Load Compare Value application register. 2821 2822 // Initialize the box. (Must happen before we update the object mark!) 2823 std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2824 2825 // Must fence, otherwise, preceding store(s) may float below cmpxchg. 2826 // Compare object markOop with mark and if equal exchange scratch1 with object markOop. 2827 cmpxchgd(/*flag=*/flag, 2828 /*current_value=*/current_header, 2829 /*compare_value=*/displaced_header, 2830 /*exchange_value=*/box, 2831 /*where=*/oop, 2832 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2833 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2834 noreg, 2835 &cas_failed, 2836 /*check without membar and ldarx first*/true); 2837 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2838 2839 // If the compare-and-exchange succeeded, then we found an unlocked 2840 // object and we have now locked it. 2841 b(cont); 2842 2843 bind(cas_failed); 2844 // We did not see an unlocked object so try the fast recursive case. 2845 2846 // Check if the owner is self by comparing the value in the markOop of object 2847 // (current_header) with the stack pointer. 2848 sub(current_header, current_header, R1_SP); 2849 load_const_optimized(temp, ~(os::vm_page_size()-1) | markOopDesc::lock_mask_in_place); 2850 2851 and_(R0/*==0?*/, current_header, temp); 2852 // If condition is true we are cont and hence we can store 0 as the 2853 // displaced header in the box, which indicates that it is a recursive lock. 2854 mcrf(flag,CCR0); 2855 std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box); 2856 2857 // Handle existing monitor. 2858 if ((EmitSync & 0x02) == 0) { 2859 b(cont); 2860 2861 bind(object_has_monitor); 2862 // The object's monitor m is unlocked iff m->owner == NULL, 2863 // otherwise m->owner may contain a thread or a stack address. 2864 2865 #if INCLUDE_RTM_OPT 2866 // Use the same RTM locking code in 32- and 64-bit VM. 2867 if (use_rtm) { 2868 rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header, 2869 rtm_counters, method_data, profile_rtm, cont); 2870 } else { 2871 #endif // INCLUDE_RTM_OPT 2872 2873 // Try to CAS m->owner from NULL to current thread. 2874 addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value); 2875 cmpxchgd(/*flag=*/flag, 2876 /*current_value=*/current_header, 2877 /*compare_value=*/(intptr_t)0, 2878 /*exchange_value=*/R16_thread, 2879 /*where=*/temp, 2880 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2881 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2882 2883 // Store a non-null value into the box. 2884 std(box, BasicLock::displaced_header_offset_in_bytes(), box); 2885 2886 # ifdef ASSERT 2887 bne(flag, cont); 2888 // We have acquired the monitor, check some invariants. 2889 addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes()); 2890 // Invariant 1: _recursions should be 0. 2891 //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size"); 2892 asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp, 2893 "monitor->_recursions should be 0", -1); 2894 # endif 2895 2896 #if INCLUDE_RTM_OPT 2897 } // use_rtm() 2898 #endif 2899 } 2900 2901 bind(cont); 2902 // flag == EQ indicates success 2903 // flag == NE indicates failure 2904 } 2905 2906 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box, 2907 Register temp, Register displaced_header, Register current_header, 2908 bool try_bias, bool use_rtm) { 2909 assert_different_registers(oop, box, temp, displaced_header, current_header); 2910 assert(flag != CCR0, "bad condition register"); 2911 Label cont; 2912 Label object_has_monitor; 2913 2914 // Always do locking in runtime. 2915 if (EmitSync & 0x01) { 2916 cmpdi(flag, oop, 0); // Oop can't be 0 here => always false. 2917 return; 2918 } 2919 2920 if (try_bias) { 2921 biased_locking_exit(flag, oop, current_header, cont); 2922 } 2923 2924 #if INCLUDE_RTM_OPT 2925 if (UseRTMForStackLocks && use_rtm) { 2926 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); 2927 Label L_regular_unlock; 2928 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); // fetch markword 2929 andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits 2930 cmpwi(flag, R0, markOopDesc::unlocked_value); // bits = 001 unlocked 2931 bne(flag, L_regular_unlock); // else RegularLock 2932 tend_(); // otherwise end... 2933 b(cont); // ... and we're done 2934 bind(L_regular_unlock); 2935 } 2936 #endif 2937 2938 // Find the lock address and load the displaced header from the stack. 2939 ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2940 2941 // If the displaced header is 0, we have a recursive unlock. 2942 cmpdi(flag, displaced_header, 0); 2943 beq(flag, cont); 2944 2945 // Handle existing monitor. 2946 if ((EmitSync & 0x02) == 0) { 2947 // The object has an existing monitor iff (mark & monitor_value) != 0. 2948 RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done 2949 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); 2950 andi_(R0, current_header, markOopDesc::monitor_value); 2951 bne(CCR0, object_has_monitor); 2952 } 2953 2954 // Check if it is still a light weight lock, this is is true if we see 2955 // the stack address of the basicLock in the markOop of the object. 2956 // Cmpxchg sets flag to cmpd(current_header, box). 2957 cmpxchgd(/*flag=*/flag, 2958 /*current_value=*/current_header, 2959 /*compare_value=*/box, 2960 /*exchange_value=*/displaced_header, 2961 /*where=*/oop, 2962 MacroAssembler::MemBarRel, 2963 MacroAssembler::cmpxchgx_hint_release_lock(), 2964 noreg, 2965 &cont); 2966 2967 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2968 2969 // Handle existing monitor. 2970 if ((EmitSync & 0x02) == 0) { 2971 b(cont); 2972 2973 bind(object_has_monitor); 2974 addi(current_header, current_header, -markOopDesc::monitor_value); // monitor 2975 ld(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); 2976 2977 // It's inflated. 2978 #if INCLUDE_RTM_OPT 2979 if (use_rtm) { 2980 Label L_regular_inflated_unlock; 2981 // Clean monitor_value bit to get valid pointer 2982 cmpdi(flag, temp, 0); 2983 bne(flag, L_regular_inflated_unlock); 2984 tend_(); 2985 b(cont); 2986 bind(L_regular_inflated_unlock); 2987 } 2988 #endif 2989 2990 ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header); 2991 xorr(temp, R16_thread, temp); // Will be 0 if we are the owner. 2992 orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions. 2993 cmpdi(flag, temp, 0); 2994 bne(flag, cont); 2995 2996 ld(temp, ObjectMonitor::EntryList_offset_in_bytes(), current_header); 2997 ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header); 2998 orr(temp, temp, displaced_header); // Will be 0 if both are 0. 2999 cmpdi(flag, temp, 0); 3000 bne(flag, cont); 3001 release(); 3002 std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); 3003 } 3004 3005 bind(cont); 3006 // flag == EQ indicates success 3007 // flag == NE indicates failure 3008 } 3009 3010 // Write serialization page so VM thread can do a pseudo remote membar. 3011 // We use the current thread pointer to calculate a thread specific 3012 // offset to write to within the page. This minimizes bus traffic 3013 // due to cache line collision. 3014 void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) { 3015 srdi(tmp2, thread, os::get_serialize_page_shift_count()); 3016 3017 int mask = os::vm_page_size() - sizeof(int); 3018 if (Assembler::is_simm(mask, 16)) { 3019 andi(tmp2, tmp2, mask); 3020 } else { 3021 lis(tmp1, (int)((signed short) (mask >> 16))); 3022 ori(tmp1, tmp1, mask & 0x0000ffff); 3023 andr(tmp2, tmp2, tmp1); 3024 } 3025 3026 load_const(tmp1, (long) os::get_memory_serialize_page()); 3027 release(); 3028 stwx(R0, tmp1, tmp2); 3029 } 3030 3031 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp_reg) { 3032 if (SafepointMechanism::uses_thread_local_poll()) { 3033 ld(temp_reg, in_bytes(Thread::polling_page_offset()), R16_thread); 3034 // Armed page has poll_bit set. 3035 andi_(temp_reg, temp_reg, SafepointMechanism::poll_bit()); 3036 } else { 3037 lwz(temp_reg, (RegisterOrConstant)(intptr_t)SafepointSynchronize::address_of_state()); 3038 cmpwi(CCR0, temp_reg, SafepointSynchronize::_not_synchronized); 3039 } 3040 bne(CCR0, slow_path); 3041 } 3042 3043 3044 // GC barrier helper macros 3045 3046 // Write the card table byte if needed. 3047 void MacroAssembler::card_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp) { 3048 CardTableBarrierSet* bs = 3049 barrier_set_cast<CardTableBarrierSet>(Universe::heap()->barrier_set()); 3050 assert(bs->kind() == BarrierSet::CardTableBarrierSet, "wrong barrier"); 3051 CardTable* ct = bs->card_table(); 3052 #ifdef ASSERT 3053 cmpdi(CCR0, Rnew_val, 0); 3054 asm_assert_ne("null oop not allowed", 0x321); 3055 #endif 3056 card_table_write(ct->byte_map_base(), Rtmp, Rstore_addr); 3057 } 3058 3059 // Write the card table byte. 3060 void MacroAssembler::card_table_write(jbyte* byte_map_base, Register Rtmp, Register Robj) { 3061 assert_different_registers(Robj, Rtmp, R0); 3062 load_const_optimized(Rtmp, (address)byte_map_base, R0); 3063 srdi(Robj, Robj, CardTable::card_shift); 3064 li(R0, 0); // dirty 3065 if (UseConcMarkSweepGC) membar(Assembler::StoreStore); 3066 stbx(R0, Rtmp, Robj); 3067 } 3068 3069 // Kills R31 if value is a volatile register. 3070 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, bool needs_frame) { 3071 Label done; 3072 cmpdi(CCR0, value, 0); 3073 beq(CCR0, done); // Use NULL as-is. 3074 3075 clrrdi(tmp1, value, JNIHandles::weak_tag_size); 3076 #if INCLUDE_ALL_GCS 3077 if (UseG1GC) { andi_(tmp2, value, JNIHandles::weak_tag_mask); } 3078 #endif 3079 ld(value, 0, tmp1); // Resolve (untagged) jobject. 3080 3081 #if INCLUDE_ALL_GCS 3082 if (UseG1GC) { 3083 Label not_weak; 3084 beq(CCR0, not_weak); // Test for jweak tag. 3085 verify_oop(value); 3086 g1_write_barrier_pre(noreg, // obj 3087 noreg, // offset 3088 value, // pre_val 3089 tmp1, tmp2, needs_frame); 3090 bind(not_weak); 3091 } 3092 #endif // INCLUDE_ALL_GCS 3093 verify_oop(value); 3094 bind(done); 3095 } 3096 3097 #if INCLUDE_ALL_GCS 3098 // General G1 pre-barrier generator. 3099 // Goal: record the previous value if it is not null. 3100 void MacroAssembler::g1_write_barrier_pre(Register Robj, RegisterOrConstant offset, Register Rpre_val, 3101 Register Rtmp1, Register Rtmp2, bool needs_frame) { 3102 Label runtime, filtered; 3103 3104 // Is marking active? 3105 if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) { 3106 lwz(Rtmp1, in_bytes(G1ThreadLocalData::satb_mark_queue_active_offset()), R16_thread); 3107 } else { 3108 guarantee(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption"); 3109 lbz(Rtmp1, in_bytes(G1ThreadLocalData::satb_mark_queue_active_offset()), R16_thread); 3110 } 3111 cmpdi(CCR0, Rtmp1, 0); 3112 beq(CCR0, filtered); 3113 3114 // Do we need to load the previous value? 3115 if (Robj != noreg) { 3116 // Load the previous value... 3117 if (UseCompressedOops) { 3118 lwz(Rpre_val, offset, Robj); 3119 } else { 3120 ld(Rpre_val, offset, Robj); 3121 } 3122 // Previous value has been loaded into Rpre_val. 3123 } 3124 assert(Rpre_val != noreg, "must have a real register"); 3125 3126 // Is the previous value null? 3127 cmpdi(CCR0, Rpre_val, 0); 3128 beq(CCR0, filtered); 3129 3130 if (Robj != noreg && UseCompressedOops) { 3131 decode_heap_oop_not_null(Rpre_val); 3132 } 3133 3134 // OK, it's not filtered, so we'll need to call enqueue. In the normal 3135 // case, pre_val will be a scratch G-reg, but there are some cases in 3136 // which it's an O-reg. In the first case, do a normal call. In the 3137 // latter, do a save here and call the frameless version. 3138 3139 // Can we store original value in the thread's buffer? 3140 // Is index == 0? 3141 // (The index field is typed as size_t.) 3142 const Register Rbuffer = Rtmp1, Rindex = Rtmp2; 3143 3144 ld(Rindex, in_bytes(G1ThreadLocalData::satb_mark_queue_index_offset()), R16_thread); 3145 cmpdi(CCR0, Rindex, 0); 3146 beq(CCR0, runtime); // If index == 0, goto runtime. 3147 ld(Rbuffer, in_bytes(G1ThreadLocalData::satb_mark_queue_buffer_offset()), R16_thread); 3148 3149 addi(Rindex, Rindex, -wordSize); // Decrement index. 3150 std(Rindex, in_bytes(G1ThreadLocalData::satb_mark_queue_index_offset()), R16_thread); 3151 3152 // Record the previous value. 3153 stdx(Rpre_val, Rbuffer, Rindex); 3154 b(filtered); 3155 3156 bind(runtime); 3157 3158 // May need to preserve LR. Also needed if current frame is not compatible with C calling convention. 3159 if (needs_frame) { 3160 save_LR_CR(Rtmp1); 3161 push_frame_reg_args(0, Rtmp2); 3162 } 3163 3164 if (Rpre_val->is_volatile() && Robj == noreg) mr(R31, Rpre_val); // Save pre_val across C call if it was preloaded. 3165 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), Rpre_val, R16_thread); 3166 if (Rpre_val->is_volatile() && Robj == noreg) mr(Rpre_val, R31); // restore 3167 3168 if (needs_frame) { 3169 pop_frame(); 3170 restore_LR_CR(Rtmp1); 3171 } 3172 3173 bind(filtered); 3174 } 3175 3176 // General G1 post-barrier generator 3177 // Store cross-region card. 3178 void MacroAssembler::g1_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp1, Register Rtmp2, Register Rtmp3, Label *filtered_ext) { 3179 Label runtime, filtered_int; 3180 Label& filtered = (filtered_ext != NULL) ? *filtered_ext : filtered_int; 3181 assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2); 3182 3183 G1BarrierSet* bs = 3184 barrier_set_cast<G1BarrierSet>(Universe::heap()->barrier_set()); 3185 CardTable* ct = bs->card_table(); 3186 3187 // Does store cross heap regions? 3188 if (G1RSBarrierRegionFilter) { 3189 xorr(Rtmp1, Rstore_addr, Rnew_val); 3190 srdi_(Rtmp1, Rtmp1, HeapRegion::LogOfHRGrainBytes); 3191 beq(CCR0, filtered); 3192 } 3193 3194 // Crosses regions, storing NULL? 3195 #ifdef ASSERT 3196 cmpdi(CCR0, Rnew_val, 0); 3197 asm_assert_ne("null oop not allowed (G1)", 0x322); // Checked by caller on PPC64, so following branch is obsolete: 3198 //beq(CCR0, filtered); 3199 #endif 3200 3201 // Storing region crossing non-NULL, is card already dirty? 3202 assert(sizeof(*ct->byte_map_base()) == sizeof(jbyte), "adjust this code"); 3203 const Register Rcard_addr = Rtmp1; 3204 Register Rbase = Rtmp2; 3205 load_const_optimized(Rbase, (address)ct->byte_map_base(), /*temp*/ Rtmp3); 3206 3207 srdi(Rcard_addr, Rstore_addr, CardTable::card_shift); 3208 3209 // Get the address of the card. 3210 lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr); 3211 cmpwi(CCR0, Rtmp3, (int)G1CardTable::g1_young_card_val()); 3212 beq(CCR0, filtered); 3213 3214 membar(Assembler::StoreLoad); 3215 lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr); // Reload after membar. 3216 cmpwi(CCR0, Rtmp3 /* card value */, CardTable::dirty_card_val()); 3217 beq(CCR0, filtered); 3218 3219 // Storing a region crossing, non-NULL oop, card is clean. 3220 // Dirty card and log. 3221 li(Rtmp3, CardTable::dirty_card_val()); 3222 //release(); // G1: oops are allowed to get visible after dirty marking. 3223 stbx(Rtmp3, Rbase, Rcard_addr); 3224 3225 add(Rcard_addr, Rbase, Rcard_addr); // This is the address which needs to get enqueued. 3226 Rbase = noreg; // end of lifetime 3227 3228 const Register Rqueue_index = Rtmp2, 3229 Rqueue_buf = Rtmp3; 3230 ld(Rqueue_index, in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset()), R16_thread); 3231 cmpdi(CCR0, Rqueue_index, 0); 3232 beq(CCR0, runtime); // index == 0 then jump to runtime 3233 ld(Rqueue_buf, in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset()), R16_thread); 3234 3235 addi(Rqueue_index, Rqueue_index, -wordSize); // decrement index 3236 std(Rqueue_index, in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset()), R16_thread); 3237 3238 stdx(Rcard_addr, Rqueue_buf, Rqueue_index); // store card 3239 b(filtered); 3240 3241 bind(runtime); 3242 3243 // Save the live input values. 3244 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), Rcard_addr, R16_thread); 3245 3246 bind(filtered_int); 3247 } 3248 #endif // INCLUDE_ALL_GCS 3249 3250 // Values for last_Java_pc, and last_Java_sp must comply to the rules 3251 // in frame_ppc.hpp. 3252 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) { 3253 // Always set last_Java_pc and flags first because once last_Java_sp 3254 // is visible has_last_Java_frame is true and users will look at the 3255 // rest of the fields. (Note: flags should always be zero before we 3256 // get here so doesn't need to be set.) 3257 3258 // Verify that last_Java_pc was zeroed on return to Java 3259 asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread, 3260 "last_Java_pc not zeroed before leaving Java", 0x200); 3261 3262 // When returning from calling out from Java mode the frame anchor's 3263 // last_Java_pc will always be set to NULL. It is set here so that 3264 // if we are doing a call to native (not VM) that we capture the 3265 // known pc and don't have to rely on the native call having a 3266 // standard frame linkage where we can find the pc. 3267 if (last_Java_pc != noreg) 3268 std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 3269 3270 // Set last_Java_sp last. 3271 std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 3272 } 3273 3274 void MacroAssembler::reset_last_Java_frame(void) { 3275 asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()), 3276 R16_thread, "SP was not set, still zero", 0x202); 3277 3278 BLOCK_COMMENT("reset_last_Java_frame {"); 3279 li(R0, 0); 3280 3281 // _last_Java_sp = 0 3282 std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 3283 3284 // _last_Java_pc = 0 3285 std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 3286 BLOCK_COMMENT("} reset_last_Java_frame"); 3287 } 3288 3289 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) { 3290 assert_different_registers(sp, tmp1); 3291 3292 // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via 3293 // TOP_IJAVA_FRAME_ABI. 3294 // FIXME: assert that we really have a TOP_IJAVA_FRAME here! 3295 address entry = pc(); 3296 load_const_optimized(tmp1, entry); 3297 3298 set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1); 3299 } 3300 3301 void MacroAssembler::get_vm_result(Register oop_result) { 3302 // Read: 3303 // R16_thread 3304 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3305 // 3306 // Updated: 3307 // oop_result 3308 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3309 3310 verify_thread(); 3311 3312 ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3313 li(R0, 0); 3314 std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3315 3316 verify_oop(oop_result); 3317 } 3318 3319 void MacroAssembler::get_vm_result_2(Register metadata_result) { 3320 // Read: 3321 // R16_thread 3322 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3323 // 3324 // Updated: 3325 // metadata_result 3326 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3327 3328 ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3329 li(R0, 0); 3330 std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3331 } 3332 3333 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) { 3334 Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided. 3335 if (Universe::narrow_klass_base() != 0) { 3336 // Use dst as temp if it is free. 3337 sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0); 3338 current = dst; 3339 } 3340 if (Universe::narrow_klass_shift() != 0) { 3341 srdi(dst, current, Universe::narrow_klass_shift()); 3342 current = dst; 3343 } 3344 return current; 3345 } 3346 3347 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) { 3348 if (UseCompressedClassPointers) { 3349 Register compressedKlass = encode_klass_not_null(ck, klass); 3350 stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop); 3351 } else { 3352 std(klass, oopDesc::klass_offset_in_bytes(), dst_oop); 3353 } 3354 } 3355 3356 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) { 3357 if (UseCompressedClassPointers) { 3358 if (val == noreg) { 3359 val = R0; 3360 li(val, 0); 3361 } 3362 stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed 3363 } 3364 } 3365 3366 int MacroAssembler::instr_size_for_decode_klass_not_null() { 3367 if (!UseCompressedClassPointers) return 0; 3368 int num_instrs = 1; // shift or move 3369 if (Universe::narrow_klass_base() != 0) num_instrs = 7; // shift + load const + add 3370 return num_instrs * BytesPerInstWord; 3371 } 3372 3373 void MacroAssembler::decode_klass_not_null(Register dst, Register src) { 3374 assert(dst != R0, "Dst reg may not be R0, as R0 is used here."); 3375 if (src == noreg) src = dst; 3376 Register shifted_src = src; 3377 if (Universe::narrow_klass_shift() != 0 || 3378 Universe::narrow_klass_base() == 0 && src != dst) { // Move required. 3379 shifted_src = dst; 3380 sldi(shifted_src, src, Universe::narrow_klass_shift()); 3381 } 3382 if (Universe::narrow_klass_base() != 0) { 3383 add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0); 3384 } 3385 } 3386 3387 void MacroAssembler::load_klass(Register dst, Register src) { 3388 if (UseCompressedClassPointers) { 3389 lwz(dst, oopDesc::klass_offset_in_bytes(), src); 3390 // Attention: no null check here! 3391 decode_klass_not_null(dst, dst); 3392 } else { 3393 ld(dst, oopDesc::klass_offset_in_bytes(), src); 3394 } 3395 } 3396 3397 // ((OopHandle)result).resolve(); 3398 void MacroAssembler::resolve_oop_handle(Register result) { 3399 // OopHandle::resolve is an indirection. 3400 ld(result, 0, result); 3401 } 3402 3403 void MacroAssembler::load_mirror_from_const_method(Register mirror, Register const_method) { 3404 ld(mirror, in_bytes(ConstMethod::constants_offset()), const_method); 3405 ld(mirror, ConstantPool::pool_holder_offset_in_bytes(), mirror); 3406 ld(mirror, in_bytes(Klass::java_mirror_offset()), mirror); 3407 resolve_oop_handle(mirror); 3408 } 3409 3410 // Clear Array 3411 // For very short arrays. tmp == R0 is allowed. 3412 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) { 3413 if (cnt_dwords > 0) { li(tmp, 0); } 3414 for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); } 3415 } 3416 3417 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed. 3418 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) { 3419 if (cnt_dwords < 8) { 3420 clear_memory_unrolled(base_ptr, cnt_dwords, tmp); 3421 return; 3422 } 3423 3424 Label loop; 3425 const long loopcnt = cnt_dwords >> 1, 3426 remainder = cnt_dwords & 1; 3427 3428 li(tmp, loopcnt); 3429 mtctr(tmp); 3430 li(tmp, 0); 3431 bind(loop); 3432 std(tmp, 0, base_ptr); 3433 std(tmp, 8, base_ptr); 3434 addi(base_ptr, base_ptr, 16); 3435 bdnz(loop); 3436 if (remainder) { std(tmp, 0, base_ptr); } 3437 } 3438 3439 // Kills both input registers. tmp == R0 is allowed. 3440 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) { 3441 // Procedure for large arrays (uses data cache block zero instruction). 3442 Label startloop, fast, fastloop, small_rest, restloop, done; 3443 const int cl_size = VM_Version::L1_data_cache_line_size(), 3444 cl_dwords = cl_size >> 3, 3445 cl_dw_addr_bits = exact_log2(cl_dwords), 3446 dcbz_min = 1, // Min count of dcbz executions, needs to be >0. 3447 min_cnt = ((dcbz_min + 1) << cl_dw_addr_bits) - 1; 3448 3449 if (const_cnt >= 0) { 3450 // Constant case. 3451 if (const_cnt < min_cnt) { 3452 clear_memory_constlen(base_ptr, const_cnt, tmp); 3453 return; 3454 } 3455 load_const_optimized(cnt_dwords, const_cnt, tmp); 3456 } else { 3457 // cnt_dwords already loaded in register. Need to check size. 3458 cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included). 3459 blt(CCR1, small_rest); 3460 } 3461 rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line. 3462 beq(CCR0, fast); // Already 128byte aligned. 3463 3464 subfic(tmp, tmp, cl_dwords); 3465 mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords). 3466 subf(cnt_dwords, tmp, cnt_dwords); // rest. 3467 li(tmp, 0); 3468 3469 bind(startloop); // Clear at the beginning to reach 128byte boundary. 3470 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3471 addi(base_ptr, base_ptr, 8); 3472 bdnz(startloop); 3473 3474 bind(fast); // Clear 128byte blocks. 3475 srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0). 3476 andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords. 3477 mtctr(tmp); // Load counter. 3478 3479 bind(fastloop); 3480 dcbz(base_ptr); // Clear 128byte aligned block. 3481 addi(base_ptr, base_ptr, cl_size); 3482 bdnz(fastloop); 3483 3484 bind(small_rest); 3485 cmpdi(CCR0, cnt_dwords, 0); // size 0? 3486 beq(CCR0, done); // rest == 0 3487 li(tmp, 0); 3488 mtctr(cnt_dwords); // Load counter. 3489 3490 bind(restloop); // Clear rest. 3491 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3492 addi(base_ptr, base_ptr, 8); 3493 bdnz(restloop); 3494 3495 bind(done); 3496 } 3497 3498 /////////////////////////////////////////// String intrinsics //////////////////////////////////////////// 3499 3500 #ifdef COMPILER2 3501 // Intrinsics for CompactStrings 3502 3503 // Compress char[] to byte[] by compressing 16 bytes at once. 3504 void MacroAssembler::string_compress_16(Register src, Register dst, Register cnt, 3505 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, 3506 Label& Lfailure) { 3507 3508 const Register tmp0 = R0; 3509 assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5); 3510 Label Lloop, Lslow; 3511 3512 // Check if cnt >= 8 (= 16 bytes) 3513 lis(tmp1, 0xFF); // tmp1 = 0x00FF00FF00FF00FF 3514 srwi_(tmp2, cnt, 3); 3515 beq(CCR0, Lslow); 3516 ori(tmp1, tmp1, 0xFF); 3517 rldimi(tmp1, tmp1, 32, 0); 3518 mtctr(tmp2); 3519 3520 // 2x unrolled loop 3521 bind(Lloop); 3522 ld(tmp2, 0, src); // _0_1_2_3 (Big Endian) 3523 ld(tmp4, 8, src); // _4_5_6_7 3524 3525 orr(tmp0, tmp2, tmp4); 3526 rldicl(tmp3, tmp2, 6*8, 64-24); // _____1_2 3527 rldimi(tmp2, tmp2, 2*8, 2*8); // _0_2_3_3 3528 rldicl(tmp5, tmp4, 6*8, 64-24); // _____5_6 3529 rldimi(tmp4, tmp4, 2*8, 2*8); // _4_6_7_7 3530 3531 andc_(tmp0, tmp0, tmp1); 3532 bne(CCR0, Lfailure); // Not latin1. 3533 addi(src, src, 16); 3534 3535 rlwimi(tmp3, tmp2, 0*8, 24, 31);// _____1_3 3536 srdi(tmp2, tmp2, 3*8); // ____0_2_ 3537 rlwimi(tmp5, tmp4, 0*8, 24, 31);// _____5_7 3538 srdi(tmp4, tmp4, 3*8); // ____4_6_ 3539 3540 orr(tmp2, tmp2, tmp3); // ____0123 3541 orr(tmp4, tmp4, tmp5); // ____4567 3542 3543 stw(tmp2, 0, dst); 3544 stw(tmp4, 4, dst); 3545 addi(dst, dst, 8); 3546 bdnz(Lloop); 3547 3548 bind(Lslow); // Fallback to slow version 3549 } 3550 3551 // Compress char[] to byte[]. cnt must be positive int. 3552 void MacroAssembler::string_compress(Register src, Register dst, Register cnt, Register tmp, Label& Lfailure) { 3553 Label Lloop; 3554 mtctr(cnt); 3555 3556 bind(Lloop); 3557 lhz(tmp, 0, src); 3558 cmplwi(CCR0, tmp, 0xff); 3559 bgt(CCR0, Lfailure); // Not latin1. 3560 addi(src, src, 2); 3561 stb(tmp, 0, dst); 3562 addi(dst, dst, 1); 3563 bdnz(Lloop); 3564 } 3565 3566 // Inflate byte[] to char[] by inflating 16 bytes at once. 3567 void MacroAssembler::string_inflate_16(Register src, Register dst, Register cnt, 3568 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) { 3569 const Register tmp0 = R0; 3570 assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5); 3571 Label Lloop, Lslow; 3572 3573 // Check if cnt >= 8 3574 srwi_(tmp2, cnt, 3); 3575 beq(CCR0, Lslow); 3576 lis(tmp1, 0xFF); // tmp1 = 0x00FF00FF 3577 ori(tmp1, tmp1, 0xFF); 3578 mtctr(tmp2); 3579 3580 // 2x unrolled loop 3581 bind(Lloop); 3582 lwz(tmp2, 0, src); // ____0123 (Big Endian) 3583 lwz(tmp4, 4, src); // ____4567 3584 addi(src, src, 8); 3585 3586 rldicl(tmp3, tmp2, 7*8, 64-8); // _______2 3587 rlwimi(tmp2, tmp2, 3*8, 16, 23);// ____0113 3588 rldicl(tmp5, tmp4, 7*8, 64-8); // _______6 3589 rlwimi(tmp4, tmp4, 3*8, 16, 23);// ____4557 3590 3591 andc(tmp0, tmp2, tmp1); // ____0_1_ 3592 rlwimi(tmp2, tmp3, 2*8, 0, 23); // _____2_3 3593 andc(tmp3, tmp4, tmp1); // ____4_5_ 3594 rlwimi(tmp4, tmp5, 2*8, 0, 23); // _____6_7 3595 3596 rldimi(tmp2, tmp0, 3*8, 0*8); // _0_1_2_3 3597 rldimi(tmp4, tmp3, 3*8, 0*8); // _4_5_6_7 3598 3599 std(tmp2, 0, dst); 3600 std(tmp4, 8, dst); 3601 addi(dst, dst, 16); 3602 bdnz(Lloop); 3603 3604 bind(Lslow); // Fallback to slow version 3605 } 3606 3607 // Inflate byte[] to char[]. cnt must be positive int. 3608 void MacroAssembler::string_inflate(Register src, Register dst, Register cnt, Register tmp) { 3609 Label Lloop; 3610 mtctr(cnt); 3611 3612 bind(Lloop); 3613 lbz(tmp, 0, src); 3614 addi(src, src, 1); 3615 sth(tmp, 0, dst); 3616 addi(dst, dst, 2); 3617 bdnz(Lloop); 3618 } 3619 3620 void MacroAssembler::string_compare(Register str1, Register str2, 3621 Register cnt1, Register cnt2, 3622 Register tmp1, Register result, int ae) { 3623 const Register tmp0 = R0, 3624 diff = tmp1; 3625 3626 assert_different_registers(str1, str2, cnt1, cnt2, tmp0, tmp1, result); 3627 Label Ldone, Lslow, Lloop, Lreturn_diff; 3628 3629 // Note: Making use of the fact that compareTo(a, b) == -compareTo(b, a) 3630 // we interchange str1 and str2 in the UL case and negate the result. 3631 // Like this, str1 is always latin1 encoded, except for the UU case. 3632 // In addition, we need 0 (or sign which is 0) extend. 3633 3634 if (ae == StrIntrinsicNode::UU) { 3635 srwi(cnt1, cnt1, 1); 3636 } else { 3637 clrldi(cnt1, cnt1, 32); 3638 } 3639 3640 if (ae != StrIntrinsicNode::LL) { 3641 srwi(cnt2, cnt2, 1); 3642 } else { 3643 clrldi(cnt2, cnt2, 32); 3644 } 3645 3646 // See if the lengths are different, and calculate min in cnt1. 3647 // Save diff in case we need it for a tie-breaker. 3648 subf_(diff, cnt2, cnt1); // diff = cnt1 - cnt2 3649 // if (diff > 0) { cnt1 = cnt2; } 3650 if (VM_Version::has_isel()) { 3651 isel(cnt1, CCR0, Assembler::greater, /*invert*/ false, cnt2); 3652 } else { 3653 Label Lskip; 3654 blt(CCR0, Lskip); 3655 mr(cnt1, cnt2); 3656 bind(Lskip); 3657 } 3658 3659 // Rename registers 3660 Register chr1 = result; 3661 Register chr2 = tmp0; 3662 3663 // Compare multiple characters in fast loop (only implemented for same encoding). 3664 int stride1 = 8, stride2 = 8; 3665 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3666 int log2_chars_per_iter = (ae == StrIntrinsicNode::LL) ? 3 : 2; 3667 Label Lfastloop, Lskipfast; 3668 3669 srwi_(tmp0, cnt1, log2_chars_per_iter); 3670 beq(CCR0, Lskipfast); 3671 rldicl(cnt2, cnt1, 0, 64 - log2_chars_per_iter); // Remaining characters. 3672 li(cnt1, 1 << log2_chars_per_iter); // Initialize for failure case: Rescan characters from current iteration. 3673 mtctr(tmp0); 3674 3675 bind(Lfastloop); 3676 ld(chr1, 0, str1); 3677 ld(chr2, 0, str2); 3678 cmpd(CCR0, chr1, chr2); 3679 bne(CCR0, Lslow); 3680 addi(str1, str1, stride1); 3681 addi(str2, str2, stride2); 3682 bdnz(Lfastloop); 3683 mr(cnt1, cnt2); // Remaining characters. 3684 bind(Lskipfast); 3685 } 3686 3687 // Loop which searches the first difference character by character. 3688 cmpwi(CCR0, cnt1, 0); 3689 beq(CCR0, Lreturn_diff); 3690 bind(Lslow); 3691 mtctr(cnt1); 3692 3693 switch (ae) { 3694 case StrIntrinsicNode::LL: stride1 = 1; stride2 = 1; break; 3695 case StrIntrinsicNode::UL: // fallthru (see comment above) 3696 case StrIntrinsicNode::LU: stride1 = 1; stride2 = 2; break; 3697 case StrIntrinsicNode::UU: stride1 = 2; stride2 = 2; break; 3698 default: ShouldNotReachHere(); break; 3699 } 3700 3701 bind(Lloop); 3702 if (stride1 == 1) { lbz(chr1, 0, str1); } else { lhz(chr1, 0, str1); } 3703 if (stride2 == 1) { lbz(chr2, 0, str2); } else { lhz(chr2, 0, str2); } 3704 subf_(result, chr2, chr1); // result = chr1 - chr2 3705 bne(CCR0, Ldone); 3706 addi(str1, str1, stride1); 3707 addi(str2, str2, stride2); 3708 bdnz(Lloop); 3709 3710 // If strings are equal up to min length, return the length difference. 3711 bind(Lreturn_diff); 3712 mr(result, diff); 3713 3714 // Otherwise, return the difference between the first mismatched chars. 3715 bind(Ldone); 3716 if (ae == StrIntrinsicNode::UL) { 3717 neg(result, result); // Negate result (see note above). 3718 } 3719 } 3720 3721 void MacroAssembler::array_equals(bool is_array_equ, Register ary1, Register ary2, 3722 Register limit, Register tmp1, Register result, bool is_byte) { 3723 const Register tmp0 = R0; 3724 assert_different_registers(ary1, ary2, limit, tmp0, tmp1, result); 3725 Label Ldone, Lskiploop, Lloop, Lfastloop, Lskipfast; 3726 bool limit_needs_shift = false; 3727 3728 if (is_array_equ) { 3729 const int length_offset = arrayOopDesc::length_offset_in_bytes(); 3730 const int base_offset = arrayOopDesc::base_offset_in_bytes(is_byte ? T_BYTE : T_CHAR); 3731 3732 // Return true if the same array. 3733 cmpd(CCR0, ary1, ary2); 3734 beq(CCR0, Lskiploop); 3735 3736 // Return false if one of them is NULL. 3737 cmpdi(CCR0, ary1, 0); 3738 cmpdi(CCR1, ary2, 0); 3739 li(result, 0); 3740 cror(CCR0, Assembler::equal, CCR1, Assembler::equal); 3741 beq(CCR0, Ldone); 3742 3743 // Load the lengths of arrays. 3744 lwz(limit, length_offset, ary1); 3745 lwz(tmp0, length_offset, ary2); 3746 3747 // Return false if the two arrays are not equal length. 3748 cmpw(CCR0, limit, tmp0); 3749 bne(CCR0, Ldone); 3750 3751 // Load array addresses. 3752 addi(ary1, ary1, base_offset); 3753 addi(ary2, ary2, base_offset); 3754 } else { 3755 limit_needs_shift = !is_byte; 3756 li(result, 0); // Assume not equal. 3757 } 3758 3759 // Rename registers 3760 Register chr1 = tmp0; 3761 Register chr2 = tmp1; 3762 3763 // Compare 8 bytes per iteration in fast loop. 3764 const int log2_chars_per_iter = is_byte ? 3 : 2; 3765 3766 srwi_(tmp0, limit, log2_chars_per_iter + (limit_needs_shift ? 1 : 0)); 3767 beq(CCR0, Lskipfast); 3768 mtctr(tmp0); 3769 3770 bind(Lfastloop); 3771 ld(chr1, 0, ary1); 3772 ld(chr2, 0, ary2); 3773 addi(ary1, ary1, 8); 3774 addi(ary2, ary2, 8); 3775 cmpd(CCR0, chr1, chr2); 3776 bne(CCR0, Ldone); 3777 bdnz(Lfastloop); 3778 3779 bind(Lskipfast); 3780 rldicl_(limit, limit, limit_needs_shift ? 64 - 1 : 0, 64 - log2_chars_per_iter); // Remaining characters. 3781 beq(CCR0, Lskiploop); 3782 mtctr(limit); 3783 3784 // Character by character. 3785 bind(Lloop); 3786 if (is_byte) { 3787 lbz(chr1, 0, ary1); 3788 lbz(chr2, 0, ary2); 3789 addi(ary1, ary1, 1); 3790 addi(ary2, ary2, 1); 3791 } else { 3792 lhz(chr1, 0, ary1); 3793 lhz(chr2, 0, ary2); 3794 addi(ary1, ary1, 2); 3795 addi(ary2, ary2, 2); 3796 } 3797 cmpw(CCR0, chr1, chr2); 3798 bne(CCR0, Ldone); 3799 bdnz(Lloop); 3800 3801 bind(Lskiploop); 3802 li(result, 1); // All characters are equal. 3803 bind(Ldone); 3804 } 3805 3806 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt, 3807 Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval, 3808 Register tmp1, Register tmp2, Register tmp3, Register tmp4, int ae) { 3809 3810 // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite! 3811 Label L_TooShort, L_Found, L_NotFound, L_End; 3812 Register last_addr = haycnt, // Kill haycnt at the beginning. 3813 addr = tmp1, 3814 n_start = tmp2, 3815 ch1 = tmp3, 3816 ch2 = R0; 3817 3818 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 3819 const int h_csize = (ae == StrIntrinsicNode::LL) ? 1 : 2; 3820 const int n_csize = (ae == StrIntrinsicNode::UU) ? 2 : 1; 3821 3822 // ************************************************************************************************** 3823 // Prepare for main loop: optimized for needle count >=2, bail out otherwise. 3824 // ************************************************************************************************** 3825 3826 // Compute last haystack addr to use if no match gets found. 3827 clrldi(haycnt, haycnt, 32); // Ensure positive int is valid as 64 bit value. 3828 addi(addr, haystack, -h_csize); // Accesses use pre-increment. 3829 if (needlecntval == 0) { // variable needlecnt 3830 cmpwi(CCR6, needlecnt, 2); 3831 clrldi(needlecnt, needlecnt, 32); // Ensure positive int is valid as 64 bit value. 3832 blt(CCR6, L_TooShort); // Variable needlecnt: handle short needle separately. 3833 } 3834 3835 if (n_csize == 2) { lwz(n_start, 0, needle); } else { lhz(n_start, 0, needle); } // Load first 2 characters of needle. 3836 3837 if (needlecntval == 0) { // variable needlecnt 3838 subf(ch1, needlecnt, haycnt); // Last character index to compare is haycnt-needlecnt. 3839 addi(needlecnt, needlecnt, -2); // Rest of needle. 3840 } else { // constant needlecnt 3841 guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately"); 3842 assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate"); 3843 addi(ch1, haycnt, -needlecntval); // Last character index to compare is haycnt-needlecnt. 3844 if (needlecntval > 3) { li(needlecnt, needlecntval - 2); } // Rest of needle. 3845 } 3846 3847 if (h_csize == 2) { slwi(ch1, ch1, 1); } // Scale to number of bytes. 3848 3849 if (ae ==StrIntrinsicNode::UL) { 3850 srwi(tmp4, n_start, 1*8); // ___0 3851 rlwimi(n_start, tmp4, 2*8, 0, 23); // _0_1 3852 } 3853 3854 add(last_addr, haystack, ch1); // Point to last address to compare (haystack+2*(haycnt-needlecnt)). 3855 3856 // Main Loop (now we have at least 2 characters). 3857 Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2; 3858 bind(L_OuterLoop); // Search for 1st 2 characters. 3859 Register addr_diff = tmp4; 3860 subf(addr_diff, addr, last_addr); // Difference between already checked address and last address to check. 3861 addi(addr, addr, h_csize); // This is the new address we want to use for comparing. 3862 srdi_(ch2, addr_diff, h_csize); 3863 beq(CCR0, L_FinalCheck); // 2 characters left? 3864 mtctr(ch2); // num of characters / 2 3865 bind(L_InnerLoop); // Main work horse (2x unrolled search loop) 3866 if (h_csize == 2) { // Load 2 characters of haystack (ignore alignment). 3867 lwz(ch1, 0, addr); 3868 lwz(ch2, 2, addr); 3869 } else { 3870 lhz(ch1, 0, addr); 3871 lhz(ch2, 1, addr); 3872 } 3873 cmpw(CCR0, ch1, n_start); // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop). 3874 cmpw(CCR1, ch2, n_start); 3875 beq(CCR0, L_Comp1); // Did we find the needle start? 3876 beq(CCR1, L_Comp2); 3877 addi(addr, addr, 2 * h_csize); 3878 bdnz(L_InnerLoop); 3879 bind(L_FinalCheck); 3880 andi_(addr_diff, addr_diff, h_csize); // Remaining characters not covered by InnerLoop: (num of characters) & 1. 3881 beq(CCR0, L_NotFound); 3882 if (h_csize == 2) { lwz(ch1, 0, addr); } else { lhz(ch1, 0, addr); } // One position left at which we have to compare. 3883 cmpw(CCR1, ch1, n_start); 3884 beq(CCR1, L_Comp1); 3885 bind(L_NotFound); 3886 li(result, -1); // not found 3887 b(L_End); 3888 3889 // ************************************************************************************************** 3890 // Special Case: unfortunately, the variable needle case can be called with needlecnt<2 3891 // ************************************************************************************************** 3892 if (needlecntval == 0) { // We have to handle these cases separately. 3893 Label L_OneCharLoop; 3894 bind(L_TooShort); 3895 mtctr(haycnt); 3896 if (n_csize == 2) { lhz(n_start, 0, needle); } else { lbz(n_start, 0, needle); } // First character of needle 3897 bind(L_OneCharLoop); 3898 if (h_csize == 2) { lhzu(ch1, 2, addr); } else { lbzu(ch1, 1, addr); } 3899 cmpw(CCR1, ch1, n_start); 3900 beq(CCR1, L_Found); // Did we find the one character needle? 3901 bdnz(L_OneCharLoop); 3902 li(result, -1); // Not found. 3903 b(L_End); 3904 } 3905 3906 // ************************************************************************************************** 3907 // Regular Case Part II: compare rest of needle (first 2 characters have been compared already) 3908 // ************************************************************************************************** 3909 3910 // Compare the rest 3911 bind(L_Comp2); 3912 addi(addr, addr, h_csize); // First comparison has failed, 2nd one hit. 3913 bind(L_Comp1); // Addr points to possible needle start. 3914 if (needlecntval != 2) { // Const needlecnt==2? 3915 if (needlecntval != 3) { 3916 if (needlecntval == 0) { beq(CCR6, L_Found); } // Variable needlecnt==2? 3917 Register n_ind = tmp4, 3918 h_ind = n_ind; 3919 li(n_ind, 2 * n_csize); // First 2 characters are already compared, use index 2. 3920 mtctr(needlecnt); // Decremented by 2, still > 0. 3921 Label L_CompLoop; 3922 bind(L_CompLoop); 3923 if (ae ==StrIntrinsicNode::UL) { 3924 h_ind = ch1; 3925 sldi(h_ind, n_ind, 1); 3926 } 3927 if (n_csize == 2) { lhzx(ch2, needle, n_ind); } else { lbzx(ch2, needle, n_ind); } 3928 if (h_csize == 2) { lhzx(ch1, addr, h_ind); } else { lbzx(ch1, addr, h_ind); } 3929 cmpw(CCR1, ch1, ch2); 3930 bne(CCR1, L_OuterLoop); 3931 addi(n_ind, n_ind, n_csize); 3932 bdnz(L_CompLoop); 3933 } else { // No loop required if there's only one needle character left. 3934 if (n_csize == 2) { lhz(ch2, 2 * 2, needle); } else { lbz(ch2, 2 * 1, needle); } 3935 if (h_csize == 2) { lhz(ch1, 2 * 2, addr); } else { lbz(ch1, 2 * 1, addr); } 3936 cmpw(CCR1, ch1, ch2); 3937 bne(CCR1, L_OuterLoop); 3938 } 3939 } 3940 // Return index ... 3941 bind(L_Found); 3942 subf(result, haystack, addr); // relative to haystack, ... 3943 if (h_csize == 2) { srdi(result, result, 1); } // in characters. 3944 bind(L_End); 3945 } // string_indexof 3946 3947 void MacroAssembler::string_indexof_char(Register result, Register haystack, Register haycnt, 3948 Register needle, jchar needleChar, Register tmp1, Register tmp2, bool is_byte) { 3949 assert_different_registers(haystack, haycnt, needle, tmp1, tmp2); 3950 3951 Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_NotFound, L_End; 3952 Register addr = tmp1, 3953 ch1 = tmp2, 3954 ch2 = R0; 3955 3956 const int h_csize = is_byte ? 1 : 2; 3957 3958 //4: 3959 srwi_(tmp2, haycnt, 1); // Shift right by exact_log2(UNROLL_FACTOR). 3960 mr(addr, haystack); 3961 beq(CCR0, L_FinalCheck); 3962 mtctr(tmp2); // Move to count register. 3963 //8: 3964 bind(L_InnerLoop); // Main work horse (2x unrolled search loop). 3965 if (!is_byte) { 3966 lhz(ch1, 0, addr); 3967 lhz(ch2, 2, addr); 3968 } else { 3969 lbz(ch1, 0, addr); 3970 lbz(ch2, 1, addr); 3971 } 3972 (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, (unsigned int)needleChar); 3973 (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, (unsigned int)needleChar); 3974 beq(CCR0, L_Found1); // Did we find the needle? 3975 beq(CCR1, L_Found2); 3976 addi(addr, addr, 2 * h_csize); 3977 bdnz(L_InnerLoop); 3978 //16: 3979 bind(L_FinalCheck); 3980 andi_(R0, haycnt, 1); 3981 beq(CCR0, L_NotFound); 3982 if (!is_byte) { lhz(ch1, 0, addr); } else { lbz(ch1, 0, addr); } // One position left at which we have to compare. 3983 (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, (unsigned int)needleChar); 3984 beq(CCR1, L_Found1); 3985 //21: 3986 bind(L_NotFound); 3987 li(result, -1); // Not found. 3988 b(L_End); 3989 3990 bind(L_Found2); 3991 addi(addr, addr, h_csize); 3992 //24: 3993 bind(L_Found1); // Return index ... 3994 subf(result, haystack, addr); // relative to haystack, ... 3995 if (!is_byte) { srdi(result, result, 1); } // in characters. 3996 bind(L_End); 3997 } // string_indexof_char 3998 3999 4000 void MacroAssembler::has_negatives(Register src, Register cnt, Register result, 4001 Register tmp1, Register tmp2) { 4002 const Register tmp0 = R0; 4003 assert_different_registers(src, result, cnt, tmp0, tmp1, tmp2); 4004 Label Lfastloop, Lslow, Lloop, Lnoneg, Ldone; 4005 4006 // Check if cnt >= 8 (= 16 bytes) 4007 lis(tmp1, (int)(short)0x8080); // tmp1 = 0x8080808080808080 4008 srwi_(tmp2, cnt, 4); 4009 li(result, 1); // Assume there's a negative byte. 4010 beq(CCR0, Lslow); 4011 ori(tmp1, tmp1, 0x8080); 4012 rldimi(tmp1, tmp1, 32, 0); 4013 mtctr(tmp2); 4014 4015 // 2x unrolled loop 4016 bind(Lfastloop); 4017 ld(tmp2, 0, src); 4018 ld(tmp0, 8, src); 4019 4020 orr(tmp0, tmp2, tmp0); 4021 4022 and_(tmp0, tmp0, tmp1); 4023 bne(CCR0, Ldone); // Found negative byte. 4024 addi(src, src, 16); 4025 4026 bdnz(Lfastloop); 4027 4028 bind(Lslow); // Fallback to slow version 4029 rldicl_(tmp0, cnt, 0, 64-4); 4030 beq(CCR0, Lnoneg); 4031 mtctr(tmp0); 4032 bind(Lloop); 4033 lbz(tmp0, 0, src); 4034 addi(src, src, 1); 4035 andi_(tmp0, tmp0, 0x80); 4036 bne(CCR0, Ldone); // Found negative byte. 4037 bdnz(Lloop); 4038 bind(Lnoneg); 4039 li(result, 0); 4040 4041 bind(Ldone); 4042 } 4043 4044 #endif // Compiler2 4045 4046 // Helpers for Intrinsic Emitters 4047 // 4048 // Revert the byte order of a 32bit value in a register 4049 // src: 0x44556677 4050 // dst: 0x77665544 4051 // Three steps to obtain the result: 4052 // 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word 4053 // into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared. 4054 // This value initializes dst. 4055 // 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost 4056 // byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go. 4057 // This value is mask inserted into dst with a [0..23] mask of 1s. 4058 // 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position. 4059 // This value is mask inserted into dst with a [8..15] mask of 1s. 4060 void MacroAssembler::load_reverse_32(Register dst, Register src) { 4061 assert_different_registers(dst, src); 4062 4063 rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left. 4064 rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone. 4065 rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone. 4066 } 4067 4068 // Calculate the column addresses of the crc32 lookup table into distinct registers. 4069 // This loop-invariant calculation is moved out of the loop body, reducing the loop 4070 // body size from 20 to 16 instructions. 4071 // Returns the offset that was used to calculate the address of column tc3. 4072 // Due to register shortage, setting tc3 may overwrite table. With the return offset 4073 // at hand, the original table address can be easily reconstructed. 4074 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) { 4075 4076 #ifdef VM_LITTLE_ENDIAN 4077 // This is what we implement (the DOLIT4 part): 4078 // ========================================================================= */ 4079 // #define DOLIT4 c ^= *buf4++; \ 4080 // c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \ 4081 // crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24] 4082 // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4 4083 // ========================================================================= */ 4084 const int ix0 = 3*(4*CRC32_COLUMN_SIZE); 4085 const int ix1 = 2*(4*CRC32_COLUMN_SIZE); 4086 const int ix2 = 1*(4*CRC32_COLUMN_SIZE); 4087 const int ix3 = 0*(4*CRC32_COLUMN_SIZE); 4088 #else 4089 // This is what we implement (the DOBIG4 part): 4090 // ========================================================================= 4091 // #define DOBIG4 c ^= *++buf4; \ 4092 // c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \ 4093 // crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24] 4094 // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4 4095 // ========================================================================= 4096 const int ix0 = 4*(4*CRC32_COLUMN_SIZE); 4097 const int ix1 = 5*(4*CRC32_COLUMN_SIZE); 4098 const int ix2 = 6*(4*CRC32_COLUMN_SIZE); 4099 const int ix3 = 7*(4*CRC32_COLUMN_SIZE); 4100 #endif 4101 assert_different_registers(table, tc0, tc1, tc2); 4102 assert(table == tc3, "must be!"); 4103 4104 addi(tc0, table, ix0); 4105 addi(tc1, table, ix1); 4106 addi(tc2, table, ix2); 4107 if (ix3 != 0) addi(tc3, table, ix3); 4108 4109 return ix3; 4110 } 4111 4112 /** 4113 * uint32_t crc; 4114 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8); 4115 */ 4116 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) { 4117 assert_different_registers(crc, table, tmp); 4118 assert_different_registers(val, table); 4119 4120 if (crc == val) { // Must rotate first to use the unmodified value. 4121 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 4122 // As we use a word (4-byte) instruction, we have to adapt the mask bit positions. 4123 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 4124 } else { 4125 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 4126 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 4127 } 4128 lwzx(tmp, table, tmp); 4129 xorr(crc, crc, tmp); 4130 } 4131 4132 /** 4133 * uint32_t crc; 4134 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8); 4135 */ 4136 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) { 4137 fold_byte_crc32(crc, crc, table, tmp); 4138 } 4139 4140 /** 4141 * Emits code to update CRC-32 with a byte value according to constants in table. 4142 * 4143 * @param [in,out]crc Register containing the crc. 4144 * @param [in]val Register containing the byte to fold into the CRC. 4145 * @param [in]table Register containing the table of crc constants. 4146 * 4147 * uint32_t crc; 4148 * val = crc_table[(val ^ crc) & 0xFF]; 4149 * crc = val ^ (crc >> 8); 4150 */ 4151 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 4152 BLOCK_COMMENT("update_byte_crc32:"); 4153 xorr(val, val, crc); 4154 fold_byte_crc32(crc, val, table, val); 4155 } 4156 4157 /** 4158 * @param crc register containing existing CRC (32-bit) 4159 * @param buf register pointing to input byte buffer (byte*) 4160 * @param len register containing number of bytes 4161 * @param table register pointing to CRC table 4162 */ 4163 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table, 4164 Register data, bool loopAlignment) { 4165 assert_different_registers(crc, buf, len, table, data); 4166 4167 Label L_mainLoop, L_done; 4168 const int mainLoop_stepping = 1; 4169 const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4; 4170 4171 // Process all bytes in a single-byte loop. 4172 clrldi_(len, len, 32); // Enforce 32 bit. Anything to do? 4173 beq(CCR0, L_done); 4174 4175 mtctr(len); 4176 align(mainLoop_alignment); 4177 BIND(L_mainLoop); 4178 lbz(data, 0, buf); // Byte from buffer, zero-extended. 4179 addi(buf, buf, mainLoop_stepping); // Advance buffer position. 4180 update_byte_crc32(crc, data, table); 4181 bdnz(L_mainLoop); // Iterate. 4182 4183 bind(L_done); 4184 } 4185 4186 /** 4187 * Emits code to update CRC-32 with a 4-byte value according to constants in table 4188 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c 4189 */ 4190 // A not on the lookup table address(es): 4191 // The lookup table consists of two sets of four columns each. 4192 // The columns {0..3} are used for little-endian machines. 4193 // The columns {4..7} are used for big-endian machines. 4194 // To save the effort of adding the column offset to the table address each time 4195 // a table element is looked up, it is possible to pass the pre-calculated 4196 // column addresses. 4197 // Uses R9..R12 as work register. Must be saved/restored by caller, if necessary. 4198 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc, 4199 Register t0, Register t1, Register t2, Register t3, 4200 Register tc0, Register tc1, Register tc2, Register tc3) { 4201 assert_different_registers(crc, t3); 4202 4203 // XOR crc with next four bytes of buffer. 4204 lwz(t3, bufDisp, buf); 4205 if (bufInc != 0) { 4206 addi(buf, buf, bufInc); 4207 } 4208 xorr(t3, t3, crc); 4209 4210 // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices. 4211 rlwinm(t0, t3, 2, 24-2, 31-2); // ((t1 >> 0) & 0xff) << 2 4212 rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t1 >> 8) & 0xff) << 2 4213 rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t1 >> 16) & 0xff) << 2 4214 rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t1 >> 24) & 0xff) << 2 4215 4216 // Use the pre-calculated column addresses. 4217 // Load pre-calculated table values. 4218 lwzx(t0, tc0, t0); 4219 lwzx(t1, tc1, t1); 4220 lwzx(t2, tc2, t2); 4221 lwzx(t3, tc3, t3); 4222 4223 // Calculate new crc from table values. 4224 xorr(t0, t0, t1); 4225 xorr(t2, t2, t3); 4226 xorr(crc, t0, t2); // Now crc contains the final checksum value. 4227 } 4228 4229 /** 4230 * @param crc register containing existing CRC (32-bit) 4231 * @param buf register pointing to input byte buffer (byte*) 4232 * @param len register containing number of bytes 4233 * @param table register pointing to CRC table 4234 * 4235 * Uses R9..R12 as work register. Must be saved/restored by caller! 4236 */ 4237 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table, 4238 Register t0, Register t1, Register t2, Register t3, 4239 Register tc0, Register tc1, Register tc2, Register tc3, 4240 bool invertCRC) { 4241 assert_different_registers(crc, buf, len, table); 4242 4243 Label L_mainLoop, L_tail; 4244 Register tmp = t0; 4245 Register data = t0; 4246 Register tmp2 = t1; 4247 const int mainLoop_stepping = 8; 4248 const int tailLoop_stepping = 1; 4249 const int log_stepping = exact_log2(mainLoop_stepping); 4250 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32; 4251 const int complexThreshold = 2*mainLoop_stepping; 4252 4253 // Don't test for len <= 0 here. This pathological case should not occur anyway. 4254 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles 4255 // for all well-behaved cases. The situation itself is detected and handled correctly 4256 // within update_byteLoop_crc32. 4257 assert(tailLoop_stepping == 1, "check tailLoop_stepping!"); 4258 4259 BLOCK_COMMENT("kernel_crc32_2word {"); 4260 4261 if (invertCRC) { 4262 nand(crc, crc, crc); // 1s complement of crc 4263 } 4264 4265 // Check for short (<mainLoop_stepping) buffer. 4266 cmpdi(CCR0, len, complexThreshold); 4267 blt(CCR0, L_tail); 4268 4269 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance. 4270 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions. 4271 { 4272 // Align buf addr to mainLoop_stepping boundary. 4273 neg(tmp2, buf); // Calculate # preLoop iterations for alignment. 4274 rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63. 4275 4276 if (complexThreshold > mainLoop_stepping) { 4277 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 4278 } else { 4279 sub(tmp, len, tmp2); // Remaining bytes for main loop. 4280 cmpdi(CCR0, tmp, mainLoop_stepping); 4281 blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing 4282 mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 4283 } 4284 update_byteLoop_crc32(crc, buf, tmp2, table, data, false); 4285 } 4286 4287 srdi(tmp2, len, log_stepping); // #iterations for mainLoop 4288 andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop 4289 mtctr(tmp2); 4290 4291 #ifdef VM_LITTLE_ENDIAN 4292 Register crc_rv = crc; 4293 #else 4294 Register crc_rv = tmp; // Load_reverse needs separate registers to work on. 4295 // Occupies tmp, but frees up crc. 4296 load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data. 4297 tmp = crc; 4298 #endif 4299 4300 int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3); 4301 4302 align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement. 4303 BIND(L_mainLoop); 4304 update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3); 4305 update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3); 4306 bdnz(L_mainLoop); 4307 4308 #ifndef VM_LITTLE_ENDIAN 4309 load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data. 4310 tmp = crc_rv; // Tmp uses it's original register again. 4311 #endif 4312 4313 // Restore original table address for tailLoop. 4314 if (reconstructTableOffset != 0) { 4315 addi(table, table, -reconstructTableOffset); 4316 } 4317 4318 // Process last few (<complexThreshold) bytes of buffer. 4319 BIND(L_tail); 4320 update_byteLoop_crc32(crc, buf, len, table, data, false); 4321 4322 if (invertCRC) { 4323 nand(crc, crc, crc); // 1s complement of crc 4324 } 4325 BLOCK_COMMENT("} kernel_crc32_2word"); 4326 } 4327 4328 /** 4329 * @param crc register containing existing CRC (32-bit) 4330 * @param buf register pointing to input byte buffer (byte*) 4331 * @param len register containing number of bytes 4332 * @param table register pointing to CRC table 4333 * 4334 * uses R9..R12 as work register. Must be saved/restored by caller! 4335 */ 4336 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table, 4337 Register t0, Register t1, Register t2, Register t3, 4338 Register tc0, Register tc1, Register tc2, Register tc3, 4339 bool invertCRC) { 4340 assert_different_registers(crc, buf, len, table); 4341 4342 Label L_mainLoop, L_tail; 4343 Register tmp = t0; 4344 Register data = t0; 4345 Register tmp2 = t1; 4346 const int mainLoop_stepping = 4; 4347 const int tailLoop_stepping = 1; 4348 const int log_stepping = exact_log2(mainLoop_stepping); 4349 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32; 4350 const int complexThreshold = 2*mainLoop_stepping; 4351 4352 // Don't test for len <= 0 here. This pathological case should not occur anyway. 4353 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles 4354 // for all well-behaved cases. The situation itself is detected and handled correctly 4355 // within update_byteLoop_crc32. 4356 assert(tailLoop_stepping == 1, "check tailLoop_stepping!"); 4357 4358 BLOCK_COMMENT("kernel_crc32_1word {"); 4359 4360 if (invertCRC) { 4361 nand(crc, crc, crc); // 1s complement of crc 4362 } 4363 4364 // Check for short (<mainLoop_stepping) buffer. 4365 cmpdi(CCR0, len, complexThreshold); 4366 blt(CCR0, L_tail); 4367 4368 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance. 4369 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions. 4370 { 4371 // Align buf addr to mainLoop_stepping boundary. 4372 neg(tmp2, buf); // Calculate # preLoop iterations for alignment. 4373 rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63. 4374 4375 if (complexThreshold > mainLoop_stepping) { 4376 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 4377 } else { 4378 sub(tmp, len, tmp2); // Remaining bytes for main loop. 4379 cmpdi(CCR0, tmp, mainLoop_stepping); 4380 blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing 4381 mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 4382 } 4383 update_byteLoop_crc32(crc, buf, tmp2, table, data, false); 4384 } 4385 4386 srdi(tmp2, len, log_stepping); // #iterations for mainLoop 4387 andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop 4388 mtctr(tmp2); 4389 4390 #ifdef VM_LITTLE_ENDIAN 4391 Register crc_rv = crc; 4392 #else 4393 Register crc_rv = tmp; // Load_reverse needs separate registers to work on. 4394 // Occupies tmp, but frees up crc. 4395 load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data. 4396 tmp = crc; 4397 #endif 4398 4399 int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3); 4400 4401 align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement. 4402 BIND(L_mainLoop); 4403 update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3); 4404 bdnz(L_mainLoop); 4405 4406 #ifndef VM_LITTLE_ENDIAN 4407 load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data. 4408 tmp = crc_rv; // Tmp uses it's original register again. 4409 #endif 4410 4411 // Restore original table address for tailLoop. 4412 if (reconstructTableOffset != 0) { 4413 addi(table, table, -reconstructTableOffset); 4414 } 4415 4416 // Process last few (<complexThreshold) bytes of buffer. 4417 BIND(L_tail); 4418 update_byteLoop_crc32(crc, buf, len, table, data, false); 4419 4420 if (invertCRC) { 4421 nand(crc, crc, crc); // 1s complement of crc 4422 } 4423 BLOCK_COMMENT("} kernel_crc32_1word"); 4424 } 4425 4426 /** 4427 * @param crc register containing existing CRC (32-bit) 4428 * @param buf register pointing to input byte buffer (byte*) 4429 * @param len register containing number of bytes 4430 * @param table register pointing to CRC table 4431 * 4432 * Uses R7_ARG5, R8_ARG6 as work registers. 4433 */ 4434 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table, 4435 Register t0, Register t1, Register t2, Register t3, 4436 bool invertCRC) { 4437 assert_different_registers(crc, buf, len, table); 4438 4439 Register data = t0; // Holds the current byte to be folded into crc. 4440 4441 BLOCK_COMMENT("kernel_crc32_1byte {"); 4442 4443 if (invertCRC) { 4444 nand(crc, crc, crc); // 1s complement of crc 4445 } 4446 4447 // Process all bytes in a single-byte loop. 4448 update_byteLoop_crc32(crc, buf, len, table, data, true); 4449 4450 if (invertCRC) { 4451 nand(crc, crc, crc); // 1s complement of crc 4452 } 4453 BLOCK_COMMENT("} kernel_crc32_1byte"); 4454 } 4455 4456 /** 4457 * @param crc register containing existing CRC (32-bit) 4458 * @param buf register pointing to input byte buffer (byte*) 4459 * @param len register containing number of bytes 4460 * @param table register pointing to CRC table 4461 * @param constants register pointing to CRC table for 128-bit aligned memory 4462 * @param barretConstants register pointing to table for barrett reduction 4463 * @param t0-t4 temp registers 4464 */ 4465 void MacroAssembler::kernel_crc32_1word_vpmsum(Register crc, Register buf, Register len, Register table, 4466 Register constants, Register barretConstants, 4467 Register t0, Register t1, Register t2, Register t3, Register t4, 4468 bool invertCRC) { 4469 assert_different_registers(crc, buf, len, table); 4470 4471 Label L_alignedHead, L_tail; 4472 4473 BLOCK_COMMENT("kernel_crc32_1word_vpmsum {"); 4474 4475 // 1. ~c 4476 if (invertCRC) { 4477 nand(crc, crc, crc); // 1s complement of crc 4478 } 4479 4480 // 2. use kernel_crc32_1word for short len 4481 clrldi(len, len, 32); 4482 cmpdi(CCR0, len, 512); 4483 blt(CCR0, L_tail); 4484 4485 // 3. calculate from 0 to first aligned address 4486 const int alignment = 16; 4487 Register prealign = t0; 4488 4489 andi_(prealign, buf, alignment - 1); 4490 beq(CCR0, L_alignedHead); 4491 subfic(prealign, prealign, alignment); 4492 4493 subf(len, prealign, len); 4494 update_byteLoop_crc32(crc, buf, prealign, table, t2, false); 4495 4496 // 4. calculate from first aligned address as far as possible 4497 BIND(L_alignedHead); 4498 kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t0, t1, t2, t3, t4); 4499 4500 // 5. remaining bytes 4501 BIND(L_tail); 4502 Register tc0 = t4; 4503 Register tc1 = constants; 4504 Register tc2 = barretConstants; 4505 kernel_crc32_1word(crc, buf, len, table, t0, t1, t2, t3, tc0, tc1, tc2, table, false); 4506 4507 // 6. ~c 4508 if (invertCRC) { 4509 nand(crc, crc, crc); // 1s complement of crc 4510 } 4511 4512 BLOCK_COMMENT("} kernel_crc32_1word_vpmsum"); 4513 } 4514 4515 /** 4516 * @param crc register containing existing CRC (32-bit) 4517 * @param buf register pointing to input byte buffer (byte*) 4518 * @param len register containing number of bytes (will get updated to remaining bytes) 4519 * @param constants register pointing to CRC table for 128-bit aligned memory 4520 * @param barretConstants register pointing to table for barrett reduction 4521 * @param t0-t4 temp registers 4522 * Precondition: len should be >= 512. Otherwise, nothing will be done. 4523 */ 4524 void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len, 4525 Register constants, Register barretConstants, 4526 Register t0, Register t1, Register t2, Register t3, Register t4) { 4527 4528 // Save non-volatile vector registers (frameless). 4529 Register offset = t1; 4530 int offsetInt = 0; 4531 offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP); 4532 offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP); 4533 offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP); 4534 offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP); 4535 offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP); 4536 offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP); 4537 #ifndef VM_LITTLE_ENDIAN 4538 offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP); 4539 #endif 4540 offsetInt -= 8; std(R14, offsetInt, R1_SP); 4541 offsetInt -= 8; std(R15, offsetInt, R1_SP); 4542 offsetInt -= 8; std(R16, offsetInt, R1_SP); 4543 offsetInt -= 8; std(R17, offsetInt, R1_SP); 4544 4545 // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor 4546 // bytes per iteration. The basic scheme is: 4547 // lvx: load vector (Big Endian needs reversal) 4548 // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift 4549 // vxor: xor partial results together to get unroll_factor2 vectors 4550 4551 // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors. 4552 4553 // Using 16 * unroll_factor / unroll_factor_2 bytes for constants. 4554 const int unroll_factor = 2048; 4555 const int unroll_factor2 = 8; 4556 4557 // Support registers. 4558 Register offs[] = { noreg, t0, t1, t2, t3, t4, crc /* will live in VCRC */, R14 }; 4559 Register num_bytes = R15, 4560 loop_count = R16, 4561 cur_const = R17; 4562 // Constant array for outer loop: unroll_factor2 - 1 registers, 4563 // Constant array for inner loop: unroll_factor / unroll_factor2 registers. 4564 VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 }, 4565 consts1[] = { VR23, VR24 }; 4566 // Data register arrays: 2 arrays with unroll_factor2 registers. 4567 VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 }, 4568 data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 }; 4569 4570 VectorRegister VCRC = data0[0]; 4571 VectorRegister Vc = VR25; 4572 VectorRegister swap_bytes = VR26; // Only for Big Endian. 4573 4574 // We have at least 1 iteration (ensured by caller). 4575 Label L_outer_loop, L_inner_loop, L_last; 4576 4577 // If supported set DSCR pre-fetch to deepest. 4578 if (VM_Version::has_mfdscr()) { 4579 load_const_optimized(t0, VM_Version::_dscr_val | 7); 4580 mtdscr(t0); 4581 } 4582 4583 mtvrwz(VCRC, crc); // crc lives lives in VCRC, now 4584 4585 for (int i = 1; i < unroll_factor2; ++i) { 4586 li(offs[i], 16 * i); 4587 } 4588 4589 // Load consts for outer loop 4590 lvx(consts0[0], constants); 4591 for (int i = 1; i < unroll_factor2 - 1; ++i) { 4592 lvx(consts0[i], offs[i], constants); 4593 } 4594 addi(constants, constants, (unroll_factor2 - 1) * 16); 4595 4596 load_const_optimized(num_bytes, 16 * unroll_factor); 4597 load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off. 4598 4599 // Reuse data registers outside of the loop. 4600 VectorRegister Vtmp = data1[0]; 4601 VectorRegister Vtmp2 = data1[1]; 4602 VectorRegister zeroes = data1[2]; 4603 4604 vspltisb(Vtmp, 0); 4605 vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC. 4606 4607 // Load vector for vpermxor (to xor both 64 bit parts together) 4608 lvsl(Vtmp, buf); // 000102030405060708090a0b0c0d0e0f 4609 vspltisb(Vc, 4); 4610 vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0 4611 xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0); 4612 vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f 4613 4614 #ifdef VM_LITTLE_ENDIAN 4615 #define BE_swap_bytes(x) 4616 #else 4617 vspltisb(Vtmp2, 0xf); 4618 vxor(swap_bytes, Vtmp, Vtmp2); 4619 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes) 4620 #endif 4621 4622 cmpd(CCR0, len, num_bytes); 4623 blt(CCR0, L_last); 4624 4625 // ********** Main loop start ********** 4626 align(32); 4627 bind(L_outer_loop); 4628 4629 // Begin of unrolled first iteration (no xor). 4630 lvx(data1[0], buf); 4631 mr(cur_const, constants); 4632 for (int i = 1; i < unroll_factor2 / 2; ++i) { 4633 lvx(data1[i], offs[i], buf); 4634 } 4635 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 4636 lvx(consts1[0], cur_const); 4637 mtctr(loop_count); 4638 for (int i = 0; i < unroll_factor2 / 2; ++i) { 4639 BE_swap_bytes(data1[i]); 4640 if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC. 4641 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 4642 vpmsumw(data0[i], data1[i], consts1[0]); 4643 } 4644 addi(buf, buf, 16 * unroll_factor2); 4645 subf(len, num_bytes, len); 4646 lvx(consts1[1], offs[1], cur_const); 4647 addi(cur_const, cur_const, 32); 4648 // Begin of unrolled second iteration (head). 4649 for (int i = 0; i < unroll_factor2 / 2; ++i) { 4650 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 4651 if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); } 4652 vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]); 4653 } 4654 for (int i = 0; i < unroll_factor2 / 2; ++i) { 4655 BE_swap_bytes(data1[i]); 4656 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 4657 vpmsumw(data1[i], data1[i], consts1[1]); 4658 } 4659 addi(buf, buf, 16 * unroll_factor2); 4660 4661 // Generate most performance relevant code. Loads + half of the vpmsumw have been generated. 4662 // Double-iteration allows using the 2 constant registers alternatingly. 4663 align(32); 4664 bind(L_inner_loop); 4665 for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling. 4666 if (j & 1) { 4667 lvx(consts1[0], cur_const); 4668 } else { 4669 lvx(consts1[1], offs[1], cur_const); 4670 addi(cur_const, cur_const, 32); 4671 } 4672 for (int i = 0; i < unroll_factor2; ++i) { 4673 int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input. 4674 if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; } 4675 BE_swap_bytes(data1[idx]); 4676 vxor(data0[i], data0[i], data1[i]); 4677 if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf); 4678 vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]); 4679 } 4680 addi(buf, buf, 16 * unroll_factor2); 4681 } 4682 bdnz(L_inner_loop); 4683 4684 // Tail of last iteration (no loads). 4685 for (int i = 0; i < unroll_factor2 / 2; ++i) { 4686 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 4687 vxor(data0[i], data0[i], data1[i]); 4688 vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]); 4689 } 4690 for (int i = 0; i < unroll_factor2 / 2; ++i) { 4691 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts. 4692 vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]); 4693 } 4694 4695 // Last data register is ok, other ones need fixup shift. 4696 for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) { 4697 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); 4698 } 4699 4700 // Combine to 128 bit result vector VCRC = data0[0]. 4701 for (int i = 1; i < unroll_factor2; i<<=1) { 4702 for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) { 4703 vxor(data0[j], data0[j], data0[j+i]); 4704 } 4705 } 4706 cmpd(CCR0, len, num_bytes); 4707 bge(CCR0, L_outer_loop); 4708 4709 // Last chance with lower num_bytes. 4710 bind(L_last); 4711 srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations. 4712 add_const_optimized(constants, constants, 16 * (unroll_factor / unroll_factor2)); // Point behind last one. 4713 sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used. 4714 clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2)); 4715 subf(constants, R0, constants); // Point to constant to be used first. 4716 4717 addic_(loop_count, loop_count, -1); // One double-iteration peeled off. 4718 bgt(CCR0, L_outer_loop); 4719 // ********** Main loop end ********** 4720 #undef BE_swap_bytes 4721 4722 // Restore DSCR pre-fetch value. 4723 if (VM_Version::has_mfdscr()) { 4724 load_const_optimized(t0, VM_Version::_dscr_val); 4725 mtdscr(t0); 4726 } 4727 4728 vspltisb(zeroes, 0); 4729 4730 // Combine to 64 bit result. 4731 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 4732 4733 // Reduce to 32 bit CRC: Remainder by multiply-high. 4734 lvx(Vtmp, barretConstants); 4735 vsldoi(Vtmp2, zeroes, VCRC, 12); // Extract high 32 bit. 4736 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply by inverse long poly. 4737 vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit. 4738 vsldoi(Vtmp, zeroes, Vtmp, 8); 4739 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply quotient by long poly. 4740 vxor(VCRC, VCRC, Vtmp2); // Remainder fits into 32 bit. 4741 4742 // Move result. len is already updated. 4743 vsldoi(VCRC, VCRC, zeroes, 8); 4744 mfvrd(crc, VCRC); 4745 4746 // Restore non-volatile Vector registers (frameless). 4747 offsetInt = 0; 4748 offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP); 4749 offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP); 4750 offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP); 4751 offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP); 4752 offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP); 4753 offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP); 4754 #ifndef VM_LITTLE_ENDIAN 4755 offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP); 4756 #endif 4757 offsetInt -= 8; ld(R14, offsetInt, R1_SP); 4758 offsetInt -= 8; ld(R15, offsetInt, R1_SP); 4759 offsetInt -= 8; ld(R16, offsetInt, R1_SP); 4760 offsetInt -= 8; ld(R17, offsetInt, R1_SP); 4761 } 4762 4763 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp, bool invertCRC) { 4764 assert_different_registers(crc, buf, /* len, not used!! */ table, tmp); 4765 4766 BLOCK_COMMENT("kernel_crc32_singleByte:"); 4767 if (invertCRC) { 4768 nand(crc, crc, crc); // 1s complement of crc 4769 } 4770 4771 lbz(tmp, 0, buf); // Byte from buffer, zero-extended. 4772 update_byte_crc32(crc, tmp, table); 4773 4774 if (invertCRC) { 4775 nand(crc, crc, crc); // 1s complement of crc 4776 } 4777 } 4778 4779 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) { 4780 assert_different_registers(crc, val, table); 4781 4782 BLOCK_COMMENT("kernel_crc32_singleByteReg:"); 4783 if (invertCRC) { 4784 nand(crc, crc, crc); // 1s complement of crc 4785 } 4786 4787 update_byte_crc32(crc, val, table); 4788 4789 if (invertCRC) { 4790 nand(crc, crc, crc); // 1s complement of crc 4791 } 4792 } 4793 4794 // dest_lo += src1 + src2 4795 // dest_hi += carry1 + carry2 4796 void MacroAssembler::add2_with_carry(Register dest_hi, 4797 Register dest_lo, 4798 Register src1, Register src2) { 4799 li(R0, 0); 4800 addc(dest_lo, dest_lo, src1); 4801 adde(dest_hi, dest_hi, R0); 4802 addc(dest_lo, dest_lo, src2); 4803 adde(dest_hi, dest_hi, R0); 4804 } 4805 4806 // Multiply 64 bit by 64 bit first loop. 4807 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, 4808 Register x_xstart, 4809 Register y, Register y_idx, 4810 Register z, 4811 Register carry, 4812 Register product_high, Register product, 4813 Register idx, Register kdx, 4814 Register tmp) { 4815 // jlong carry, x[], y[], z[]; 4816 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) { 4817 // huge_128 product = y[idx] * x[xstart] + carry; 4818 // z[kdx] = (jlong)product; 4819 // carry = (jlong)(product >>> 64); 4820 // } 4821 // z[xstart] = carry; 4822 4823 Label L_first_loop, L_first_loop_exit; 4824 Label L_one_x, L_one_y, L_multiply; 4825 4826 addic_(xstart, xstart, -1); 4827 blt(CCR0, L_one_x); // Special case: length of x is 1. 4828 4829 // Load next two integers of x. 4830 sldi(tmp, xstart, LogBytesPerInt); 4831 ldx(x_xstart, x, tmp); 4832 #ifdef VM_LITTLE_ENDIAN 4833 rldicl(x_xstart, x_xstart, 32, 0); 4834 #endif 4835 4836 align(32, 16); 4837 bind(L_first_loop); 4838 4839 cmpdi(CCR0, idx, 1); 4840 blt(CCR0, L_first_loop_exit); 4841 addi(idx, idx, -2); 4842 beq(CCR0, L_one_y); 4843 4844 // Load next two integers of y. 4845 sldi(tmp, idx, LogBytesPerInt); 4846 ldx(y_idx, y, tmp); 4847 #ifdef VM_LITTLE_ENDIAN 4848 rldicl(y_idx, y_idx, 32, 0); 4849 #endif 4850 4851 4852 bind(L_multiply); 4853 multiply64(product_high, product, x_xstart, y_idx); 4854 4855 li(tmp, 0); 4856 addc(product, product, carry); // Add carry to result. 4857 adde(product_high, product_high, tmp); // Add carry of the last addition. 4858 addi(kdx, kdx, -2); 4859 4860 // Store result. 4861 #ifdef VM_LITTLE_ENDIAN 4862 rldicl(product, product, 32, 0); 4863 #endif 4864 sldi(tmp, kdx, LogBytesPerInt); 4865 stdx(product, z, tmp); 4866 mr_if_needed(carry, product_high); 4867 b(L_first_loop); 4868 4869 4870 bind(L_one_y); // Load one 32 bit portion of y as (0,value). 4871 4872 lwz(y_idx, 0, y); 4873 b(L_multiply); 4874 4875 4876 bind(L_one_x); // Load one 32 bit portion of x as (0,value). 4877 4878 lwz(x_xstart, 0, x); 4879 b(L_first_loop); 4880 4881 bind(L_first_loop_exit); 4882 } 4883 4884 // Multiply 64 bit by 64 bit and add 128 bit. 4885 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, 4886 Register z, Register yz_idx, 4887 Register idx, Register carry, 4888 Register product_high, Register product, 4889 Register tmp, int offset) { 4890 4891 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry; 4892 // z[kdx] = (jlong)product; 4893 4894 sldi(tmp, idx, LogBytesPerInt); 4895 if (offset) { 4896 addi(tmp, tmp, offset); 4897 } 4898 ldx(yz_idx, y, tmp); 4899 #ifdef VM_LITTLE_ENDIAN 4900 rldicl(yz_idx, yz_idx, 32, 0); 4901 #endif 4902 4903 multiply64(product_high, product, x_xstart, yz_idx); 4904 ldx(yz_idx, z, tmp); 4905 #ifdef VM_LITTLE_ENDIAN 4906 rldicl(yz_idx, yz_idx, 32, 0); 4907 #endif 4908 4909 add2_with_carry(product_high, product, carry, yz_idx); 4910 4911 sldi(tmp, idx, LogBytesPerInt); 4912 if (offset) { 4913 addi(tmp, tmp, offset); 4914 } 4915 #ifdef VM_LITTLE_ENDIAN 4916 rldicl(product, product, 32, 0); 4917 #endif 4918 stdx(product, z, tmp); 4919 } 4920 4921 // Multiply 128 bit by 128 bit. Unrolled inner loop. 4922 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, 4923 Register y, Register z, 4924 Register yz_idx, Register idx, Register carry, 4925 Register product_high, Register product, 4926 Register carry2, Register tmp) { 4927 4928 // jlong carry, x[], y[], z[]; 4929 // int kdx = ystart+1; 4930 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 4931 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry; 4932 // z[kdx+idx+1] = (jlong)product; 4933 // jlong carry2 = (jlong)(product >>> 64); 4934 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2; 4935 // z[kdx+idx] = (jlong)product; 4936 // carry = (jlong)(product >>> 64); 4937 // } 4938 // idx += 2; 4939 // if (idx > 0) { 4940 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry; 4941 // z[kdx+idx] = (jlong)product; 4942 // carry = (jlong)(product >>> 64); 4943 // } 4944 4945 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 4946 const Register jdx = R0; 4947 4948 // Scale the index. 4949 srdi_(jdx, idx, 2); 4950 beq(CCR0, L_third_loop_exit); 4951 mtctr(jdx); 4952 4953 align(32, 16); 4954 bind(L_third_loop); 4955 4956 addi(idx, idx, -4); 4957 4958 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8); 4959 mr_if_needed(carry2, product_high); 4960 4961 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0); 4962 mr_if_needed(carry, product_high); 4963 bdnz(L_third_loop); 4964 4965 bind(L_third_loop_exit); // Handle any left-over operand parts. 4966 4967 andi_(idx, idx, 0x3); 4968 beq(CCR0, L_post_third_loop_done); 4969 4970 Label L_check_1; 4971 4972 addic_(idx, idx, -2); 4973 blt(CCR0, L_check_1); 4974 4975 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0); 4976 mr_if_needed(carry, product_high); 4977 4978 bind(L_check_1); 4979 4980 addi(idx, idx, 0x2); 4981 andi_(idx, idx, 0x1); 4982 addic_(idx, idx, -1); 4983 blt(CCR0, L_post_third_loop_done); 4984 4985 sldi(tmp, idx, LogBytesPerInt); 4986 lwzx(yz_idx, y, tmp); 4987 multiply64(product_high, product, x_xstart, yz_idx); 4988 lwzx(yz_idx, z, tmp); 4989 4990 add2_with_carry(product_high, product, yz_idx, carry); 4991 4992 sldi(tmp, idx, LogBytesPerInt); 4993 stwx(product, z, tmp); 4994 srdi(product, product, 32); 4995 4996 sldi(product_high, product_high, 32); 4997 orr(product, product, product_high); 4998 mr_if_needed(carry, product); 4999 5000 bind(L_post_third_loop_done); 5001 } // multiply_128_x_128_loop 5002 5003 void MacroAssembler::muladd(Register out, Register in, 5004 Register offset, Register len, Register k, 5005 Register tmp1, Register tmp2, Register carry) { 5006 5007 // Labels 5008 Label LOOP, SKIP; 5009 5010 // Make sure length is positive. 5011 cmpdi (CCR0, len, 0); 5012 5013 // Prepare variables 5014 subi (offset, offset, 4); 5015 li (carry, 0); 5016 ble (CCR0, SKIP); 5017 5018 mtctr (len); 5019 subi (len, len, 1 ); 5020 sldi (len, len, 2 ); 5021 5022 // Main loop 5023 bind(LOOP); 5024 lwzx (tmp1, len, in ); 5025 lwzx (tmp2, offset, out ); 5026 mulld (tmp1, tmp1, k ); 5027 add (tmp2, carry, tmp2 ); 5028 add (tmp2, tmp1, tmp2 ); 5029 stwx (tmp2, offset, out ); 5030 srdi (carry, tmp2, 32 ); 5031 subi (offset, offset, 4 ); 5032 subi (len, len, 4 ); 5033 bdnz (LOOP); 5034 bind(SKIP); 5035 } 5036 5037 void MacroAssembler::multiply_to_len(Register x, Register xlen, 5038 Register y, Register ylen, 5039 Register z, Register zlen, 5040 Register tmp1, Register tmp2, 5041 Register tmp3, Register tmp4, 5042 Register tmp5, Register tmp6, 5043 Register tmp7, Register tmp8, 5044 Register tmp9, Register tmp10, 5045 Register tmp11, Register tmp12, 5046 Register tmp13) { 5047 5048 ShortBranchVerifier sbv(this); 5049 5050 assert_different_registers(x, xlen, y, ylen, z, zlen, 5051 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6); 5052 assert_different_registers(x, xlen, y, ylen, z, zlen, 5053 tmp1, tmp2, tmp3, tmp4, tmp5, tmp7); 5054 assert_different_registers(x, xlen, y, ylen, z, zlen, 5055 tmp1, tmp2, tmp3, tmp4, tmp5, tmp8); 5056 5057 const Register idx = tmp1; 5058 const Register kdx = tmp2; 5059 const Register xstart = tmp3; 5060 5061 const Register y_idx = tmp4; 5062 const Register carry = tmp5; 5063 const Register product = tmp6; 5064 const Register product_high = tmp7; 5065 const Register x_xstart = tmp8; 5066 const Register tmp = tmp9; 5067 5068 // First Loop. 5069 // 5070 // final static long LONG_MASK = 0xffffffffL; 5071 // int xstart = xlen - 1; 5072 // int ystart = ylen - 1; 5073 // long carry = 0; 5074 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) { 5075 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry; 5076 // z[kdx] = (int)product; 5077 // carry = product >>> 32; 5078 // } 5079 // z[xstart] = (int)carry; 5080 5081 mr_if_needed(idx, ylen); // idx = ylen 5082 mr_if_needed(kdx, zlen); // kdx = xlen + ylen 5083 li(carry, 0); // carry = 0 5084 5085 Label L_done; 5086 5087 addic_(xstart, xlen, -1); 5088 blt(CCR0, L_done); 5089 5090 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, 5091 carry, product_high, product, idx, kdx, tmp); 5092 5093 Label L_second_loop; 5094 5095 cmpdi(CCR0, kdx, 0); 5096 beq(CCR0, L_second_loop); 5097 5098 Label L_carry; 5099 5100 addic_(kdx, kdx, -1); 5101 beq(CCR0, L_carry); 5102 5103 // Store lower 32 bits of carry. 5104 sldi(tmp, kdx, LogBytesPerInt); 5105 stwx(carry, z, tmp); 5106 srdi(carry, carry, 32); 5107 addi(kdx, kdx, -1); 5108 5109 5110 bind(L_carry); 5111 5112 // Store upper 32 bits of carry. 5113 sldi(tmp, kdx, LogBytesPerInt); 5114 stwx(carry, z, tmp); 5115 5116 // Second and third (nested) loops. 5117 // 5118 // for (int i = xstart-1; i >= 0; i--) { // Second loop 5119 // carry = 0; 5120 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop 5121 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) + 5122 // (z[k] & LONG_MASK) + carry; 5123 // z[k] = (int)product; 5124 // carry = product >>> 32; 5125 // } 5126 // z[i] = (int)carry; 5127 // } 5128 // 5129 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx 5130 5131 bind(L_second_loop); 5132 5133 li(carry, 0); // carry = 0; 5134 5135 addic_(xstart, xstart, -1); // i = xstart-1; 5136 blt(CCR0, L_done); 5137 5138 Register zsave = tmp10; 5139 5140 mr(zsave, z); 5141 5142 5143 Label L_last_x; 5144 5145 sldi(tmp, xstart, LogBytesPerInt); 5146 add(z, z, tmp); // z = z + k - j 5147 addi(z, z, 4); 5148 addic_(xstart, xstart, -1); // i = xstart-1; 5149 blt(CCR0, L_last_x); 5150 5151 sldi(tmp, xstart, LogBytesPerInt); 5152 ldx(x_xstart, x, tmp); 5153 #ifdef VM_LITTLE_ENDIAN 5154 rldicl(x_xstart, x_xstart, 32, 0); 5155 #endif 5156 5157 5158 Label L_third_loop_prologue; 5159 5160 bind(L_third_loop_prologue); 5161 5162 Register xsave = tmp11; 5163 Register xlensave = tmp12; 5164 Register ylensave = tmp13; 5165 5166 mr(xsave, x); 5167 mr(xlensave, xstart); 5168 mr(ylensave, ylen); 5169 5170 5171 multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen, 5172 carry, product_high, product, x, tmp); 5173 5174 mr(z, zsave); 5175 mr(x, xsave); 5176 mr(xlen, xlensave); // This is the decrement of the loop counter! 5177 mr(ylen, ylensave); 5178 5179 addi(tmp3, xlen, 1); 5180 sldi(tmp, tmp3, LogBytesPerInt); 5181 stwx(carry, z, tmp); 5182 addic_(tmp3, tmp3, -1); 5183 blt(CCR0, L_done); 5184 5185 srdi(carry, carry, 32); 5186 sldi(tmp, tmp3, LogBytesPerInt); 5187 stwx(carry, z, tmp); 5188 b(L_second_loop); 5189 5190 // Next infrequent code is moved outside loops. 5191 bind(L_last_x); 5192 5193 lwz(x_xstart, 0, x); 5194 b(L_third_loop_prologue); 5195 5196 bind(L_done); 5197 } // multiply_to_len 5198 5199 void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) { 5200 #ifdef ASSERT 5201 Label ok; 5202 if (check_equal) { 5203 beq(CCR0, ok); 5204 } else { 5205 bne(CCR0, ok); 5206 } 5207 stop(msg, id); 5208 bind(ok); 5209 #endif 5210 } 5211 5212 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset, 5213 Register mem_base, const char* msg, int id) { 5214 #ifdef ASSERT 5215 switch (size) { 5216 case 4: 5217 lwz(R0, mem_offset, mem_base); 5218 cmpwi(CCR0, R0, 0); 5219 break; 5220 case 8: 5221 ld(R0, mem_offset, mem_base); 5222 cmpdi(CCR0, R0, 0); 5223 break; 5224 default: 5225 ShouldNotReachHere(); 5226 } 5227 asm_assert(check_equal, msg, id); 5228 #endif // ASSERT 5229 } 5230 5231 void MacroAssembler::verify_thread() { 5232 if (VerifyThread) { 5233 unimplemented("'VerifyThread' currently not implemented on PPC"); 5234 } 5235 } 5236 5237 // READ: oop. KILL: R0. Volatile floats perhaps. 5238 void MacroAssembler::verify_oop(Register oop, const char* msg) { 5239 if (!VerifyOops) { 5240 return; 5241 } 5242 5243 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); 5244 const Register tmp = R11; // Will be preserved. 5245 const int nbytes_save = MacroAssembler::num_volatile_regs * 8; 5246 save_volatile_gprs(R1_SP, -nbytes_save); // except R0 5247 5248 mr_if_needed(R4_ARG2, oop); 5249 save_LR_CR(tmp); // save in old frame 5250 push_frame_reg_args(nbytes_save, tmp); 5251 // load FunctionDescriptor** / entry_address * 5252 load_const_optimized(tmp, fd, R0); 5253 // load FunctionDescriptor* / entry_address 5254 ld(tmp, 0, tmp); 5255 load_const_optimized(R3_ARG1, (address)msg, R0); 5256 // Call destination for its side effect. 5257 call_c(tmp); 5258 5259 pop_frame(); 5260 restore_LR_CR(tmp); 5261 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 5262 } 5263 5264 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) { 5265 if (!VerifyOops) { 5266 return; 5267 } 5268 5269 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); 5270 const Register tmp = R11; // Will be preserved. 5271 const int nbytes_save = MacroAssembler::num_volatile_regs * 8; 5272 save_volatile_gprs(R1_SP, -nbytes_save); // except R0 5273 5274 ld(R4_ARG2, offs, base); 5275 save_LR_CR(tmp); // save in old frame 5276 push_frame_reg_args(nbytes_save, tmp); 5277 // load FunctionDescriptor** / entry_address * 5278 load_const_optimized(tmp, fd, R0); 5279 // load FunctionDescriptor* / entry_address 5280 ld(tmp, 0, tmp); 5281 load_const_optimized(R3_ARG1, (address)msg, R0); 5282 // Call destination for its side effect. 5283 call_c(tmp); 5284 5285 pop_frame(); 5286 restore_LR_CR(tmp); 5287 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 5288 } 5289 5290 const char* stop_types[] = { 5291 "stop", 5292 "untested", 5293 "unimplemented", 5294 "shouldnotreachhere" 5295 }; 5296 5297 static void stop_on_request(int tp, const char* msg) { 5298 tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg); 5299 guarantee(false, "PPC assembly code requires stop: %s", msg); 5300 } 5301 5302 // Call a C-function that prints output. 5303 void MacroAssembler::stop(int type, const char* msg, int id) { 5304 #ifndef PRODUCT 5305 block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg)); 5306 #else 5307 block_comment("stop {"); 5308 #endif 5309 5310 // setup arguments 5311 load_const_optimized(R3_ARG1, type); 5312 load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0); 5313 call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2); 5314 illtrap(); 5315 emit_int32(id); 5316 block_comment("} stop;"); 5317 } 5318 5319 #ifndef PRODUCT 5320 // Write pattern 0x0101010101010101 in memory region [low-before, high+after]. 5321 // Val, addr are temp registers. 5322 // If low == addr, addr is killed. 5323 // High is preserved. 5324 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) { 5325 if (!ZapMemory) return; 5326 5327 assert_different_registers(low, val); 5328 5329 BLOCK_COMMENT("zap memory region {"); 5330 load_const_optimized(val, 0x0101010101010101); 5331 int size = before + after; 5332 if (low == high && size < 5 && size > 0) { 5333 int offset = -before*BytesPerWord; 5334 for (int i = 0; i < size; ++i) { 5335 std(val, offset, low); 5336 offset += (1*BytesPerWord); 5337 } 5338 } else { 5339 addi(addr, low, -before*BytesPerWord); 5340 assert_different_registers(high, val); 5341 if (after) addi(high, high, after * BytesPerWord); 5342 Label loop; 5343 bind(loop); 5344 std(val, 0, addr); 5345 addi(addr, addr, 8); 5346 cmpd(CCR6, addr, high); 5347 ble(CCR6, loop); 5348 if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value. 5349 } 5350 BLOCK_COMMENT("} zap memory region"); 5351 } 5352 5353 #endif // !PRODUCT 5354 5355 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp, 5356 const bool* flag_addr, Label& label) { 5357 int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true); 5358 assert(sizeof(bool) == 1, "PowerPC ABI"); 5359 masm->lbz(temp, simm16_offset, temp); 5360 masm->cmpwi(CCR0, temp, 0); 5361 masm->beq(CCR0, label); 5362 } 5363 5364 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() { 5365 skip_to_label_if_equal_zero(masm, temp, flag_addr, _label); 5366 } 5367 5368 SkipIfEqualZero::~SkipIfEqualZero() { 5369 _masm->bind(_label); 5370 }