/*
 * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2016 SAP SE. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.inline.hpp" 28 #include "compiler/disassembler.hpp" 29 #include "gc/shared/cardTableModRefBS.hpp" 30 #include "gc/shared/collectedHeap.inline.hpp" 31 #include "interpreter/interpreter.hpp" 32 #include "memory/resourceArea.hpp" 33 #include "nativeInst_ppc.hpp" 34 #include "prims/methodHandles.hpp" 35 #include "runtime/biasedLocking.hpp" 36 #include "runtime/icache.hpp" 37 #include "runtime/interfaceSupport.hpp" 38 #include "runtime/objectMonitor.hpp" 39 #include "runtime/os.hpp" 40 #include "runtime/sharedRuntime.hpp" 41 #include "runtime/stubRoutines.hpp" 42 #include "utilities/macros.hpp" 43 #if INCLUDE_ALL_GCS 44 #include "gc/g1/g1CollectedHeap.inline.hpp" 45 #include "gc/g1/g1SATBCardTableModRefBS.hpp" 46 #include "gc/g1/heapRegion.hpp" 47 #endif // INCLUDE_ALL_GCS 48 49 #ifdef PRODUCT 50 #define BLOCK_COMMENT(str) // nothing 51 #else 52 #define BLOCK_COMMENT(str) block_comment(str) 53 #endif 54 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 55 56 #ifdef ASSERT 57 // On RISC, there's no benefit to verifying instruction boundaries. 
bool AbstractAssembler::pd_check_instruction_mark() { return false; }
#endif

// Load doubleword from a+si31 where si31 is a non-negative 31-bit offset.
// Emits a single ld when si31 fits in 16 bits (optionally followed by a
// filler nop to keep the sequence length fixed), otherwise an addis/ld
// pair through d. d and a may alias; in the two-instruction form d is
// used as the scratch base after the addis.
void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  if (Assembler::is_simm(si31, 16)) {
    ld(d, si31, a);
    if (emit_filler_nop) nop();
  } else {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
    addis(d, a, hi);
    ld(d, lo, d);
  }
}

// Checked variant: d and a must be distinct registers.
void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  assert_different_registers(d, a);
  ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
}

// Load a 1-, 2-, 4- or 8-byte value from base+offs into dst,
// sign- or zero-extending according to is_signed.
void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                      size_t size_in_bytes, bool is_signed) {
  switch (size_in_bytes) {
  case  8:              ld(dst, offs, base);                         break;
  case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  default:  ShouldNotReachHere();
  }
}

// Store the low 1, 2, 4 or 8 bytes of dst to base+offs.
void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                       size_t size_in_bytes) {
  switch (size_in_bytes) {
  case  8:  std(dst, offs, base); break;
  case  4:  stw(dst, offs, base); break;
  case  2:  sth(dst, offs, base); break;
  case  1:  stb(dst, offs, base); break;
  default:  ShouldNotReachHere();
  }
}

// Pad with nops until offset() % modulus == rem, but only if at most
// `max` bytes of padding would be required; otherwise emit nothing.
void MacroAssembler::align(int modulus, int max, int rem) {
  int padding = (rem + modulus - (offset() % modulus)) % modulus;
  if (padding > max) return;
  for (int c = (padding >> 2); c > 0; --c) { nop(); } // one nop per 4 padding bytes
}

// Issue instructions that calculate given TOC from global TOC.
// Emit an addis/addi pair computing dst = global TOC + offset(addr).
// hi16/lo16 select which halves of the sequence to emit; emit_dummy_addr
// emits a placeholder offset (-128) intended to be patched later.
// If add_relocation, an internal_word relocation is placed at the addi
// (not the addis) to avoid confusion with a load from the method's TOC.
void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
                                                       bool add_relocation, bool emit_dummy_addr) {
  int offset = -1;
  if (emit_dummy_addr) {
    offset = -128; // dummy address
  } else if (addr != (address)(intptr_t)-1) {
    offset = MacroAssembler::offset_to_global_toc(addr);
  }

  if (hi16) {
    addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
  }
  if (lo16) {
    if (add_relocation) {
      // Relocate at the addi to avoid confusion with a load from the method's TOC.
      relocate(internal_word_Relocation::spec(addr));
    }
    addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
  }
}

// Patch an emitted addis/addi "calculate address from global TOC" sequence
// so that it computes `addr`. `a` points at the addi; the matching addis is
// searched backwards, never below `bound`. Returns the distance in bytes
// from the addis to addr. Not MT-safe; caller handles icache flushing.
int MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
  const int offset = MacroAssembler::offset_to_global_toc(addr);

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
  set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
  set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
  return (int)((intptr_t)addr - (intptr_t)inst1_addr);
}

// Decode the address computed by an addis/addi "calculate address from
// global TOC" sequence. `a` points at the addi; the matching addis is
// searched backwards, never below `bound`.
address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // stop, found the addis which writes dst
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");

  int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
  // -1 is a special case
  if (offset == -1) {
    return (address)(intptr_t)-1;
  } else {
    return global_toc() + offset;
  }
}

#ifdef _LP64
// Patch compressed oops or klass constants.
// Assembler sequence is
// 1) compressed oops:
//    lis  rx = const.hi
//    ori rx = rx | const.lo
// 2) compressed klass:
//    lis  rx = const.hi
//    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
//    ori rx = rx | const.lo
// Clrldi will be passed by.
// Patch the 32-bit narrow-oop constant of a lis/ori sequence (see the
// variants documented above) to `data`. `a` points at the ori; the matching
// lis is searched backwards, never below `bound`. Returns the byte distance
// between the two patched instructions. Not MT-safe, does not flush icache.
int MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;
  while (inst1_addr >= bound) {
    inst1 = *(int *)inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  int xc = (data >> 16) & 0xffff; // high halfword
  int xd = (data >>  0) & 0xffff; // low halfword

  set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
  set_imm((int *)inst2_addr, (xd)); // unsigned int
  return (int)((intptr_t)inst2_addr - (intptr_t)inst1_addr);
}

// Get compressed oop or klass constant.
narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;

  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
  uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);

  return (int) (xl | xh);
}
#endif // _LP64

// Add constant `a` to this method's constant pool and emit a TOC-relative
// load of it into dst. Returns true if successful, false on constant-pool
// allocation failure.
bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
                                               Register toc, bool fixed_size) {
  int toc_offset = 0;
  // Use RelocationHolder::none for the constant pool entry, otherwise
  // we will end up with a failing NativeCall::verify(x) where x is
  // the address of the constant pool entry.
  // FIXME: We should insert relocation information for oops at the constant
  // pool entries instead of inserting it at the loads; patching of a constant
  // pool entry should be less expensive.
  address const_address = address_constant((address)a.value(), RelocationHolder::none);
  if (const_address == NULL) { return false; } // allocation failure
  // Relocate at the pc of the load.
  relocate(a.rspec());
  toc_offset = (int)(const_address - code()->consts()->start());
  ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
  return true;
}

// Identify the instruction(s) emitted by load_const_from_method_toc /
// ld_largeoffset: either a plain ld or the addis of an addis/ld pair.
bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  // The relocation points to the ld or the addis.
  return (is_ld(inst1)) ||
         (is_addis(inst1) && inv_ra_field(inst1) != 0);
}

// Recover the TOC offset encoded in a load_const_from_method_toc sequence:
// the ld displacement directly, or (addis hi << 16) + ld lo for the pair.
int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
  assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");

  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  if (is_ld(inst1)) {
    return inv_d1_field(inst1);
  } else if (is_addis(inst1)) {
    const int dst = inv_rt_field(inst1);

    // Now, find the succeeding ld which reads and writes to dst.
    address inst2_addr = inst1_addr + BytesPerInstWord;
    int inst2 = 0;
    while (true) {
      inst2 = *(int *) inst2_addr;
      if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
        // Stop, found the ld which reads and writes dst.
        break;
      }
      inst2_addr += BytesPerInstWord;
    }
    return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
  }
  ShouldNotReachHere();
  return 0;
}

// Get the constant from a `load_const' sequence.
// Two encodings are recognized by the second instruction:
// ori-form (lis/ori/sldi/oris/ori) and lis-form; the immediates are
// reassembled into the 64-bit value in the matching halfword order.
long MacroAssembler::get_const(address a) {
  assert(is_load_const_at(a), "not a load of a constant");
  const int *p = (const int*) a;
  unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
  if (is_ori(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
  } else if (is_lis(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
  } else {
    ShouldNotReachHere();
    return (long) 0;
  }
  return (long) x;
}

// Patch the 64 bit constant of a `load_const' sequence. This is a low
// level procedure. It neither flushes the instruction cache nor is it
// mt safe.
// Patch the 64-bit constant of a `load_const' sequence (see get_const for
// the two recognized encodings). Low level: no icache flush, not MT-safe.
void MacroAssembler::patch_const(address a, long x) {
  assert(is_load_const_at(a), "not a load of a constant");
  int *p = (int*) a;
  if (is_ori(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(1 + p, (x >> 32) & 0xffff);
    set_imm(3 + p, (x >> 16) & 0xffff);
    set_imm(4 + p, x & 0xffff);
  } else if (is_lis(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(2 + p, (x >> 32) & 0xffff);
    set_imm(1 + p, (x >> 16) & 0xffff);
    set_imm(3 + p, x & 0xffff);
  } else {
    ShouldNotReachHere();
  }
}

// Allocate a new metadata index in the OopRecorder and return an
// AddressLiteral for obj carrying the matching metadata relocation.
AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->allocate_metadata_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

// As above, but reuse the existing index for obj (find, don't allocate).
AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

// Allocate a new oop index in the OopRecorder and return an
// AddressLiteral for obj carrying the matching oop relocation.
AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->allocate_oop_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

// As above, but reuse the existing oop index for obj.
AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

// Return *delayed_value_addr + offset as a constant if it is already known,
// otherwise emit code that loads it indirectly at runtime into tmp.
RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp, int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0) {
    return RegisterOrConstant(value + offset);
  }

  // Load indirectly to solve generation ordering problem.
  // static address, no relocation
  int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
  ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)

  if (offset != 0) {
    addi(tmp, tmp, offset);
  }

  return RegisterOrConstant(tmp);
}

#ifndef PRODUCT
void MacroAssembler::pd_print_patched_instruction(address branch) {
  Unimplemented(); // TODO: PPC port
}
#endif // ndef PRODUCT

// Conditional far branch for destinations encodable in 24+2 bits.
// Emits the two-instruction variant 2 below: an inverted conditional branch
// around an unconditional branch to dest.
void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {

  // If requested by flag optimize, relocate the bc_far as a
  // runtime_call and prepare for optimizing it when the code gets
  // relocated.
  if (optimize == bc_far_optimize_on_relocate) {
    relocate(relocInfo::runtime_call_type);
  }

  // variant 2:
  //
  //    b!cxx SKIP
  //    bxx   DEST
  //  SKIP:
  //

  const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                opposite_bcond(inv_boint_bcond(boint)));

  // We emit two branches.
  // First, a conditional branch which jumps around the far branch.
  const address not_taken_pc = pc() + 2 * BytesPerInstWord;
  const address bc_pc        = pc();
  bc(opposite_boint, biint, not_taken_pc);

  const int bc_instr = *(int*)bc_pc;
  assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
  assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
  assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
                                     opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
         "postcondition");
  assert(biint == inv_bi_field(bc_instr), "postcondition");

  // Second, an unconditional far branch which jumps to dest.
  // Note: target(dest) remembers the current pc (see CodeSection::target)
  //       and returns the current pc if the label is not bound yet; when
  //       the label gets bound, the unconditional far branch will be patched.
  const address target_pc = target(dest);
  const address b_pc = pc();
  b(target_pc);

  assert(not_taken_pc == pc(), "postcondition");
  assert(dest.is_bound() || target_pc == b_pc, "postcondition");
}

// 1 or 2 instructions: a short bc if dest is bound and in range, else bc_far.
void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
  if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
    bc(boint, biint, dest);
  } else {
    bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
  }
}

// Identify any of the three bc_far variants (see set_dest_of_bc_far_at).
bool MacroAssembler::is_bc_far_at(address instruction_addr) {
  return is_bc_far_variant1_at(instruction_addr) ||
         is_bc_far_variant2_at(instruction_addr) ||
         is_bc_far_variant3_at(instruction_addr);
}

// Decode the destination of a bc_far sequence at instruction_addr.
address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
  if (is_bc_far_variant1_at(instruction_addr)) {
    const address instruction_1_addr = instruction_addr;
    const int instruction_1 = *(int*)instruction_1_addr;
    return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
  } else if (is_bc_far_variant2_at(instruction_addr)) {
    const address instruction_2_addr = instruction_addr + 4; // the bxx is the 2nd instruction
    return bxx_destination(instruction_2_addr);
  } else if (is_bc_far_variant3_at(instruction_addr)) {
    // variant 3 is two nops falling through; dest is the next instruction
    return instruction_addr + 8;
  }
  // variant 4 ???
  ShouldNotReachHere();
  return NULL;
}

// Re-point an existing bc_far sequence at a new destination, choosing the
// most compact of the three variants and flushing the icache afterwards.
void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {

  if (is_bc_far_variant3_at(instruction_addr)) {
    // variant 3, far cond branch to the next instruction, already patched to nops:
    //
    //    nop
    //    endgroup
    //  SKIP/DEST:
    //
    return;
  }

  // first, extract boint and biint from the current branch
  int boint = 0;
  int biint = 0;

  ResourceMark rm;
  const int code_size = 2 * BytesPerInstWord;
  CodeBuffer buf(instruction_addr, code_size); // patch in place
  MacroAssembler masm(&buf);
  if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
    // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
    masm.nop();
    masm.endgroup();
  } else {
    if (is_bc_far_variant1_at(instruction_addr)) {
      // variant 1, the 1st instruction contains the destination address:
      //
      //    bcxx  DEST
      //    nop
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = inv_bo_field(instruction_1);
      biint = inv_bi_field(instruction_1);
    } else if (is_bc_far_variant2_at(instruction_addr)) {
      // variant 2, the 2nd instruction contains the destination address:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
          opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
      biint = inv_bi_field(instruction_1);
    } else {
      // variant 4???
      ShouldNotReachHere();
    }

    // second, set the new branch destination and optimize the code
    if (dest != instruction_addr + 4 && // the bc_far is still unbound!
        masm.is_within_range_of_bcxx(dest, instruction_addr)) {
      // variant 1:
      //
      //    bcxx  DEST
      //    nop
      //
      masm.bc(boint, biint, dest);
      masm.nop();
    } else {
      // variant 2:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                    opposite_bcond(inv_boint_bcond(boint)));
      const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
      masm.bc(opposite_boint, biint, not_taken_pc);
      masm.b(dest);
    }
  }
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Emit a NOT mt-safe patchable 64 bit absolute call/jump.
// Always 7 instructions long: either a pc-relative b/bl padded with nops
// (variant 2) or a TOC-relative address computation feeding bctr[l]
// (variant 1b). The fixed size keeps the site patchable later.
void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
  // get current pc
  uint64_t start_pc = (uint64_t) pc();

  const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
  const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first

  // relocate here
  if (rt != relocInfo::none) {
    relocate(rt);
  }

  if ( ReoptimizeCallSequences &&
       (( link && is_within_range_of_b(dest, pc_of_bl)) ||
        (!link && is_within_range_of_b(dest, pc_of_b)))) {
    // variant 2:
    // Emit an optimized, pc-relative call/jump.

    if (link) {
      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();

      // do the call
      assert(pc() == pc_of_bl, "just checking");
      bl(dest, relocInfo::none);
    } else {
      // do the jump
      assert(pc() == pc_of_b, "just checking");
      b(dest, relocInfo::none);

      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();
    }

    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
           "can't identify emitted call");
  } else {
    // variant 1:
    mr(R0, R11);  // spill R11 -> R0.

    // Load the destination address into CTR,
    // calculate destination relative to global toc.
    calculate_address_from_global_toc(R11, dest, true, true, false);

    mtctr(R11);
    mr(R11, R0);  // spill R11 <- R0.
    nop();

    // do the call/jump
    if (link) {
      bctrl();
    } else{
      bctr();
    }
    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
           "can't identify emitted call");
  }

  // Assert that we can identify the emitted call/jump.
  assert(is_bxx64_patchable_at((address)start_pc, link),
         "can't identify emitted call");
  assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
         "wrong encoding of dest address");
}

// Identify a bxx64_patchable instruction.
bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
  return is_bxx64_patchable_variant1b_at(instruction_addr, link)
    //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
      || is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Does the call64_patchable instruction use a pc-relative encoding of
// the call destination?
bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
  // variant 2 is pc-relative
  return is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Identify variant 1.
bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[5]) // mtctr
      && is_load_const_at(instruction_addr);
}

// Identify variant 1b: load destination relative to global toc.
bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
         && is_mtctr(instr[3]) // mtctr
         && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
}

// Identify variant 2: a single pc-relative b/bl padded to 7 instructions
// with nops (bl last when linking, b first otherwise).
bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  if (link) {
    return is_bl (instr[6])  // bl dest is last
        && is_nop(instr[0])  // nop
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5]); // nop
  } else {
    return is_b  (instr[0])  // b dest is first
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5])  // nop
        && is_nop(instr[6]); // nop
  }
}

// Set dest address of a bxx64_patchable instruction.
// Re-emits the whole fixed-size sequence in place and flushes the icache.
void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
  ResourceMark rm;
  int code_size = MacroAssembler::bxx64_patchable_size;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  masm.bxx64_patchable(dest, relocInfo::none, link);
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Get dest address of a bxx64_patchable instruction.
address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
  if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
    // variant 1: absolute address loaded by a load_const sequence
    return (address) (unsigned long) get_const(instruction_addr);
  } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
    // variant 2: pc-relative b/bl; decode the branch displacement
    unsigned int* instr = (unsigned int*) instruction_addr;
    if (link) {
      const int instr_idx = 6; // bl is last
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    } else {
      const int instr_idx = 0; // b is first
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    }
  // Load dest relative to global toc.
  } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
    return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
                                                               instruction_addr);
  } else {
    ShouldNotReachHere();
    return NULL;
  }
}

// Save all non-volatile GPRs (R14..R31) and FPRs (F14..F31) at dst+offset.
// Uses ordering which corresponds to ABI:
//    _savegpr0_14:  std  r14,-144(r1)
//    _savegpr0_15:  std  r15,-136(r1)
//    _savegpr0_16:  std  r16,-128(r1)
void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
  std(R14, offset, dst);   offset += 8;
  std(R15, offset, dst);   offset += 8;
  std(R16, offset, dst);   offset += 8;
  std(R17, offset, dst);   offset += 8;
  std(R18, offset, dst);   offset += 8;
  std(R19, offset, dst);   offset += 8;
  std(R20, offset, dst);   offset += 8;
  std(R21, offset, dst);   offset += 8;
  std(R22, offset, dst);   offset += 8;
  std(R23, offset, dst);   offset += 8;
  std(R24, offset, dst);   offset += 8;
  std(R25, offset, dst);   offset += 8;
  std(R26, offset, dst);   offset += 8;
  std(R27, offset, dst);   offset += 8;
  std(R28, offset, dst);   offset += 8;
  std(R29, offset, dst);   offset += 8;
  std(R30, offset, dst);   offset += 8;
  std(R31, offset, dst);   offset += 8;

  stfd(F14, offset, dst);   offset += 8;
  stfd(F15, offset, dst);   offset += 8;
  stfd(F16, offset, dst);   offset += 8;
  stfd(F17, offset, dst);   offset += 8;
  stfd(F18, offset, dst);   offset += 8;
  stfd(F19, offset, dst);   offset += 8;
  stfd(F20, offset, dst);   offset += 8;
  stfd(F21, offset, dst);   offset += 8;
  stfd(F22, offset, dst);   offset += 8;
  stfd(F23, offset, dst);   offset += 8;
  stfd(F24, offset, dst);   offset += 8;
  stfd(F25, offset, dst);   offset += 8;
  stfd(F26, offset, dst);   offset += 8;
  stfd(F27, offset, dst);   offset += 8;
  stfd(F28, offset, dst);   offset += 8;
  stfd(F29, offset, dst);   offset += 8;
  stfd(F30, offset, dst);   offset += 8;
  stfd(F31, offset, dst);
}

// Restore all non-volatile GPRs (R14..R31) and FPRs (F14..F31) from src+offset.
// Uses ordering which corresponds to ABI:
//    _restgpr0_14:  ld   r14,-144(r1)
//    _restgpr0_15:  ld   r15,-136(r1)
//    _restgpr0_16:  ld   r16,-128(r1)
void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
  ld(R14, offset, src);   offset += 8;
  ld(R15, offset, src);   offset += 8;
  ld(R16, offset, src);   offset += 8;
  ld(R17, offset, src);   offset += 8;
  ld(R18, offset, src);   offset += 8;
  ld(R19, offset, src);   offset += 8;
  ld(R20, offset, src);   offset += 8;
  ld(R21, offset, src);   offset += 8;
  ld(R22, offset, src);   offset += 8;
  ld(R23, offset, src);   offset += 8;
  ld(R24, offset, src);   offset += 8;
  ld(R25, offset, src);   offset += 8;
  ld(R26, offset, src);   offset += 8;
  ld(R27, offset, src);   offset += 8;
  ld(R28, offset, src);   offset += 8;
  ld(R29, offset, src);   offset += 8;
  ld(R30, offset, src);   offset += 8;
  ld(R31, offset, src);   offset += 8;

  // FP registers
  lfd(F14, offset, src);   offset += 8;
  lfd(F15, offset, src);   offset += 8;
  lfd(F16, offset, src);   offset += 8;
  lfd(F17, offset, src);   offset += 8;
  lfd(F18, offset, src);   offset += 8;
  lfd(F19, offset, src);   offset += 8;
  lfd(F20, offset, src);   offset += 8;
  lfd(F21, offset, src);   offset += 8;
  lfd(F22, offset, src);   offset += 8;
  lfd(F23, offset, src);   offset += 8;
  lfd(F24, offset, src);   offset += 8;
  lfd(F25, offset, src);   offset += 8;
  lfd(F26, offset, src);   offset += 8;
  lfd(F27, offset, src);   offset += 8;
  lfd(F28, offset, src);   offset += 8;
  lfd(F29, offset, src);   offset += 8;
  lfd(F30, offset, src);   offset += 8;
  lfd(F31, offset, src);
}

// For verify_oops.
// Save volatile GPRs R2..R12 and FPRs F0..F13 at dst+offset.
void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
  std(R2,  offset, dst);   offset += 8;
  std(R3,  offset, dst);   offset += 8;
  std(R4,  offset, dst);   offset += 8;
  std(R5,  offset, dst);   offset += 8;
  std(R6,  offset, dst);   offset += 8;
  std(R7,  offset, dst);   offset += 8;
  std(R8,  offset, dst);   offset += 8;
  std(R9,  offset, dst);   offset += 8;
  std(R10, offset, dst);   offset += 8;
  std(R11, offset, dst);   offset += 8;
  std(R12, offset, dst);   offset += 8;

  stfd(F0, offset, dst);   offset += 8;
  stfd(F1, offset, dst);   offset += 8;
  stfd(F2, offset, dst);   offset += 8;
  stfd(F3, offset, dst);   offset += 8;
  stfd(F4, offset, dst);   offset += 8;
  stfd(F5, offset, dst);   offset += 8;
  stfd(F6, offset, dst);   offset += 8;
  stfd(F7, offset, dst);   offset += 8;
  stfd(F8, offset, dst);   offset += 8;
  stfd(F9, offset, dst);   offset += 8;
  stfd(F10, offset, dst);  offset += 8;
  stfd(F11, offset, dst);  offset += 8;
  stfd(F12, offset, dst);  offset += 8;
  stfd(F13, offset, dst);
}

// For verify_oops.
// Reload the C-ABI volatile registers (GPRs R2-R12, FPRs F0-F13) from the
// consecutive 8-byte slots starting at 'offset' in the area addressed by
// 'src'. Exact inverse of save_volatile_gprs() above; same slot layout.
void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
  ld(R2, offset, src);   offset += 8;
  ld(R3, offset, src);   offset += 8;
  ld(R4, offset, src);   offset += 8;
  ld(R5, offset, src);   offset += 8;
  ld(R6, offset, src);   offset += 8;
  ld(R7, offset, src);   offset += 8;
  ld(R8, offset, src);   offset += 8;
  ld(R9, offset, src);   offset += 8;
  ld(R10, offset, src);  offset += 8;
  ld(R11, offset, src);  offset += 8;
  ld(R12, offset, src);  offset += 8;

  lfd(F0, offset, src);  offset += 8;
  lfd(F1, offset, src);  offset += 8;
  lfd(F2, offset, src);  offset += 8;
  lfd(F3, offset, src);  offset += 8;
  lfd(F4, offset, src);  offset += 8;
  lfd(F5, offset, src);  offset += 8;
  lfd(F6, offset, src);  offset += 8;
  lfd(F7, offset, src);  offset += 8;
  lfd(F8, offset, src);  offset += 8;
  lfd(F9, offset, src);  offset += 8;
  lfd(F10, offset, src); offset += 8;
  lfd(F11, offset, src); offset += 8;
  lfd(F12, offset, src); offset += 8;
  lfd(F13, offset, src);
}

// Save the condition register and link register into their ABI slots of
// the current frame (addressed by R1_SP). 'tmp' is clobbered.
void MacroAssembler::save_LR_CR(Register tmp) {
  mfcr(tmp);
  std(tmp, _abi(cr), R1_SP);
  mflr(tmp);
  std(tmp, _abi(lr), R1_SP);
  // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
}

// Restore CR and LR from their ABI slots of the current frame.
// 'tmp' is clobbered; it must not be the stack pointer itself.
void MacroAssembler::restore_LR_CR(Register tmp) {
  assert(tmp != R1_SP, "must be distinct");
  ld(tmp, _abi(lr), R1_SP);
  mtlr(tmp);
  ld(tmp, _abi(cr), R1_SP);
  mtcr(tmp);
}

// Materialize the current program counter in 'result' using a bl to the
// next instruction followed by mflr. Clobbers LR (hence the name); returns
// the address that ends up in LR.
address MacroAssembler::get_PC_trash_LR(Register result) {
  Label L;
  bl(L);
  bind(L);
  address lr_pc = pc();
  mflr(result);
  return lr_pc;
}

// Resize the current frame by 'offset' bytes (register form), keeping the
// back link (caller's SP) intact. Offset must be frame-aligned.
void MacroAssembler::resize_frame(Register offset, Register tmp) {
#ifdef ASSERT
  assert_different_registers(offset, tmp, R1_SP);
  andi_(tmp, offset, frame::alignment_in_bytes-1);
  asm_assert_eq("resize_frame: unaligned", 0x204);
#endif

  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdux(tmp, R1_SP, offset);
}

// Resize the current frame by an immediate 'offset' (must fit in a signed
// 16-bit displacement), keeping the back link intact.
void MacroAssembler::resize_frame(int offset, Register tmp) {
  assert(is_simm(offset, 16), "too big an offset");
  assert_different_registers(tmp, R1_SP);
  assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdu(tmp, offset, R1_SP);
}

// Set the stack pointer to the absolute address in 'addr', preserving the
// back link. Both temps are clobbered.
void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
  // (addr == tmp1) || (addr == tmp2) is allowed here!
  assert(tmp1 != tmp2, "must be distinct");

  // compute offset w.r.t. current stack pointer
  // tmp_1 <- addr - SP (!)
  subf(tmp1, R1_SP, addr);

  // atomically update SP keeping back link.
  resize_frame(tmp1/* offset */, tmp2/* tmp */);
}

// Push a frame whose size in bytes is given in register 'bytes' (must be
// frame-aligned, must not be R0). The old SP is stored at the new top of
// stack (back link) by stdux.
void MacroAssembler::push_frame(Register bytes, Register tmp) {
#ifdef ASSERT
  assert(bytes != R0, "r0 not allowed here");
  andi_(R0, bytes, frame::alignment_in_bytes-1);
  asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
#endif
  neg(tmp, bytes);
  stdux(R1_SP, R1_SP, tmp);
}

// Push a frame of size `bytes'. The size is rounded up to frame alignment;
// 'tmp' is only clobbered when the (negated) size does not fit in a signed
// 16-bit displacement.
void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
  long offset = align_addr(bytes, frame::alignment_in_bytes);
  if (is_simm(-offset, 16)) {
    stdu(R1_SP, -offset, R1_SP);
  } else {
    load_const_optimized(tmp, -offset);
    stdux(R1_SP, R1_SP, tmp);
  }
}

// Push a frame of size `bytes' plus abi_reg_args on top.
void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size, tmp);
}

// Set up a new C frame with a spill area for non-volatile GPRs and
// additional space for local variables.
void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
                                                      Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
}

// Pop current C frame: restore SP from the back link slot.
void MacroAssembler::pop_frame() {
  ld(R1_SP, _abi(callers_sp), R1_SP);
}

#if defined(ABI_ELFv2)
// ELFv2: branch (or call, if and_link) to the code at 'r_function_entry'.
// The entry address is moved to R12 first, as the ELFv2 global entry point
// convention expects the target address in R12.
// Updates and returns _last_calls_return_pc.
address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the times.
  if (R12 != r_function_entry) {
    mr(R12, r_function_entry);
  }
  mtctr(R12);
  // Do a call or a branch.
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C
// calling conventions. Updates and returns _last_calls_return_pc.
address MacroAssembler::call_c(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/true);
}

// For tail calls: only branch, don't link, so callee returns to caller of this function.
address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/false);
}

// Call a C function at an immediate address; the address is materialized
// in R12 (ELFv2 entry convention), using R0 as scratch.
address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
  load_const(R12, function_entry, R0);
  return branch_to(R12, /*and_link=*/true);
}

#else
// Generic version of a call to C function via a function descriptor
// with variable support for C calling conventions (TOC, ENV, etc.).
// Updates and returns _last_calls_return_pc.
address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
                                  bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
  // we emit standard ptrgl glue code here
  assert((function_descriptor != R0), "function_descriptor cannot be R0");

  // retrieve necessary entries from the function descriptor
  ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
  mtctr(R0);

  if (load_toc_of_callee) {
    ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
  }
  if (load_env_of_callee) {
    ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
  } else if (load_toc_of_callee) {
    // Descriptor has no environment to pass: clear R11 instead.
    li(R11, 0);
  }

  // do a call or a branch
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C calling
// conventions.
// We don't use the TOC in generated code, so there is no need to save
// and restore its value.
address MacroAssembler::call_c(Register fd) {
  return branch_to(fd, /*and_link=*/true,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

// Tail-call variant: branch without linking, so the callee returns to the
// caller of the current function.
address MacroAssembler::call_c_and_return_to_caller(Register fd) {
  return branch_to(fd, /*and_link=*/false,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

// Call a C function through a function descriptor known at compile time.
// For relocatable call sites, friend functions get an optimizable
// bl64_patchable padded with nops to keep a fixed code size; everything
// else goes through the generic descriptor-based branch_to().
address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
  if (rt != relocInfo::none) {
    // this call needs to be relocatable
    if (!ReoptimizeCallSequences
        || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
        || fd == NULL   // support code-size estimation
        || !fd->is_friend_function()
        || fd->entry() == NULL) {
      // it's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);

      bool has_env = (fd != NULL && fd->env() != NULL);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/has_env);
    } else {
      // It's a friend function. Load the entry point and don't care about
      // toc and env. Use an optimizable call instruction, but ensure the
      // same code-size as in the case of a non-friend function.
      nop();
      nop();
      nop();
      bl64_patchable(fd->entry(), rt);
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  } else {
    // This call does not need to be relocatable, do more aggressive
    // optimizations.
    if (!ReoptimizeCallSequences
      || !fd->is_friend_function()) {
      // It's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/true);
    } else {
      // it's a friend function, load the entry point and don't care about
      // toc and env.
      address dest = fd->entry();
      // Use a short relative bl when the target is in range, otherwise the
      // patchable 64-bit call sequence.
      if (is_within_range_of_b(dest, pc())) {
        bl(dest);
      } else {
        bl64_patchable(dest, rt);
      }
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  }
}

// Call a C function. All constants needed reside in TOC.
//
// Read the address to call from the TOC.
// Read env from TOC, if fd specifies an env.
// Read new TOC from TOC.
address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
                                         relocInfo::relocType rt, Register toc) {
  if (!ReoptimizeCallSequences
    || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
    || !fd->is_friend_function()) {
    // It's not a friend function as defined by class FunctionDescriptor,
    // so do a full call-c here.
    assert(fd->entry() != NULL, "function must be linked");

    AddressLiteral fd_entry(fd->entry());
    bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
    mtctr(R11);
    if (fd->env() == NULL) {
      // No env: clear R11; the nop keeps the code size fixed versus the
      // env-loading branch below.
      li(R11, 0);
      nop();
    } else {
      AddressLiteral fd_env(fd->env());
      success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
    }
    AddressLiteral fd_toc(fd->toc());
    // Set R2_TOC (load from toc)
    success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
    bctrl();
    _last_calls_return_pc = pc();
    // NULL signals that a TOC constant could not be allocated.
    if (!success) { return NULL; }
  } else {
    // It's a friend function, load the entry point and don't care about
    // toc and env. Use an optimizable call instruction, but ensure the
    // same code-size as in the case of a non-friend function.
    nop();
    bl64_patchable(fd->entry(), rt);
    _last_calls_return_pc = pc();
  }
  return _last_calls_return_pc;
}
#endif // ABI_ELFv2

// Common helper for the call_VM() family: set up the last Java frame,
// pass the current thread in ARG1, call 'entry_point' in the VM, and
// fetch an oop result if requested. Exception checking is not supported
// on this path (see below).
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register last_java_sp,
                                  address entry_point,
                                  bool check_exceptions) {
  BLOCK_COMMENT("call_VM {");
  // Determine last_java_sp register.
  if (!last_java_sp->is_valid()) {
    last_java_sp = R1_SP;
  }
  set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);

  // ARG1 must hold thread address.
  mr(R3_ARG1, R16_thread);
#if defined(ABI_ELFv2)
  address return_pc = call_c(entry_point, relocInfo::none);
#else
  address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
#endif

  reset_last_Java_frame();

  // Check for pending exceptions.
  if (check_exceptions) {
    // We don't check for exceptions here.
    ShouldNotReachHere();
  }

  // Get oop result if there is one and reset the value in the thread.
  if (oop_result->is_valid()) {
    get_vm_result(oop_result);
  }

  _last_calls_return_pc = return_pc;
  BLOCK_COMMENT("} call_VM");
}

// Call a leaf VM runtime function (no Java frame bookkeeping, no
// exception checking).
void MacroAssembler::call_VM_leaf_base(address entry_point) {
  BLOCK_COMMENT("call_VM_leaf {");
#if defined(ABI_ELFv2)
  call_c(entry_point, relocInfo::none);
#else
  call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
#endif
  BLOCK_COMMENT("} call_VM_leaf");
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
  call_VM_base(oop_result, noreg, entry_point, check_exceptions);
}

// One-argument variant. The argument is moved into ARG2 because ARG1 is
// always the current thread (see call_VM_base()).
void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread.
  mr_if_needed(R4_ARG2, arg_1);
  call_VM(oop_result, entry_point, check_exceptions);
}

// Two-argument variant; asserts guard against a later argument having been
// clobbered by shuffling an earlier one into place.
void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  call_VM(oop_result, entry_point, check_exceptions);
}

// Three-argument variant.
void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  mr_if_needed(R6_ARG4, arg_3);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM_leaf(address entry_point) {
  call_VM_leaf_base(entry_point);
}

// Leaf calls pass arguments starting at ARG1 (no thread argument).
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
  mr_if_needed(R3_ARG1, arg_1);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_3);
  call_VM_leaf(entry_point);
}

// Check whether instruction is a read access to the polling page
// which was emitted by load_from_polling_page(..).
// Expected shape: ld(r0, 0, ra). If a ucontext is supplied (Linux only)
// the effective address is computed and verified against the VM's poll
// address; the address is also reported via 'polling_address_ptr'.
bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
                                               address* polling_address_ptr) {
  if (!is_ld(instruction))
    return false; // It's not a ld. Fail.

  int rt = inv_rt_field(instruction);
  int ra = inv_ra_field(instruction);
  int ds = inv_ds_field(instruction);
  if (!(ds == 0 && ra != 0 && rt == 0)) {
    return false; // It's not a ld(r0, X, ra). Fail.
  }

  if (!ucontext) {
    // Set polling address.
    if (polling_address_ptr != NULL) {
      *polling_address_ptr = NULL;
    }
    return true; // No ucontext given. Can't check value of ra. Assume true.
  }

#ifdef LINUX
  // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
  ucontext_t* uc = (ucontext_t*) ucontext;
  // Set polling address.
  address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
  if (polling_address_ptr != NULL) {
    *polling_address_ptr = addr;
  }
  return os::is_poll_address(addr);
#else
  // Not on Linux, ucontext must be NULL.
  ShouldNotReachHere();
  return false;
#endif
}

// Check whether 'instruction' is a store to the memory serialization page
// (stw/stwu/stwx/stwux); the effective address is reconstructed from the
// register values in the signal ucontext (Linux only).
bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;

  if (is_stwx(instruction) || is_stwux(instruction)) {
    int ra = inv_ra_field(instruction);
    int rb = inv_rb_field(instruction);

    // look up content of ra and rb in ucontext
    address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];
    long rb_val=(long)uc->uc_mcontext.regs->gpr[rb];
    return os::is_memory_serialize_page(thread, ra_val+rb_val);
  } else if (is_stw(instruction) || is_stwu(instruction)) {
    int ra = inv_ra_field(instruction);
    int d1 = inv_d1_field(instruction);

    // look up content of ra in ucontext
    address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];
    return os::is_memory_serialize_page(thread, ra_val+d1);
  } else {
    return false;
  }
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return false;
#endif
}

// Touch the stack page at SP - offset with a single load or store so that
// a stack overflow is detected eagerly.
void MacroAssembler::bang_stack_with_offset(int offset) {
  // When increasing the stack, the old stack pointer will be written
  // to the new top of stack according to the PPC64 abi.
  // Therefore, stack banging is not necessary when increasing
  // the stack by <= os::vm_page_size() bytes.
  // When increasing the stack by a larger amount, this method is
  // called repeatedly to bang the intermediate pages.

  // Stack grows down, caller passes positive offset.
  assert(offset > 0, "must bang with positive offset");

  long stdoffset = -offset;

  if (is_simm(stdoffset, 16)) {
    // Signed 16 bit offset, a simple std is ok.
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, (int)(signed short)stdoffset, R1_SP);
    } else {
      std(R0,(int)(signed short)stdoffset, R1_SP);
    }
  } else if (is_simm(stdoffset, 31)) {
    // Offset needs addis + 16-bit displacement; R11 holds the high part.
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);

    Register tmp = R11;
    addis(tmp, R1_SP, hi);
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, lo, tmp);
    } else {
      std(R0, lo, tmp);
    }
  } else {
    ShouldNotReachHere();
  }
}

// If instruction is a stack bang of the form
//   std    R0,    x(Ry),       (see bang_stack_with_offset())
//   stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
// or stdux R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
// return the banged address. Otherwise, return 0.
address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;
  int rs = inv_rs_field(instruction);
  int ra = inv_ra_field(instruction);
  if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
      || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
      || (is_stdu(instruction) && rs == 1)) {
    int ds = inv_ds_field(instruction);
    // return banged address
    return ds+(address)uc->uc_mcontext.regs->gpr[ra];
  } else if (is_stdux(instruction) && rs == 1) {
    int rb = inv_rb_field(instruction);
    address sp = (address)uc->uc_mcontext.regs->gpr[1];
    long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    // Only a stdux updating SP downwards counts as a stack bang.
    return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
                                  : sp + rb_val; // banged address
  }
  return NULL; // not a stack bang
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return NULL;
#endif
}

// CmpxchgX sets condition register to cmpX(current, compare).
// 32-bit compare-and-swap implemented with an lwarx/stwcx. retry loop.
// 'semantics' selects the memory barriers emitted around the loop;
// if 'int_flag_success' is a real register it receives 1/0 for
// success/failure in addition to the condition register result.
void MacroAssembler::cmpxchgw(ConditionRegister flag, Register dest_current_value,
                              Register compare_value, Register exchange_value,
                              Register addr_base, int semantics, bool cmpxchgx_hint,
                              Register int_flag_success, bool contention_hint) {
  Label retry;
  Label failed;
  Label done;

  // Save one branch if result is returned via register and
  // result register is different from the other ones.
  bool use_result_reg    = (int_flag_success != noreg);
  bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
                            int_flag_success != exchange_value && int_flag_success != addr_base);

  if (use_result_reg && preset_result_reg) {
    li(int_flag_success, 0); // preset (assume cas failed)
  }

  // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
  if (contention_hint) { // Don't try to reserve if cmp fails.
    // Plain load and compare first; skip the reservation entirely on mismatch.
    lwz(dest_current_value, 0, addr_base);
    cmpw(flag, dest_current_value, compare_value);
    bne(flag, failed);
  }

  // release/fence semantics
  if (semantics & MemBarRel) {
    release();
  }

  // atomic emulation loop
  bind(retry);

  lwarx(dest_current_value, addr_base, cmpxchgx_hint);
  cmpw(flag, dest_current_value, compare_value);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(flag, failed);
  } else {
    bne(                  flag, failed);
  }
  // branch to done  => (flag == ne), (dest_current_value != compare_value)
  // fall through    => (flag == eq), (dest_current_value == compare_value)

  stwcx_(exchange_value, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
  // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)

  // Result in register (must do this at the end because int_flag_success can be the
  // same register as one above).
  if (use_result_reg) {
    li(int_flag_success, 1);
  }

  if (semantics & MemBarFenceAfter) {
    fence();
  } else if (semantics & MemBarAcq) {
    isync();
  }

  if (use_result_reg && !preset_result_reg) {
    b(done);
  }

  bind(failed);
  if (use_result_reg && !preset_result_reg) {
    li(int_flag_success, 0);
  }

  bind(done);
  // (flag == ne) => (dest_current_value != compare_value), (!swapped)
  // (flag == eq) => (dest_current_value == compare_value), ( swapped)
}

// Performs atomic compare exchange:
//   if (compare_value == *addr_base)
//     *addr_base = exchange_value
//     int_flag_success = 1;
//   else
//     int_flag_success = 0;
//
// ConditionRegister flag       = cmp(compare_value, *addr_base)
// Register dest_current_value  = *addr_base
// Register compare_value       Used to compare with value in memory
// Register exchange_value      Written to memory if compare_value == *addr_base
// Register addr_base           The memory location to compareXChange
// Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
//
// To avoid the costly compare exchange the value is tested beforehand.
// Several special cases exist to avoid that unnecessary information is generated.
//
void MacroAssembler::cmpxchgd(ConditionRegister flag,
                              Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
                              Register addr_base, int semantics, bool cmpxchgx_hint,
                              Register int_flag_success, Label* failed_ext, bool contention_hint) {
  Label retry;
  Label failed_int;
  // Callers may supply their own failure label; otherwise use the local one.
  Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
  Label done;

  // Save one branch if result is returned via register and result register is different from the other ones.
  bool use_result_reg    = (int_flag_success!=noreg);
  bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
                            int_flag_success!=exchange_value && int_flag_success!=addr_base);
  assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");

  if (use_result_reg && preset_result_reg) {
    li(int_flag_success, 0); // preset (assume cas failed)
  }

  // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
  if (contention_hint) { // Don't try to reserve if cmp fails.
    ld(dest_current_value, 0, addr_base);
    cmpd(flag, compare_value, dest_current_value);
    bne(flag, failed);
  }

  // release/fence semantics
  if (semantics & MemBarRel) {
    release();
  }

  // atomic emulation loop
  bind(retry);

  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  cmpd(flag, compare_value, dest_current_value);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(flag, failed);
  } else {
    bne(                  flag, failed);
  }

  stdcx_(exchange_value, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
  } else {
    bne(                  CCR0, retry); // stXcx_ sets CCR0
  }

  // result in register (must do this at the end because int_flag_success can be the same register as one above)
  if (use_result_reg) {
    li(int_flag_success, 1);
  }

  if (semantics & MemBarFenceAfter) {
    fence();
  } else if (semantics & MemBarAcq) {
    isync();
  }

  if (use_result_reg && !preset_result_reg) {
    b(done);
  }

  bind(failed_int);
  if (use_result_reg && !preset_result_reg) {
    li(int_flag_success, 0);
  }

  bind(done);
  // (flag == ne) => (dest_current_value != compare_value), (!swapped)
  // (flag == eq) => (dest_current_value == compare_value), ( swapped)
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Register sethi_temp,
                                             Label& L_no_such_interface) {
  assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable).
  int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int log_vte_size= exact_log2(vtableEntry::size() * wordSize);

  lwz(scan_temp, InstanceKlass::vtable_length_offset() * wordSize, recv_klass);
  // %%% We should store the aligned, prescaled offset in the klassoop.
  // Then the next several instructions would fold away.

  sldi(scan_temp, scan_temp, log_vte_size);
  addi(scan_temp, scan_temp, vtable_base);
  add(scan_temp, recv_klass, scan_temp);

  // Adjust recv_klass by scaled itable_index, so we can free itable_index.
  if (itable_index.is_register()) {
    Register itable_offset = itable_index.as_register();
    sldi(itable_offset, itable_offset, logMEsize);
    if (itentry_off) addi(itable_offset, itable_offset, itentry_off);
    add(recv_klass, itable_offset, recv_klass);
  } else {
    long itable_offset = (long)itable_index.as_constant();
    load_const_optimized(sethi_temp, (itable_offset<<logMEsize)+itentry_off); // static address, no relocation
    add(recv_klass, sethi_temp, recv_klass);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  // The loop is peeled once (peel == 1) so the first probe falls through
  // on a hit rather than branching.
  for (int peel = 1; peel >= 0; peel--) {
    // %%%% Could load both offset and interface in one ldx, if they were
    // in the opposite order. This would save a load.
    ld(method_result, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);

    // Check that this entry is non-null. A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cmpd(CCR0, method_result, intf_klass);

    if (peel) {
      beq(CCR0, found_method);
    } else {
      bne(CCR0, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel) break;

    bind(search);

    cmpdi(CCR0, method_result, 0);
    beq(CCR0, L_no_such_interface);
    addi(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
  lwz(scan_temp, ito_offset, scan_temp);
  ldx(method_result, scan_temp, recv_klass);
}

// virtual method calling
// Load the target Method* for a vtable dispatch into R19_method:
// R19_method = recv_klass->vtable[vtable_index].method.
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {

  assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());

  const int base = InstanceKlass::vtable_start_offset() * wordSize;
  assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");

  if (vtable_index.is_register()) {
    sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
    add(recv_klass, vtable_index.as_register(), recv_klass);
  } else {
    addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
  }
  ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);
}

/////////////////////////////////////////// subtype checking ////////////////////////////////////////////

// Fast path of the sub_klass-instanceof-super_klass check: identity test
// plus a probe of the supertype display / secondary super cache. Branches
// to L_success / L_failure / L_slow_path; a NULL label means fall through.
void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp1_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {

  const Register check_cache_offset = temp1_reg;
  const Register cached_super       = temp2_reg;

  assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);

  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());

  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1 ||
         (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
         "at most one NULL in the batch, usually");

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface. Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmpd(CCR0, sub_klass, super_klass);
  beq(CCR0, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    // The super check offset is always positive...
    lwz(check_cache_offset, sco_offset, super_klass);
    super_check_offset = RegisterOrConstant(check_cache_offset);
    // super_check_offset is register.
    assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
  }
  // The loaded value is the offset from KlassOopDesc.

  ld(cached_super, super_check_offset, sub_klass);
  cmpd(CCR0, cached_super, super_klass);

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

#define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }

  if (super_check_offset.is_register()) {
    beq(CCR0, *L_success);
    cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      beq(CCR0, *L_slow_path);
    } else {
      bne(CCR0, *L_failure);
      FINAL_JUMP(*L_slow_path);
    }
  } else {
    if (super_check_offset.as_constant() == sc_offset) {
      // Need a slow path; fast failure is impossible.
      if (L_slow_path == &L_fallthrough) {
        beq(CCR0, *L_success);
      } else {
        bne(CCR0, *L_slow_path);
        FINAL_JUMP(*L_success);
      }
    } else {
      // No slow path; it's a fast decision.
      if (L_failure == &L_fallthrough) {
        beq(CCR0, *L_success);
      } else {
        bne(CCR0, *L_failure);
        FINAL_JUMP(*L_success);
      }
    }
  }

  bind(L_fallthrough);
#undef FINAL_JUMP
}

// Slow path of the subtype check: linearly scan sub_klass's secondary
// supers array for super_klass, updating the secondary super cache on a
// hit. On a hit 'result_reg' (if given) is set to 0, on a miss to 1.
void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp1_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Register result_reg) {
  const Register array_ptr = temp1_reg; // current value from cache array
  const Register temp      = temp2_reg;

  assert_different_registers(sub_klass, super_klass, array_ptr, temp);

  int source_offset = in_bytes(Klass::secondary_supers_offset());
  int target_offset = in_bytes(Klass::secondary_super_cache_offset());

  int length_offset = Array<Klass*>::length_offset_in_bytes();
  int base_offset   = Array<Klass*>::base_offset_in_bytes();

  Label hit, loop, failure, fallthru;

  ld(array_ptr, source_offset, sub_klass);

  // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
  lwz(temp, length_offset, array_ptr);
  cmpwi(CCR0, temp, 0);
  beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0

  mtctr(temp); // load ctr

  bind(loop);
  // Oops in table are NO MORE compressed.
  ld(temp, base_offset, array_ptr);
  cmpd(CCR0, temp, super_klass);
  beq(CCR0, hit);
  addi(array_ptr, array_ptr, BytesPerWord);
  bdnz(loop);

  bind(failure);
  if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss)
  b(fallthru);

  bind(hit);
  std(super_klass, target_offset, sub_klass); // save result to cache
  if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
  if (L_success != NULL) { b(*L_success); }
  else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided

  bind(fallthru);
}

// Try fast path, then go to slow one if not successful
void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp1_reg,
                                         Register temp2_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
  bind(L_failure); // Fallthru if not successful.
}

// Branch to 'wrong_method_type' unless mh_reg's method type equals mtype_reg.
void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg,
                                              Register temp_reg,
                                              Label& wrong_method_type) {
  assert_different_registers(mtype_reg, mh_reg, temp_reg);
  // Compare method type against that of the receiver.
  load_heap_oop_not_null(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg);
  cmpd(CCR0, temp_reg, mtype_reg);
  bne(CCR0, wrong_method_type);
}

// Computes the byte offset of interpreter stack slot 'arg_slot' plus
// 'extra_slot_offset' slots. Returns a constant when the slot index is a
// constant; otherwise emits the computation into temp_reg and returns the
// register.
RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
                                                   Register temp_reg,
                                                   int extra_slot_offset) {
  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
  int stackElementSize = Interpreter::stackElementSize;
  int offset = extra_slot_offset * stackElementSize;
  if (arg_slot.is_constant()) {
    offset += arg_slot.as_constant() * stackElementSize;
    return offset;
  } else {
    assert(temp_reg != noreg, "must specify");
    // slot index -> byte offset (stackElementSize is a power of two here,
    // otherwise exact_log2 would assert).
    sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
    if (offset != 0)
      addi(temp_reg, temp_reg, offset);
    return temp_reg;
  }
}

// Supports temp2_reg = R0.
void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg,
                                          Register mark_reg, Register temp_reg,
                                          Register temp2_reg, Label& done, Label* slow_case) {
  assert(UseBiasedLocking, "why call this otherwise?");

#ifdef ASSERT
  assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg);
#endif

  Label cas_label;

  // Branch to done if fast path fails and no slow_case provided.
  Label *slow_case_int = (slow_case != NULL) ? slow_case : &done;

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits,
         "biased locking makes assumptions about bit layout");

  if (PrintBiasedLockingStatistics) {
    load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg);
    lwzx(temp_reg, temp2_reg);
    addi(temp_reg, temp_reg, 1);
    stwx(temp_reg, temp2_reg);
  }

  // Not the biased-lock pattern at all -> go straight to the plain CAS lock.
  andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place);
  cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
  bne(cr_reg, cas_label);

  load_klass(temp_reg, obj_reg);

  // temp_reg := mark ^ (thread | prototype_header), with the age bits
  // masked out by temp2_reg. A zero result means the object is already
  // biased to us in the current epoch.
  load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place));
  ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
  orr(temp_reg, R16_thread, temp_reg);
  xorr(temp_reg, mark_reg, temp_reg);
  andr(temp_reg, temp_reg, temp2_reg);
  cmpdi(cr_reg, temp_reg, 0);
  if (PrintBiasedLockingStatistics) {
    Label l;
    bne(cr_reg, l);
    load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr());
    lwzx(mark_reg, temp2_reg);
    addi(mark_reg, mark_reg, 1);
    stwx(mark_reg, temp2_reg);
    // restore mark_reg
    ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
    bind(l);
  }
  beq(cr_reg, done);

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmpwi(cr_reg, temp2_reg, 0);
  bne(cr_reg, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.

  int shift_amount = 64 - markOopDesc::epoch_shift;
  // rotate epoch bits to right (little) end and set other bits to 0
  // [ big part | epoch | little part ] -> [ 0..0 | epoch ]
  rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits);
  // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented
  bne(CCR0, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place |
                            markOopDesc::age_mask_in_place |
                            markOopDesc::epoch_mask_in_place));
  orr(temp_reg, R16_thread, mark_reg);

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
  cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
           /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
           /*where=*/obj_reg,
           MacroAssembler::MemBarAcq,
           MacroAssembler::cmpxchgx_hint_acquire_lock(),
           noreg, slow_case_int); // bail out if failed

  // If the biasing toward our thread failed, this means that
  // another thread succeeded in biasing it toward itself and we
  // need to revoke that bias. The revocation will occur in the
  // interpreter runtime in the slow case.
  if (PrintBiasedLockingStatistics) {
    load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg);
    lwzx(temp_reg, temp2_reg);
    addi(temp_reg, temp_reg, 1);
    stwx(temp_reg, temp2_reg);
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  load_klass(temp_reg, obj_reg);
  // New header: current age bits | our thread | prototype (fresh epoch).
  andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
  orr(temp2_reg, R16_thread, temp2_reg);
  ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
  orr(temp_reg, temp2_reg, temp_reg);

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
           /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
           /*where=*/obj_reg,
           MacroAssembler::MemBarAcq,
           MacroAssembler::cmpxchgx_hint_acquire_lock(),
           noreg, slow_case_int); // bail out if failed

  // If the biasing toward our thread failed, this means that
  // another thread succeeded in biasing it toward itself and we
  // need to revoke that bias. The revocation will occur in the
  // interpreter runtime in the slow case.
  if (PrintBiasedLockingStatistics) {
    load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg);
    lwzx(temp_reg, temp2_reg);
    addi(temp_reg, temp_reg, 1);
    stwx(temp_reg, temp2_reg);
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  load_klass(temp_reg, obj_reg);
  ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
  andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
  orr(temp_reg, temp_reg, temp2_reg);

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
  // No slow-case label here: whatever the CAS outcome, the bias bit is gone
  // and we can fall through to the CAS-based lock (see comment below).
  cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
           /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
           /*where=*/obj_reg,
           MacroAssembler::MemBarAcq,
           MacroAssembler::cmpxchgx_hint_acquire_lock());

  // reload markOop in mark_reg before continuing with lightweight locking
  ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);

  // Fall through to the normal CAS-based lock, because no matter what
  // the result of the above CAS, some thread must have succeeded in
  // removing the bias bit from the object's header.
  if (PrintBiasedLockingStatistics) {
    Label l;
    bne(cr_reg, l);
    load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg);
    lwzx(temp_reg, temp2_reg);
    addi(temp_reg, temp_reg, 1);
    stwx(temp_reg, temp2_reg);
    bind(l);
  }

  bind(cas_label);
}

void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) {
  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.

  ld(temp_reg, 0, mark_addr);
  andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);

  // Biased pattern still present -> unlock is a no-op, skip to done.
  cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
  beq(cr_reg, done);
}

// allocation (for C1)
// Eden allocation is not implemented on this platform: unconditionally
// branch to the slow path (all other parameters are ignored).
void MacroAssembler::eden_allocate(
  Register obj,                      // result: pointer to object after successful allocation
  Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
  int      con_size_in_bytes,        // object size in bytes if known at compile time
  Register t1,                       // temp register
  Register t2,                       // temp register
  Label&   slow_case                 // continuation point if fast allocation fails
) {
  b(slow_case);
}

void MacroAssembler::tlab_allocate(
  Register obj,                      // result: pointer to object after successful allocation
  Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
  int      con_size_in_bytes,        // object size in bytes if known at compile time
  Register t1,                       // temp register
  Label&   slow_case                 // continuation point if fast allocation fails
) {
  // make sure arguments make sense
  assert_different_registers(obj, var_size_in_bytes, t1);
  assert(0 <= con_size_in_bytes && is_simm13(con_size_in_bytes), "illegal object size");
  assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");

  const Register new_top = t1;
  //verify_tlab(); not implemented

  ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
  ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
  if (var_size_in_bytes == noreg) {
    addi(new_top, obj, con_size_in_bytes);
  } else {
    add(new_top, obj, var_size_in_bytes);
  }
  // Unsigned compare of new top against TLAB end; overflow -> slow path.
  cmpld(CCR0, new_top, R0);
  bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);

#ifdef ASSERT
  // make sure new free pointer is properly aligned
  {
    Label L;
    andi_(R0, new_top, MinObjAlignmentInBytesMask);
    beq(CCR0, L);
    stop("updated TLAB free is not properly aligned", 0x934);
    bind(L);
  }
#endif // ASSERT

  // update the tlab top pointer
  std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
  //verify_tlab(); not implemented
}
void MacroAssembler::tlab_refill(Label& retry_tlab, Label& try_eden, Label& slow_case) {
  unimplemented("tlab_refill");
}
void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
  unimplemented("incr_allocated_bytes");
}

// Emits a trampoline stub (load target from constant pool, mtctr, bctr)
// for the call instruction at insts_call_instruction_offset. Returns the
// stub's address, or NULL if the code cache is full.
address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
                                             int insts_call_instruction_offset, Register Rtoc) {
  // Start the stub.
  address stub = start_a_stub(64);
  if (stub == NULL) { return NULL; } // CodeCache full: bail out

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // For java_to_interp stubs we use R11_scratch1 as scratch register
  // and in call trampoline stubs we use R12_scratch2. This way we
  // can distinguish them (see is_NativeCallTrampolineStub_at()).
  Register reg_scratch = R12_scratch2;

  // Now, create the trampoline stub's code:
  // - load the TOC
  // - load the call target from the constant pool
  // - call
  if (Rtoc == noreg) {
    calculate_address_from_global_toc(reg_scratch, method_toc());
    Rtoc = reg_scratch;
  }

  ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
  mtctr(reg_scratch);
  bctr();

  const address stub_start_addr = addr_at(stub_start_offset);

  // Assert that the encoded destination_toc_offset can be identified and that it is correct.
  assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
         "encoded offset into the constant pool must match");
  // Trampoline_stub_size should be good.
  assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  // End the stub.
  end_a_stub();
  return stub;
}

// TM on PPC64.
// Atomically add simm16 to the 64-bit value at *addr (ldarx/stdcx. retry
// loop); 'result' returns the updated value.
void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
  Label retry;
  bind(retry);
  ldarx(result, addr, /*hint*/ false);
  addi(result, result, simm16);
  stdcx_(result, addr);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
  } else {
    bne(                  CCR0, retry); // stXcx_ sets CCR0
  }
}

// Atomically OR uimm16 into the 32-bit value at *addr (lwarx/stwcx. retry
// loop); 'result' returns the updated value.
void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
  Label retry;
  bind(retry);
  lwarx(result, addr, /*hint*/ false);
  ori(result, result, uimm16);
  stwcx_(result, addr);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
  } else {
    bne(                  CCR0, retry); // stXcx_ sets CCR0
  }
}

#if INCLUDE_RTM_OPT

// Update rtm_counters based on abort status
// input: abort_status
//        rtm_counters (RTMLockingCounters*)
// Note: rtm_counters_Reg is reused as a temp inside and restored at the end.
void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
  // Mapping to keep PreciseRTMLockingStatistics similar to x86.
  // x86 ppc (! means inverted, ? means not the same)
  //  0   31  Set if abort caused by XABORT instruction.
  //  1  ! 7  If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
  //  2   13  Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
  //  3   10  Set if an internal buffer overflowed.
  //  4  ?12  Set if a debug breakpoint was hit.
  //  5  ?32  Set if an abort occurred during execution of a nested transaction.
  const int tm_failure_bit[] = {Assembler::tm_tabort, // Note: Seems like signal handler sets this, too.
                                Assembler::tm_failure_persistent, // inverted: transient
                                Assembler::tm_trans_cf,
                                Assembler::tm_footprint_of,
                                Assembler::tm_non_trans_cf,
                                Assembler::tm_suspended};
  const bool tm_failure_inv[] = {false, true, false, false, false, false};
  assert(sizeof(tm_failure_bit)/sizeof(int) == RTMLockingCounters::ABORT_STATUS_LIMIT, "adapt mapping!");

  const Register addr_Reg = R0;
  // Keep track of offset to where rtm_counters_Reg had pointed to.
  int counters_offs = RTMLockingCounters::abort_count_offset();
  addi(addr_Reg, rtm_counters_Reg, counters_offs);
  const Register temp_Reg = rtm_counters_Reg;

  //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
  ldx(temp_Reg, addr_Reg);
  addi(temp_Reg, temp_Reg, 1);
  stdx(temp_Reg, addr_Reg);

  if (PrintPreciseRTMLockingStatistics) {
    int counters_offs_delta = RTMLockingCounters::abortX_count_offset() - counters_offs;

    //mftexasr(abort_status); done by caller
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      counters_offs += counters_offs_delta;
      li(temp_Reg, counters_offs_delta); // can't use addi with R0
      add(addr_Reg, addr_Reg, temp_Reg); // point to next counter
      counters_offs_delta = sizeof(uintx);

      Label check_abort;
      // Isolate TEXASR failure bit i; rldicr_ sets CCR0 for the branch below.
      rldicr_(temp_Reg, abort_status, tm_failure_bit[i], 0);
      if (tm_failure_inv[i]) {
        bne(CCR0, check_abort);
      } else {
        beq(CCR0, check_abort);
      }
      //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
      ldx(temp_Reg, addr_Reg);
      addi(temp_Reg, temp_Reg, 1);
      stdx(temp_Reg, addr_Reg);
      bind(check_abort);
    }
  }
  li(temp_Reg, -counters_offs); // can't use addi with R0
  add(rtm_counters_Reg, addr_Reg, temp_Reg); // restore
}

// Branch if (random & (count-1) != 0), count is 2^n
// tmp and CR0 are killed
void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
  mftb(tmp); // time base acts as the random source
  andi_(tmp, tmp, count-1);
  bne(CCR0, brLabel);
}

// Perform abort ratio calculation, set no_rtm bit if high ratio.
// input:  rtm_counters_Reg (RTMLockingCounters* address) - KILLED
void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
                                                 RTMLockingCounters* rtm_counters,
                                                 Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation.
    ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());
    cmpdi(CCR0, rtm_counters_Reg, 0);
    beq(CCR0, L_done);
    load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold.
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count * RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
  ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
  cmpdi(CCR0, R0, RTMAbortThreshold);
  blt(CCR0, L_check_always_rtm2);
  mulli(R0, R0, 100);

  const Register tmpReg = rtm_counters_Reg;
  ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
  mulli(tmpReg, tmpReg, RTMTotalCountIncrRate);
  mulli(tmpReg, tmpReg, RTMAbortRatio);
  cmpd(CCR0, R0, tmpReg);
  blt(CCR0, L_check_always_rtm1); // jump to reload
  if (method_data != NULL) {
    // Set rtm_state to "no rtm" in MDO.
    // Not using a metadata relocation. Method and Class Loader are kept alive anyway.
    // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
    load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
    atomic_ori_int(R0, tmpReg, NoRTM);
  }
  b(L_done);

  bind(L_check_always_rtm1);
  load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
  bind(L_check_always_rtm2);
  ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
  cmpdi(CCR0, tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
  blt(CCR0, L_done);
  if (method_data != NULL) {
    // Set rtm_state to "always rtm" in MDO.
    // Not using a metadata relocation. See above.
    load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
    atomic_ori_int(R0, tmpReg, UseRTM);
  }
  bind(L_done);
}

// Update counters and perform abort ratio calculation.
// input: abort_status_Reg
void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
                                   RTMLockingCounters* rtm_counters,
                                   Metadata* method_data,
                                   bool profile_rtm) {

  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
  // Update rtm counters based on state at abort.
  // Reads abort_status_Reg, updates flags.
  assert_different_registers(abort_status_Reg, temp_Reg);
  load_const_optimized(temp_Reg, (address)rtm_counters, R0);
  rtm_counters_update(abort_status_Reg, temp_Reg);
  if (profile_rtm) {
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
  }
}

// Retry on abort if abort's status indicates non-persistent failure.
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
                                             Label& retryLabel, Label* checkRetry) {
  Label doneRetry;
  // Isolate TEXASR's failure-persistent bit; a persistent failure will not
  // succeed on retry, so give up immediately.
  rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
  bne(CCR0, doneRetry);
  if (checkRetry) { bind(*checkRetry); }
  addic_(retry_count_Reg, retry_count_Reg, -1);
  blt(CCR0, doneRetry); // retry budget exhausted
  smt_yield(); // Can't use wait(). No permission (SIGILL).
  b(retryLabel);
  bind(doneRetry);
}

// Spin and retry if lock is busy.
// inputs: box_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
// CTR is killed
void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
  Label SpinLoop, doneRetry;
  addic_(retry_count_Reg, retry_count_Reg, -1);
  blt(CCR0, doneRetry); // retry budget exhausted
  li(R0, RTMSpinLoopCount);
  mtctr(R0);

  // Spin (bounded by CTR) until the monitor's owner field reads 0,
  // then go back and retry the lock.
  bind(SpinLoop);
  smt_yield(); // Can't use waitrsv(). No permission (SIGILL).
  bdz(retryLabel);
  ld(R0, 0, owner_addr_Reg);
  cmpdi(CCR0, R0, 0);
  bne(CCR0, SpinLoop);
  b(retryLabel);

  bind(doneRetry);
}

// Use RTM for normal stack locks.
// Input: objReg (object to lock)
// tmp is killed (used for counter updates and, after abort, the TEXASR value).
void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
                                       Register obj, Register mark_word, Register tmp,
                                       Register retry_on_abort_count_Reg,
                                       RTMLockingCounters* stack_rtm_counters,
                                       Metadata* method_data, bool profile_rtm,
                                       Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  andi_(R0, mark_word, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
  bne(CCR0, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // Sample only 1/RTMTotalCountIncrRate of the entries.
      branch_on_random_using_tb(tmp, (int)RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
    load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
    //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically
    ldx(mark_word, tmp);
    addi(mark_word, mark_word, 1);
    stdx(mark_word, tmp);
    bind(L_noincrement);
  }
  tbegin_(); // Start transaction; CR0.eq below takes the failure/abort path.
  beq(CCR0, L_on_abort);
  ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);         // Reload in transaction, conflicts need to be tracked.
  andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
  cmpwi(flag, R0, markOopDesc::unlocked_value);                // bits = 001 unlocked
  beq(flag, DONE_LABEL);                                       // all done if unlocked

  if (UseRTMXendForLockBusy) {
    tend_();
    b(L_decrement_retry);
  } else {
    tabort_();
  }
  bind(L_on_abort);
  const Register abort_status_Reg = tmp;
  mftexasr(abort_status_Reg);
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
  }
  ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
  if (RTMRetryCount > 0) {
    // Retry on lock abort if abort status is not permanent.
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
  } else {
    bind(L_decrement_retry);
  }
}

// Use RTM for inflating locks
// inputs: obj       (object to lock)
//         mark_word (current header - KILLED)
//         boxReg    (on-stack box address (displaced header location) - KILLED)
void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
                                          Register obj, Register mark_word, Register boxReg,
                                          Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
                                          RTMLockingCounters* rtm_counters,
                                          Metadata* method_data, bool profile_rtm,
                                          Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  // Clean monitor_value bit to get valid pointer.
  int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value;

  // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark().
  std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
  const Register tmpReg = boxReg;
  const Register owner_addr_Reg = mark_word;
  // mark_word still carries the monitor_value tag; owner_offset compensates.
  addi(owner_addr_Reg, mark_word, owner_offset);

  if (RTMRetryCount > 0) {
    load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy.
    load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // Sample only 1/RTMTotalCountIncrRate of the entries.
      branch_on_random_using_tb(R0, (int)RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
    //atomic_inc_ptr(R0, tmpReg); We don't increment atomically
    ldx(tmpReg, R0);
    addi(tmpReg, tmpReg, 1);
    stdx(tmpReg, R0);
    bind(L_noincrement);
  }
  tbegin_(); // Start transaction; CR0.eq below takes the failure/abort path.
  beq(CCR0, L_on_abort);
  // We don't reload mark word. Will only be reset at safepoint.
  ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
  cmpdi(flag, R0, 0);
  beq(flag, DONE_LABEL);

  if (UseRTMXendForLockBusy) {
    tend_();
    b(L_decrement_retry);
  } else {
    tabort_();
  }
  bind(L_on_abort);
  const Register abort_status_Reg = tmpReg;
  mftexasr(abort_status_Reg);
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
    // Restore owner_addr_Reg
    ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
#ifdef ASSERT
    andi_(R0, mark_word, markOopDesc::monitor_value);
    asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint.
#endif
    addi(owner_addr_Reg, mark_word, owner_offset);
  }
  if (RTMRetryCount > 0) {
    // Retry on lock abort if abort status is not permanent.
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  // Appears unlocked - try to swing _owner from null to non-null.
  cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
           MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
           MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);

  if (RTMRetryCount > 0) {
    // success done else retry
    b(DONE_LABEL);
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
  } else {
    bind(L_decrement_retry);
  }
}

#endif //  INCLUDE_RTM_OPT

// "The box" is the space on the stack where we copy the object mark.
void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
                                               Register temp, Register displaced_header, Register current_header,
                                               bool try_bias,
                                               RTMLockingCounters* rtm_counters,
                                               RTMLockingCounters* stack_rtm_counters,
                                               Metadata* method_data,
                                               bool use_rtm, bool profile_rtm) {
  assert_different_registers(oop, box, temp, displaced_header, current_header);
  assert(flag != CCR0, "bad condition register");
  Label cont;
  Label object_has_monitor;
  Label cas_failed;

  // Load markOop from object into displaced_header.
  ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);


  // Always do locking in runtime.
  if (EmitSync & 0x01) {
    cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
    return;
  }

  if (try_bias) {
    biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
                      stack_rtm_counters, method_data, profile_rtm,
                      cont, object_has_monitor);
  }
#endif // INCLUDE_RTM_OPT

  // Handle existing monitor.
  if ((EmitSync & 0x02) == 0) {
    // The object has an existing monitor iff (mark & monitor_value) != 0.
    andi_(temp, displaced_header, markOopDesc::monitor_value);
    bne(CCR0, object_has_monitor);
  }

  // Set displaced_header to be (markOop of object | UNLOCK_VALUE).
  ori(displaced_header, displaced_header, markOopDesc::unlocked_value);

  // Load Compare Value application register.

  // Initialize the box. (Must happen before we update the object mark!)
  std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);

  // Must fence, otherwise, preceding store(s) may float below cmpxchg.
  // Compare object markOop with mark and if equal exchange scratch1 with object markOop.
  cmpxchgd(/*flag=*/flag,
           /*current_value=*/current_header,
           /*compare_value=*/displaced_header,
           /*exchange_value=*/box,
           /*where=*/oop,
           MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
           MacroAssembler::cmpxchgx_hint_acquire_lock(),
           noreg,
           &cas_failed,
           /*check without membar and ldarx first*/true);
  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // If the compare-and-exchange succeeded, then we found an unlocked
  // object and we have now locked it.
  b(cont);

  bind(cas_failed);
  // We did not see an unlocked object so try the fast recursive case.

  // Check if the owner is self by comparing the value in the markOop of object
  // (current_header) with the stack pointer.
  sub(current_header, current_header, R1_SP);
  load_const_optimized(temp, ~(os::vm_page_size()-1) | markOopDesc::lock_mask_in_place);

  and_(R0/*==0?*/, current_header, temp);
  // If condition is true we are cont and hence we can store 0 as the
  // displaced header in the box, which indicates that it is a recursive lock.
  mcrf(flag,CCR0);
  std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);

  // Handle existing monitor.
  if ((EmitSync & 0x02) == 0) {
    b(cont);

    bind(object_has_monitor);
    // The object's monitor m is unlocked iff m->owner == NULL,
    // otherwise m->owner may contain a thread or a stack address.

#if INCLUDE_RTM_OPT
    // Use the same RTM locking code in 32- and 64-bit VM.
    if (use_rtm) {
      rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
                           rtm_counters, method_data, profile_rtm, cont);
    } else {
#endif // INCLUDE_RTM_OPT

    // Try to CAS m->owner from NULL to current thread.
    addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value);
    cmpxchgd(/*flag=*/flag,
             /*current_value=*/current_header,
             /*compare_value=*/(intptr_t)0,
             /*exchange_value=*/R16_thread,
             /*where=*/temp,
             MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
             MacroAssembler::cmpxchgx_hint_acquire_lock());

    // Store a non-null value into the box.
    std(box, BasicLock::displaced_header_offset_in_bytes(), box);

#   ifdef ASSERT
    bne(flag, cont);
    // We have acquired the monitor, check some invariants.
    addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
    // Invariant 1: _recursions should be 0.
    //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
    asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
                            "monitor->_recursions should be 0", -1);
    // Invariant 2: OwnerIsThread shouldn't be 0.
    //assert(ObjectMonitor::OwnerIsThread_size_in_bytes() == 4, "unexpected size");
    //asm_assert_mem4_isnot_zero(ObjectMonitor::OwnerIsThread_offset_in_bytes(), temp,
    //                           "monitor->OwnerIsThread shouldn't be 0", -1);
#   endif

#if INCLUDE_RTM_OPT
    } // use_rtm()
#endif
  }

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
}

// Fast path for compiler-emitted monitorexit.
// On exit: flag == EQ indicates success, flag == NE failure.
// temp, displaced_header and current_header are scratch registers.
void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
                                                 Register temp, Register displaced_header, Register current_header,
                                                 bool try_bias, bool use_rtm) {
  assert_different_registers(oop, box, temp, displaced_header, current_header);
  assert(flag != CCR0, "bad condition register");
  Label cont;
  Label object_has_monitor;

  // Always do locking in runtime.
  if (EmitSync & 0x01) {
    cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
    return;
  }

  if (try_bias) {
    biased_locking_exit(flag, oop, current_header, cont);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
    Label L_regular_unlock;
    ld(current_header, oopDesc::mark_offset_in_bytes(), oop); // fetch markword
    andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
    cmpwi(flag, R0, markOopDesc::unlocked_value);                     // bits = 001 unlocked
    bne(flag, L_regular_unlock);                                      // else RegularLock
    tend_();                                                          // otherwise end...
    b(cont);                                                          // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  // Find the lock address and load the displaced header from the stack.
  ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);

  // If the displaced header is 0, we have a recursive unlock.
  cmpdi(flag, displaced_header, 0);
  beq(flag, cont);

  // Handle existing monitor.
  if ((EmitSync & 0x02) == 0) {
    // The object has an existing monitor iff (mark & monitor_value) != 0.
    RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
    ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
    andi_(R0, current_header, markOopDesc::monitor_value);
    bne(CCR0, object_has_monitor);
  }

  // Check if it is still a light weight lock, this is true if we see
  // the stack address of the basicLock in the markOop of the object.
  // Cmpxchg sets flag to cmpd(current_header, box).
  cmpxchgd(/*flag=*/flag,
           /*current_value=*/current_header,
           /*compare_value=*/box,
           /*exchange_value=*/displaced_header,
           /*where=*/oop,
           MacroAssembler::MemBarRel,
           MacroAssembler::cmpxchgx_hint_release_lock(),
           noreg,
           &cont);

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  if ((EmitSync & 0x02) == 0) {
    b(cont);

    bind(object_has_monitor);
    addi(current_header, current_header, -markOopDesc::monitor_value); // monitor
    ld(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);

    // It's inflated.
#if INCLUDE_RTM_OPT
    if (use_rtm) {
      Label L_regular_inflated_unlock;
      // Clean monitor_value bit to get valid pointer
      cmpdi(flag, temp, 0);
      bne(flag, L_regular_inflated_unlock);
      tend_();
      b(cont);
      bind(L_regular_inflated_unlock);
    }
#endif

    ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
    xorr(temp, R16_thread, temp);      // Will be 0 if we are the owner.
    orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions.
    cmpdi(flag, temp, 0);
    bne(flag, cont);

    ld(temp, ObjectMonitor::EntryList_offset_in_bytes(), current_header);
    ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
    orr(temp, temp, displaced_header); // Will be 0 if both are 0.
    cmpdi(flag, temp, 0);
    bne(flag, cont);
    release();
    // temp is known to be 0 here (checked above); store it to release _owner.
    std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
  }

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
}

// Write serialization page so VM thread can do a pseudo remote membar.
// We use the current thread pointer to calculate a thread specific
// offset to write to within the page. This minimizes bus traffic
// due to cache line collision.
void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) {
  // Thread-specific offset within the serialization page.
  srdi(tmp2, thread, os::get_serialize_page_shift_count());

  int mask = os::vm_page_size() - sizeof(int);
  if (Assembler::is_simm(mask, 16)) {
    andi(tmp2, tmp2, mask);
  } else {
    lis(tmp1, (int)((signed short) (mask >> 16)));
    ori(tmp1, tmp1, mask & 0x0000ffff);
    andr(tmp2, tmp2, tmp1);
  }

  load_const(tmp1, (long) os::get_memory_serialize_page());
  release();
  stwx(R0, tmp1, tmp2);
}


// GC barrier helper macros

// Write the card table byte if needed.
void MacroAssembler::card_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp) {
  CardTableModRefBS* bs =
    barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set());
  assert(bs->kind() == BarrierSet::CardTableForRS ||
         bs->kind() == BarrierSet::CardTableExtension, "wrong barrier");
#ifdef ASSERT
  cmpdi(CCR0, Rnew_val, 0);
  asm_assert_ne("null oop not allowed", 0x321);
#endif
  card_table_write(bs->byte_map_base, Rtmp, Rstore_addr);
}

// Write the card table byte.
// Kills Robj (used to compute the card index) and Rtmp.
void MacroAssembler::card_table_write(jbyte* byte_map_base, Register Rtmp, Register Robj) {
  assert_different_registers(Robj, Rtmp, R0);
  load_const_optimized(Rtmp, (address)byte_map_base, R0);
  srdi(Robj, Robj, CardTableModRefBS::card_shift);
  li(R0, 0); // dirty
  if (UseConcMarkSweepGC) membar(Assembler::StoreStore);
  stbx(R0, Rtmp, Robj);
}

#if INCLUDE_ALL_GCS
// General G1 pre-barrier generator.
// Goal: record the previous value if it is not null.
void MacroAssembler::g1_write_barrier_pre(Register Robj, RegisterOrConstant offset, Register Rpre_val,
                                          Register Rtmp1, Register Rtmp2, bool needs_frame) {
  Label runtime, filtered;

  // Is marking active?
  if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) {
    lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
  } else {
    guarantee(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption");
    lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
  }
  cmpdi(CCR0, Rtmp1, 0);
  beq(CCR0, filtered);

  // Do we need to load the previous value?
  if (Robj != noreg) {
    // Load the previous value...
    if (UseCompressedOops) {
      lwz(Rpre_val, offset, Robj);
    } else {
      ld(Rpre_val, offset, Robj);
    }
    // Previous value has been loaded into Rpre_val.
  }
  assert(Rpre_val != noreg, "must have a real register");

  // Is the previous value null?
  cmpdi(CCR0, Rpre_val, 0);
  beq(CCR0, filtered);

  if (Robj != noreg && UseCompressedOops) {
    decode_heap_oop_not_null(Rpre_val);
  }

  // OK, it's not filtered, so we'll need to call enqueue. In the normal
  // case, pre_val will be a scratch G-reg, but there are some cases in
  // which it's an O-reg. In the first case, do a normal call. In the
  // latter, do a save here and call the frameless version.

  // Can we store original value in the thread's buffer?
  // Is index == 0?
  // (The index field is typed as size_t.)
  const Register Rbuffer = Rtmp1, Rindex = Rtmp2;

  ld(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread);
  cmpdi(CCR0, Rindex, 0);
  beq(CCR0, runtime); // If index == 0, goto runtime.
  ld(Rbuffer, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_buf()), R16_thread);

  addi(Rindex, Rindex, -wordSize); // Decrement index.
  std(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread);

  // Record the previous value.
  stdx(Rpre_val, Rbuffer, Rindex);
  b(filtered);

  bind(runtime);

  // VM call need frame to access(write) O register.
  if (needs_frame) {
    save_LR_CR(Rtmp1);
    push_frame_reg_args(0, Rtmp2);
  }

  if (Rpre_val->is_volatile() && Robj == noreg) mr(R31, Rpre_val); // Save pre_val across C call if it was preloaded.
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), Rpre_val, R16_thread);
  if (Rpre_val->is_volatile() && Robj == noreg) mr(Rpre_val, R31); // restore

  if (needs_frame) {
    pop_frame();
    restore_LR_CR(Rtmp1);
  }

  bind(filtered);
}

// General G1 post-barrier generator
// Store cross-region card.
// If filtered_ext is non-NULL, the "no barrier needed" paths branch to it
// instead of the local label (allows the caller to share the exit).
void MacroAssembler::g1_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp1, Register Rtmp2, Register Rtmp3, Label *filtered_ext) {
  Label runtime, filtered_int;
  Label& filtered = (filtered_ext != NULL) ? *filtered_ext : filtered_int;
  assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2);

  G1SATBCardTableLoggingModRefBS* bs =
    barrier_set_cast<G1SATBCardTableLoggingModRefBS>(Universe::heap()->barrier_set());

  // Does store cross heap regions?
  if (G1RSBarrierRegionFilter) {
    xorr(Rtmp1, Rstore_addr, Rnew_val);
    srdi_(Rtmp1, Rtmp1, HeapRegion::LogOfHRGrainBytes);
    beq(CCR0, filtered);
  }

  // Crosses regions, storing NULL?
#ifdef ASSERT
  cmpdi(CCR0, Rnew_val, 0);
  asm_assert_ne("null oop not allowed (G1)", 0x322); // Checked by caller on PPC64, so following branch is obsolete:
  //beq(CCR0, filtered);
#endif

  // Storing region crossing non-NULL, is card already dirty?
  assert(sizeof(*bs->byte_map_base) == sizeof(jbyte), "adjust this code");
  const Register Rcard_addr = Rtmp1;
  Register Rbase = Rtmp2;
  load_const_optimized(Rbase, (address)bs->byte_map_base, /*temp*/ Rtmp3);

  srdi(Rcard_addr, Rstore_addr, CardTableModRefBS::card_shift);

  // Get the address of the card.
  lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);
  cmpwi(CCR0, Rtmp3, (int)G1SATBCardTableModRefBS::g1_young_card_val());
  beq(CCR0, filtered);

  membar(Assembler::StoreLoad);
  lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);  // Reload after membar.
  cmpwi(CCR0, Rtmp3 /* card value */, CardTableModRefBS::dirty_card_val());
  beq(CCR0, filtered);

  // Storing a region crossing, non-NULL oop, card is clean.
  // Dirty card and log.
  li(Rtmp3, CardTableModRefBS::dirty_card_val());
  //release(); // G1: oops are allowed to get visible after dirty marking.
  stbx(Rtmp3, Rbase, Rcard_addr);

  add(Rcard_addr, Rbase, Rcard_addr); // This is the address which needs to get enqueued.
  Rbase = noreg; // end of lifetime

  const Register Rqueue_index = Rtmp2,
                 Rqueue_buf   = Rtmp3;
  ld(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);
  cmpdi(CCR0, Rqueue_index, 0);
  beq(CCR0, runtime); // index == 0 then jump to runtime
  ld(Rqueue_buf, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_buf()), R16_thread);

  addi(Rqueue_index, Rqueue_index, -wordSize); // decrement index
  std(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);

  stdx(Rcard_addr, Rqueue_buf, Rqueue_index); // store card
  b(filtered);

  bind(runtime);

  // Save the live input values.
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), Rcard_addr, R16_thread);

  bind(filtered_int);
}
#endif // INCLUDE_ALL_GCS

// Values for last_Java_pc, and last_Java_sp must comply to the rules
// in frame_ppc.hpp.
void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
  // Always set last_Java_pc and flags first because once last_Java_sp
  // is visible has_last_Java_frame is true and users will look at the
  // rest of the fields. (Note: flags should always be zero before we
  // get here so doesn't need to be set.)

  // Verify that last_Java_pc was zeroed on return to Java
  asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
                          "last_Java_pc not zeroed before leaving Java", 0x200);

  // When returning from calling out from Java mode the frame anchor's
  // last_Java_pc will always be set to NULL. It is set here so that
  // if we are doing a call to native (not VM) that we capture the
  // known pc and don't have to rely on the native call having a
  // standard frame linkage where we can find the pc.
  if (last_Java_pc != noreg)
    std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);

  // Set last_Java_sp last.
  std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
}

// Clear the frame anchor: zero last_Java_sp and last_Java_pc in the thread.
void MacroAssembler::reset_last_Java_frame(void) {
  asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
                             R16_thread, "SP was not set, still zero", 0x202);

  BLOCK_COMMENT("reset_last_Java_frame {");
  li(R0, 0);

  // _last_Java_sp = 0
  std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);

  // _last_Java_pc = 0
  std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
  BLOCK_COMMENT("} reset_last_Java_frame");
}

void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
  assert_different_registers(sp, tmp1);

  // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
  // TOP_IJAVA_FRAME_ABI.
  // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
  address entry = pc();
  load_const_optimized(tmp1, entry);

  set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
}

// Fetch the thread's pending oop result and clear the field.
void MacroAssembler::get_vm_result(Register oop_result) {
  // Read:
  //   R16_thread
  //   R16_thread->in_bytes(JavaThread::vm_result_offset())
  //
  // Updated:
  //   oop_result
  //   R16_thread->in_bytes(JavaThread::vm_result_offset())

  verify_thread();

  ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
  li(R0, 0);
  std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);

  verify_oop(oop_result);
}

// Fetch the thread's pending metadata result and clear the field.
void MacroAssembler::get_vm_result_2(Register metadata_result) {
  // Read:
  //   R16_thread
  //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
  //
  // Updated:
  //   metadata_result
  //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())

  ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
  li(R0, 0);
  std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
}

// Compress a klass pointer: subtract the encoding base (if any), then shift.
// Returns the register holding the encoded value (src if no work was needed).
Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
  Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
  if (Universe::narrow_klass_base() != 0) {
    // Use dst as temp if it is free.
    sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0);
    current = dst;
  }
  if (Universe::narrow_klass_shift() != 0) {
    srdi(dst, current, Universe::narrow_klass_shift());
    current = dst;
  }
  return current;
}

// Store the klass field of dst_oop, compressed (32-bit stw) or not (64-bit std).
// ck is used as scratch for the compressed encoding.
void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
  if (UseCompressedClassPointers) {
    Register compressedKlass = encode_klass_not_null(ck, klass);
    stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
  } else {
    std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
  }
}

// Store val (or 0 if val == noreg) into the klass gap. Only needed with
// compressed class pointers, where the gap word exists.
void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
  if (UseCompressedClassPointers) {
    if (val == noreg) {
      val = R0;
      li(val, 0);
    }
    stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
  }
}

// Size in bytes of the code decode_klass_not_null will emit, for code-size
// estimates. Worst case (base != 0): shift + load const + add = 7 instructions.
int MacroAssembler::instr_size_for_decode_klass_not_null() {
  if (!UseCompressedClassPointers) return 0;
  int num_instrs = 1;  // shift or move
  if (Universe::narrow_klass_base() != 0) num_instrs = 7;  // shift + load const + add
  return num_instrs * BytesPerInstWord;
}

// Decompress a klass pointer: shift left, then add the encoding base (if any).
// If src == noreg the value is decoded in place in dst.
void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
  assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
  if (src == noreg) src = dst;
  Register shifted_src = src;
  // Precedence note: reads as shift != 0 || (base == 0 && src != dst),
  // i.e. sldi also serves as the plain move when only a move is required.
  if (Universe::narrow_klass_shift() != 0 ||
      Universe::narrow_klass_base() == 0 && src != dst) {  // Move required.
    shifted_src = dst;
    sldi(shifted_src, src, Universe::narrow_klass_shift());
  }
  if (Universe::narrow_klass_base() != 0) {
    add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0);
  }
}

void MacroAssembler::load_klass(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    lwz(dst, oopDesc::klass_offset_in_bytes(), src);
    // Attention: no null check here!
    decode_klass_not_null(dst, dst);
  } else {
    ld(dst, oopDesc::klass_offset_in_bytes(), src);
  }
}

// Clear Array
// Kills both input registers. tmp == R0 is allowed.
// base_ptr must be 8-byte aligned; cnt_dwords is the count of 8-byte words.
void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp) {
  // Procedure for large arrays (uses data cache block zero instruction).
  Label startloop, fast, fastloop, small_rest, restloop, done;
  const int cl_size         = VM_Version::L1_data_cache_line_size(),
            cl_dwords       = cl_size>>3,
            cl_dw_addr_bits = exact_log2(cl_dwords),
            dcbz_min        = 1;  // Min count of dcbz executions, needs to be >0.

  //2:
  cmpdi(CCR1, cnt_dwords, ((dcbz_min+1)<<cl_dw_addr_bits)-1); // Big enough? (ensure >=dcbz_min lines included).
  blt(CCR1, small_rest);                                      // Too small.
  rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits);           // Extract dword offset within first cache line.
  beq(CCR0, fast);                                            // Already 128byte aligned.

  subfic(tmp, tmp, cl_dwords);
  mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
  subf(cnt_dwords, tmp, cnt_dwords); // rest.
  li(tmp, 0);
  //10:
  bind(startloop);                   // Clear at the beginning to reach 128byte boundary.
  std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
  addi(base_ptr, base_ptr, 8);
  bdnz(startloop);
  //13:
  bind(fast);                                  // Clear 128byte blocks.
  srdi(tmp, cnt_dwords, cl_dw_addr_bits);      // Loop count for 128byte loop (>0).
  andi(cnt_dwords, cnt_dwords, cl_dwords-1);   // Rest in dwords.
  mtctr(tmp);                                  // Load counter.
  //16:
  bind(fastloop);
  dcbz(base_ptr);                    // Clear 128byte aligned block.
  addi(base_ptr, base_ptr, cl_size);
  bdnz(fastloop);
  if (InsertEndGroupPPC64) { endgroup(); } else { nop(); }
  //20:
  bind(small_rest);
  cmpdi(CCR0, cnt_dwords, 0);        // size 0?
  beq(CCR0, done);                   // rest == 0
  li(tmp, 0);
  mtctr(cnt_dwords);                 // Load counter.
  //24:
  bind(restloop);                    // Clear rest.
  std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
  addi(base_ptr, base_ptr, 8);
  bdnz(restloop);
  //27:
  bind(done);
}

/////////////////////////////////////////// String intrinsics ////////////////////////////////////////////

// Search for a single jchar in a jchar[].
//
// Assumes that result differs from all other registers.
//
// 'haystack' is the address of a jchar-array.
// 'needle' is either the character to search for or R0.
// 'needleChar' is the character to search for if 'needle' == R0.
// 'haycnt' is the length of the haystack. We assume 'haycnt' >=1.
//
// Preserves haystack, haycnt, needle and kills all other registers.
//
// If needle == R0, we search for the constant needleChar.
void MacroAssembler::string_indexof_1(Register result, Register haystack, Register haycnt,
                                      Register needle, jchar needleChar,
                                      Register tmp1, Register tmp2) {

  assert_different_registers(result, haystack, haycnt, needle, tmp1, tmp2);

  Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_Found3, L_NotFound, L_End;
  Register addr = tmp1,
           ch1 = tmp2,
           ch2 = R0;

  //3:
  dcbtct(haystack, 0x00);  // Indicate R/O access to haystack.

  srwi_(tmp2, haycnt, 1);  // Shift right by exact_log2(UNROLL_FACTOR).
  mr(addr, haystack);
  beq(CCR0, L_FinalCheck); // haycnt < 2: only the final single compare.
  mtctr(tmp2);             // Move to count register.
  //8:
  bind(L_InnerLoop);       // Main work horse (2x unrolled search loop).
  lhz(ch1, 0, addr);       // Load characters from haystack.
  lhz(ch2, 2, addr);
  (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, needleChar);
  (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, needleChar);
  beq(CCR0, L_Found1);     // Did we find the needle?
  beq(CCR1, L_Found2);
  addi(addr, addr, 4);
  bdnz(L_InnerLoop);
  //16:
  bind(L_FinalCheck);
  andi_(R0, haycnt, 1);    // Odd haycnt: one trailing character remains.
  beq(CCR0, L_NotFound);
  lhz(ch1, 0, addr);       // One position left at which we have to compare.
  (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, needleChar);
  beq(CCR1, L_Found3);
  //21:
  bind(L_NotFound);
  li(result, -1);          // Not found.
  b(L_End);

  bind(L_Found2);
  addi(addr, addr, 2);
  //24:
  bind(L_Found1);
  bind(L_Found3);                  // Return index ...
  subf(addr, haystack, addr);      // relative to haystack,
  srdi(result, addr, 1);           // in characters.
  bind(L_End);
}


// Implementation of IndexOf for jchar arrays.
//
// The length of haystack and needle are not constant, i.e. passed in a register.
//
// Preserves registers haystack, needle.
// Kills registers haycnt, needlecnt.
// Assumes that result differs from all other registers.
// Haystack, needle are the addresses of jchar-arrays.
// Haycnt, needlecnt are the lengths of them, respectively.
//
// Needlecntval must be zero or 15-bit unsigned immediate and > 1.
void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
                                    Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
                                    Register tmp1, Register tmp2, Register tmp3, Register tmp4) {

  // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!
  Label L_TooShort, L_Found, L_NotFound, L_End;
  Register last_addr = haycnt, // Kill haycnt at the beginning.
           addr      = tmp1,
           n_start   = tmp2,
           ch1       = tmp3,
           ch2       = R0;

  // **************************************************************************************************
  // Prepare for main loop: optimized for needle count >=2, bail out otherwise.
  // **************************************************************************************************

  //1 (variable) or 3 (const):
  dcbtct(needle, 0x00);    // Indicate R/O access to str1.
  dcbtct(haystack, 0x00);  // Indicate R/O access to str2.

  // Compute last haystack addr to use if no match gets found.
  if (needlecntval == 0) { // variable needlecnt
    //3:
    subf(ch1, needlecnt, haycnt);      // Last character index to compare is haycnt-needlecnt.
    addi(addr, haystack, -2);          // Accesses use pre-increment.
    cmpwi(CCR6, needlecnt, 2);
    blt(CCR6, L_TooShort);             // Variable needlecnt: handle short needle separately.
    slwi(ch1, ch1, 1);                 // Scale to number of bytes.
    lwz(n_start, 0, needle);           // Load first 2 characters of needle.
    add(last_addr, haystack, ch1);     // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
    addi(needlecnt, needlecnt, -2);    // Rest of needle.
  } else { // constant needlecnt
    guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
    assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
    //5:
    addi(ch1, haycnt, -needlecntval);  // Last character index to compare is haycnt-needlecnt.
    lwz(n_start, 0, needle);           // Load first 2 characters of needle.
    addi(addr, haystack, -2);          // Accesses use pre-increment.
    slwi(ch1, ch1, 1);                 // Scale to number of bytes.
    add(last_addr, haystack, ch1);     // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
    li(needlecnt, needlecntval-2);     // Rest of needle.
  }

  // Main Loop (now we have at least 3 characters).
  //11:
  Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2, L_Comp3;
  bind(L_OuterLoop); // Search for 1st 2 characters.
  Register addr_diff = tmp4;
  subf(addr_diff, addr, last_addr); // Difference between already checked address and last address to check.
  addi(addr, addr, 2);              // This is the new address we want to use for comparing.
  srdi_(ch2, addr_diff, 2);
  beq(CCR0, L_FinalCheck);          // 2 characters left?
  mtctr(ch2);                       // addr_diff/4
  //16:
  bind(L_InnerLoop);                // Main work horse (2x unrolled search loop)
  lwz(ch1, 0, addr);                // Load 2 characters of haystack (ignore alignment).
  lwz(ch2, 2, addr);
  cmpw(CCR0, ch1, n_start);         // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).
  cmpw(CCR1, ch2, n_start);
  beq(CCR0, L_Comp1);               // Did we find the needle start?
  beq(CCR1, L_Comp2);
  addi(addr, addr, 4);
  bdnz(L_InnerLoop);
  //24:
  bind(L_FinalCheck);
  rldicl_(addr_diff, addr_diff, 64-1, 63); // Remaining characters not covered by InnerLoop: (addr_diff>>1)&1.
  beq(CCR0, L_NotFound);
  lwz(ch1, 0, addr);                // One position left at which we have to compare.
  cmpw(CCR1, ch1, n_start);
  beq(CCR1, L_Comp3);
  //29:
  bind(L_NotFound);
  li(result, -1);                   // not found
  b(L_End);


  // **************************************************************************************************
  // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
  // **************************************************************************************************
  //31:
  if ((needlecntval>>1) !=1 ) { // Const needlecnt is 2 or 3? Reduce code size.
    int nopcnt = 5;
    if (needlecntval !=0 ) ++nopcnt; // Balance alignment (other case: see below).
    if (needlecntval == 0) {         // We have to handle these cases separately.
      Label L_OneCharLoop;
      bind(L_TooShort);
      mtctr(haycnt);
      lhz(n_start, 0, needle);    // First character of needle
      bind(L_OneCharLoop);
      lhzu(ch1, 2, addr);
      cmpw(CCR1, ch1, n_start);
      beq(CCR1, L_Found);         // Did we find the one character needle?
      bdnz(L_OneCharLoop);
      li(result, -1);             // Not found.
      b(L_End);
    } // 8 instructions, so no impact on alignment.
    for (int x = 0; x < nopcnt; ++x) nop();
  }

  // **************************************************************************************************
  // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
  // **************************************************************************************************

  // Compare the rest
  //36 if needlecntval==0, else 37:
  bind(L_Comp2);
  addi(addr, addr, 2); // First comparison has failed, 2nd one hit.
  bind(L_Comp1);       // Addr points to possible needle start.
  bind(L_Comp3);       // Could have created a copy and use a different return address but saving code size here.
  if (needlecntval != 2) { // Const needlecnt==2?
    if (needlecntval != 3) {
      if (needlecntval == 0) beq(CCR6, L_Found); // Variable needlecnt==2?
      Register ind_reg = tmp4;
      li(ind_reg, 2*2);   // First 2 characters are already compared, use index 2.
      mtctr(needlecnt);   // Decremented by 2, still > 0.
      //40:
      Label L_CompLoop;
      bind(L_CompLoop);
      lhzx(ch2, needle, ind_reg);
      lhzx(ch1, addr, ind_reg);
      cmpw(CCR1, ch1, ch2);
      bne(CCR1, L_OuterLoop); // Mismatch: resume searching for the needle start.
      addi(ind_reg, ind_reg, 2);
      bdnz(L_CompLoop);
    } else { // No loop required if there's only one needle character left.
      lhz(ch2, 2*2, needle);
      lhz(ch1, 2*2, addr);
      cmpw(CCR1, ch1, ch2);
      bne(CCR1, L_OuterLoop);
    }
  }
  // Return index ...
  //46:
  bind(L_Found);
  subf(addr, haystack, addr); // relative to haystack, ...
  srdi(result, addr, 1);      // in characters.
  //48:
  bind(L_End);
}


// Implementation of Compare for jchar arrays.
//
// Kills the registers str1, str2, cnt1, cnt2.
// Kills cr0, ctr.
// Assumes that result differs from the input registers.
// Emits code comparing two strings lexicographically (java.lang.String.compareTo semantics):
// result < 0, == 0, > 0 for str1 < str2, str1 == str2, str1 > str2.
// cnt1/cnt2 hold char counts (pre-CompactStrings) or byte counts (CompactStrings).
void MacroAssembler::string_compare(Register str1_reg, Register str2_reg, Register cnt1_reg, Register cnt2_reg,
                                    Register result_reg, Register tmp_reg) {
  assert_different_registers(result_reg, str1_reg, str2_reg, cnt1_reg, cnt2_reg, tmp_reg);

  Label Ldone, Lslow_case, Lslow_loop, Lfast_loop;
  // Register shortage: inputs are reused as scratch once their original value is consumed.
  Register cnt_diff = R0,
           limit_reg = cnt1_reg,
           chr1_reg = result_reg,
           chr2_reg = cnt2_reg,
           addr_diff = str2_reg;

  // 'cnt_reg' contains the number of characters in the string's character array for the
  // pre-CompactStrings strings implementation and the number of bytes in the string's
  // byte array for the CompactStrings strings implementation.
  const int HAS_COMPACT_STRING = java_lang_String::has_coder_field() ? 1 : 0; // '1' = byte array, '0' = char array

  // Offset 0 should be 32 byte aligned.
  //-6:
  // Scale byte counts down to character counts for CompactStrings (shift by 0 is a no-op otherwise).
  srawi(cnt1_reg, cnt1_reg, HAS_COMPACT_STRING);
  srawi(cnt2_reg, cnt2_reg, HAS_COMPACT_STRING);
  //-4:
  dcbtct(str1_reg, 0x00);                 // Indicate R/O access to str1.
  dcbtct(str2_reg, 0x00);                 // Indicate R/O access to str2.
  //-2:
  // Compute min(cnt1, cnt2) and check if 0 (bail out if we don't need to compare characters).
  subf(result_reg, cnt2_reg, cnt1_reg);   // difference between cnt1/2
  subf_(addr_diff, str1_reg, str2_reg);   // alias?
  beq(CCR0, Ldone);                       // return cnt difference if both ones are identical
  srawi(limit_reg, result_reg, 31);       // generate signmask (cnt1/2 must be non-negative so cnt_diff can't overflow)
  mr(cnt_diff, result_reg);
  andr(limit_reg, result_reg, limit_reg); // difference or zero (negative): cnt1<cnt2 ? cnt1-cnt2 : 0
  add_(limit_reg, cnt2_reg, limit_reg);   // min(cnt1, cnt2)==0?
  beq(CCR0, Ldone);                       // return cnt difference if one has 0 length

  lhz(chr1_reg, 0, str1_reg);             // optional: early out if first characters mismatch
  lhzx(chr2_reg, str1_reg, addr_diff);    // optional: early out if first characters mismatch
  addi(tmp_reg, limit_reg, -1);           // min(cnt1, cnt2)-1
  subf_(result_reg, chr2_reg, chr1_reg);  // optional: early out if first characters mismatch
  bne(CCR0, Ldone);                       // optional: early out if first characters mismatch

  // Set loop counter by scaling down tmp_reg
  srawi_(chr2_reg, tmp_reg, exact_log2(4)); // (min(cnt1, cnt2)-1)/4
  ble(CCR0, Lslow_case);                  // need >4 characters for fast loop
  andi(limit_reg, tmp_reg, 4-1);          // remaining characters

  // Adapt str1_reg str2_reg for the first loop iteration
  mtctr(chr2_reg);                        // (min(cnt1, cnt2)-1)/4
  addi(limit_reg, limit_reg, 4+1);        // compare last 5-8 characters in slow_case if mismatch found in fast_loop
  //16:
  // Compare the rest of the characters
  bind(Lfast_loop);
  ld(chr1_reg, 0, str1_reg);              // 4 characters (8 bytes) per iteration.
  ldx(chr2_reg, str1_reg, addr_diff);
  cmpd(CCR0, chr2_reg, chr1_reg);
  bne(CCR0, Lslow_case);                  // return chr1_reg
  addi(str1_reg, str1_reg, 4*2);
  bdnz(Lfast_loop);
  addi(limit_reg, limit_reg, -4);         // no mismatch found in fast_loop, only 1-4 characters missing
  //23:
  bind(Lslow_case);
  mtctr(limit_reg);
  //24:
  bind(Lslow_loop);                       // One character per iteration; locates the exact mismatch position.
  lhz(chr1_reg, 0, str1_reg);
  lhzx(chr2_reg, str1_reg, addr_diff);
  subf_(result_reg, chr2_reg, chr1_reg);
  bne(CCR0, Ldone);                       // return chr1_reg
  addi(str1_reg, str1_reg, 1*2);
  bdnz(Lslow_loop);
  //30:
  // If strings are equal up to min length, return the length difference.
  mr(result_reg, cnt_diff);
  nop();                                  // alignment
  //32:
  // Otherwise, return the difference between the first mismatched chars.
  bind(Ldone);
}


// Compare char[] arrays.
//
// str1_reg   USE only
// str2_reg   USE only
// cnt_reg    USE_DEF, due to tmp reg shortage
// result_reg DEF only, might compromise USE only registers
// Sets result_reg to 1 if the arrays are element-wise equal, 0 otherwise.
void MacroAssembler::char_arrays_equals(Register str1_reg, Register str2_reg, Register cnt_reg, Register result_reg,
                                        Register tmp1_reg, Register tmp2_reg, Register tmp3_reg, Register tmp4_reg,
                                        Register tmp5_reg) {

  // Str1 may be the same register as str2 which can occur e.g. after scalar replacement.
  assert_different_registers(result_reg, str1_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg);
  assert_different_registers(result_reg, str2_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg);

  // Offset 0 should be 32 byte aligned.
  Label Linit_cbc, Lcbc, Lloop, Ldone_true, Ldone_false;
  Register index_reg = tmp5_reg;
  Register cbc_iter  = tmp4_reg;

  // 'cnt_reg' contains the number of characters in the string's character array for the
  // pre-CompactStrings strings implementation and the number of bytes in the string's
  // byte array for the CompactStrings strings implementation.
  const int HAS_COMPACT_STRING = java_lang_String::has_coder_field() ? 1 : 0; // '1' = byte array, '0' = char array

  //-1:
  dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
  dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
  //1:
  // cbc_iter: remaining characters after the '4 java characters per iteration' loop.
  rlwinm(cbc_iter, cnt_reg, 32 - HAS_COMPACT_STRING, 30, 31); // (cnt_reg % (HAS_COMPACT_STRING ? 8 : 4)) >> HAS_COMPACT_STRING
  li(index_reg, 0); // init
  li(result_reg, 0); // assume false
  // tmp2_reg: units of 4 java characters (i.e. 8 bytes) per iteration (main loop).
  srwi_(tmp2_reg, cnt_reg, exact_log2(4 << HAS_COMPACT_STRING)); // cnt_reg / (HAS_COMPACT_STRING ? 8 : 4)

  cmpwi(CCR1, cbc_iter, 0);             // CCR1 = (cbc_iter==0)
  beq(CCR0, Linit_cbc);                 // too short
    mtctr(tmp2_reg);
  //8:
    bind(Lloop);                        // Main loop: compare 4 characters (one doubleword) at a time.
      ldx(tmp1_reg, str1_reg, index_reg);
      ldx(tmp2_reg, str2_reg, index_reg);
      cmpd(CCR0, tmp1_reg, tmp2_reg);
      bne(CCR0, Ldone_false);           // Unequal char pair found -> done.
      addi(index_reg, index_reg, 4*sizeof(jchar));
      bdnz(Lloop);
  //14:
  bind(Linit_cbc);
  beq(CCR1, Ldone_true);                // No tail characters left -> arrays are equal.
    mtctr(cbc_iter);
  //16:
    bind(Lcbc);                         // Tail loop: compare the remaining 1-3 characters one at a time.
      lhzx(tmp1_reg, str1_reg, index_reg);
      lhzx(tmp2_reg, str2_reg, index_reg);
      cmpw(CCR0, tmp1_reg, tmp2_reg);
      bne(CCR0, Ldone_false);           // Unequal char pair found -> done.
      addi(index_reg, index_reg, 1*sizeof(jchar));
      bdnz(Lcbc);
    nop();
  bind(Ldone_true);
  li(result_reg, 1);
  //24:
  bind(Ldone_false);
}


// Compare char[] (or CompactStrings byte[]) arrays whose length is the compile-time
// constant 'cntval'. Sets result_reg to 1 if equal, 0 otherwise.
void MacroAssembler::char_arrays_equalsImm(Register str1_reg, Register str2_reg, int cntval, Register result_reg,
                                           Register tmp1_reg, Register tmp2_reg) {
  // Str1 may be the same register as str2 which can occur e.g. after scalar replacement.
  assert_different_registers(result_reg, str1_reg, tmp1_reg, tmp2_reg);
  assert_different_registers(result_reg, str2_reg, tmp1_reg, tmp2_reg);
  assert(sizeof(jchar) == 2, "must be");
  assert(cntval >= 0 && ((cntval & 0x7fff) == cntval), "wrong immediate");

  // 'cntval' contains the number of characters in the string's character array for the
  // pre-CompactStrings strings implementation and the number of bytes in the string's
  // byte array for the CompactStrings strings implementation.
  cntval >>= (java_lang_String::has_coder_field() ? 1 : 0); // '1' = byte array strings, '0' = char array strings

  Label Ldone_false;

  if (cntval < 16) { // short case: fully unrolled comparison, no loop.
    if (cntval != 0) li(result_reg, 0); // assume false

    // Compare in 8-byte (4-char) chunks as far as possible, ...
    const int num_bytes = cntval*sizeof(jchar);
    int index = 0;
    for (int next_index; (next_index = index + 8) <= num_bytes; index = next_index) {
      ld(tmp1_reg, index, str1_reg);
      ld(tmp2_reg, index, str2_reg);
      cmpd(CCR0, tmp1_reg, tmp2_reg);
      bne(CCR0, Ldone_false);
    }
    if (cntval & 2) { // ... then one 4-byte (2-char) chunk, ...
      lwz(tmp1_reg, index, str1_reg);
      lwz(tmp2_reg, index, str2_reg);
      cmpw(CCR0, tmp1_reg, tmp2_reg);
      bne(CCR0, Ldone_false);
      index += 4;
    }
    if (cntval & 1) { // ... then the last single character.
      lhz(tmp1_reg, index, str1_reg);
      lhz(tmp2_reg, index, str2_reg);
      cmpw(CCR0, tmp1_reg, tmp2_reg);
      bne(CCR0, Ldone_false);
    }
    // fallthrough: true
  } else {
    Label Lloop;
    Register index_reg = tmp1_reg;
    const int loopcnt = cntval/4;
    assert(loopcnt > 0, "must be");
    // Offset 0 should be 32 byte aligned.
    //2:
    dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
    dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
    li(tmp2_reg, loopcnt);
    li(index_reg, 0); // init
    li(result_reg, 0); // assume false
    mtctr(tmp2_reg);
    //8:
    bind(Lloop);  // Compare 4 characters (one doubleword) per iteration.
    ldx(R0, str1_reg, index_reg);
    ldx(tmp2_reg, str2_reg, index_reg);
    cmpd(CCR0, R0, tmp2_reg);
    bne(CCR0, Ldone_false); // Unequal char pair found -> done.
    addi(index_reg, index_reg, 4*sizeof(jchar));
    bdnz(Lloop);
    //14:
    if (cntval & 2) { // 2 tail characters.
      lwzx(R0, str1_reg, index_reg);
      lwzx(tmp2_reg, str2_reg, index_reg);
      cmpw(CCR0, R0, tmp2_reg);
      bne(CCR0, Ldone_false);
      if (cntval & 1) addi(index_reg, index_reg, 2*sizeof(jchar));
    }
    if (cntval & 1) { // 1 tail character.
      lhzx(R0, str1_reg, index_reg);
      lhzx(tmp2_reg, str2_reg, index_reg);
      cmpw(CCR0, R0, tmp2_reg);
      bne(CCR0, Ldone_false);
    }
    // fallthru: true
  }
  li(result_reg, 1);
  bind(Ldone_false);
}

// Helpers for Intrinsic Emitters
//
// Revert the byte order of a 32bit value in a register
//   src: 0x44556677
//   dst: 0x77665544
// Three steps to obtain the result:
//  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
//     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
//     This value initializes dst.
//  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
//     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
//     This value is mask inserted into dst with a [0..23] mask of 1s.
//  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
//     This value is mask inserted into dst with a [8..15] mask of 1s.
void MacroAssembler::load_reverse_32(Register dst, Register src) {
  assert_different_registers(dst, src);

  rldicl(dst, src, (4+1)*8, 56);  // Rotate byte 4 into position 7 (rightmost), clear all to the left.
  rlwimi(dst, src, 3*8, 0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
  rlwimi(dst, src, 1*8, 8, 15);   // Insert byte 6 into position 5, leave the rest alone.
}

// Calculate the column addresses of the crc32 lookup table into distinct registers.
// This loop-invariant calculation is moved out of the loop body, reducing the loop
// body size from 20 to 16 instructions.
// Returns the offset that was used to calculate the address of column tc3.
// Due to register shortage, setting tc3 may overwrite table. With the return offset
// at hand, the original table address can be easily reconstructed.
int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {

#ifdef VM_LITTLE_ENDIAN
  // This is what we implement (the DOLIT4 part):
  // =========================================================================
  // #define DOLIT4 c ^= *buf4++; \
  //         c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
  //             crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
  // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
  // =========================================================================
  const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
  const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
  const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
  const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
#else
  // This is what we implement (the DOBIG4 part):
  // =========================================================================
  // #define DOBIG4 c ^= *++buf4; \
  //         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
  //             crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
  // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
  // =========================================================================
  const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
  const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
  const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
  const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
#endif
  assert_different_registers(table, tc0, tc1, tc2);
  assert(table == tc3, "must be!");

  addi(tc0, table, ix0);
  addi(tc1, table, ix1);
  addi(tc2, table, ix2);
  if (ix3 != 0) addi(tc3, table, ix3);  // tc3 aliases table; skip the no-op add when the offset is 0.

  return ix3;
}

/**
 * uint32_t crc;
 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
 *
 * Folds the low byte of 'val' into 'crc' via a table lookup; 'tmp' is clobbered.
 */
void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
  assert_different_registers(crc, table, tmp);
  assert_different_registers(val, table);

  if (crc == val) {                    // Must rotate first to use the unmodified value.
    rlwinm(tmp, val, 2, 24-2, 31-2);   // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
    srwi(crc, crc, 8);                 // Unsigned shift, clear leftmost 8 bits.
  } else {
    srwi(crc, crc, 8);                 // Unsigned shift, clear leftmost 8 bits.
    rlwinm(tmp, val, 2, 24-2, 31-2);   // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
  }
  lwzx(tmp, table, tmp);               // tmp holds the byte-scaled table index.
  xorr(crc, crc, tmp);
}

/**
 * uint32_t crc;
 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
 *
 * Convenience wrapper: folds crc's own low byte back into crc.
 */
void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
  fold_byte_crc32(crc, crc, table, tmp);
}

/**
 * Emits code to update CRC-32 with a byte value according to constants in table.
 *
 * @param [in,out]crc Register containing the crc.
 * @param [in]val     Register containing the byte to fold into the CRC.
 * @param [in]table   Register containing the table of crc constants.
 *
 * uint32_t crc;
 * val = crc_table[(val ^ crc) & 0xFF];
 * crc = val ^ (crc >> 8);
 */
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  BLOCK_COMMENT("update_byte_crc32:");
  xorr(val, val, crc);                  // val is clobbered; it doubles as the tmp register below.
  fold_byte_crc32(crc, val, table, val);
}

/**
 * Byte-at-a-time CRC32 loop over a buffer.
 *
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register pointing to CRC table
 * @param data  scratch register holding the current byte
 * @param loopAlignment align the loop head to 32 bytes (true) or 4 bytes (false)
 * @param invertCRC complement crc on entry and exit (standard CRC32 pre/post-conditioning)
 */
void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
                                           Register data, bool loopAlignment, bool invertCRC) {
  assert_different_registers(crc, buf, len, table, data);

  Label L_mainLoop, L_done;
  const int mainLoop_stepping  = 1;
  const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;

  // Process all bytes in a single-byte loop.
  clrldi_(len, len, 32);                // Enforce 32 bit. Anything to do?
  beq(CCR0, L_done);                    // len == 0: nothing to do.

  if (invertCRC) {
    nand(crc, crc, crc);                // ~c
  }

  mtctr(len);
  align(mainLoop_alignment);
  BIND(L_mainLoop);
    lbz(data, 0, buf);                  // Byte from buffer, zero-extended.
    addi(buf, buf, mainLoop_stepping);  // Advance buffer position.
    update_byte_crc32(crc, data, table);
    bdnz(L_mainLoop);                   // Iterate.

  if (invertCRC) {
    nand(crc, crc, crc);                // ~c
  }

  bind(L_done);
}

/**
 * Emits code to update CRC-32 with a 4-byte value according to constants in table
 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
 */
// A note on the lookup table address(es):
// The lookup table consists of two sets of four columns each.
// The columns {0..3} are used for little-endian machines.
// The columns {4..7} are used for big-endian machines.
// To save the effort of adding the column offset to the table address each time
// a table element is looked up, it is possible to pass the pre-calculated
// column addresses.
// Uses R9..R12 as work register. Must be saved/restored by caller, if necessary.
void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
                                        Register t0, Register t1, Register t2, Register t3,
                                        Register tc0, Register tc1, Register tc2, Register tc3) {
  assert_different_registers(crc, t3);

  // XOR crc with next four bytes of buffer.
  lwz(t3, bufDisp, buf);
  if (bufInc != 0) {
    addi(buf, buf, bufInc);
  }
  xorr(t3, t3, crc);

  // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
  rlwinm(t0, t3, 2, 24-2, 31-2);          // ((t1 >>  0) & 0xff) << 2
  rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2);  // ((t1 >>  8) & 0xff) << 2
  rlwinm(t2, t3, 32+(2-16), 24-2, 31-2);  // ((t1 >> 16) & 0xff) << 2
  rlwinm(t3, t3, 32+(2-24), 24-2, 31-2);  // ((t1 >> 24) & 0xff) << 2

  // Use the pre-calculated column addresses.
  // Load pre-calculated table values.
  lwzx(t0, tc0, t0);
  lwzx(t1, tc1, t1);
  lwzx(t2, tc2, t2);
  lwzx(t3, tc3, t3);

  // Calculate new crc from table values.
  xorr(t0, t0, t1);
  xorr(t2, t2, t3);
  xorr(crc, t0, t2);  // Now crc contains the final checksum value.
}

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register pointing to CRC table
 *
 * Uses R9..R12 as work register. Must be saved/restored by caller!
 */
// Processes the buffer two words (8 bytes) per main-loop iteration.
void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
                                        Register t0, Register t1, Register t2, Register t3,
                                        Register tc0, Register tc1, Register tc2, Register tc3) {
  assert_different_registers(crc, buf, len, table);

  Label L_mainLoop, L_tail;
  Register tmp  = t0;
  Register data = t0;
  Register tmp2 = t1;
  const int mainLoop_stepping  = 8;
  const int tailLoop_stepping  = 1;
  const int log_stepping       = exact_log2(mainLoop_stepping);
  const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
  const int complexThreshold   = 2*mainLoop_stepping;

  // Don't test for len <= 0 here. This pathological case should not occur anyway.
  // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
  // The situation itself is detected and handled correctly by the conditional branches
  // following  aghi(len, -stepping) and aghi(len, +stepping).
  assert(tailLoop_stepping == 1, "check tailLoop_stepping!");

  BLOCK_COMMENT("kernel_crc32_2word {");

  nand(crc, crc, crc);                           // ~c  (CRC32 pre-conditioning)

  // Check for short (<mainLoop_stepping) buffer.
  cmpdi(CCR0, len, complexThreshold);
  blt(CCR0, L_tail);

  // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
  // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
  {
    // Align buf addr to mainLoop_stepping boundary.
    neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
    rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.

    if (complexThreshold > mainLoop_stepping) {
      sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
    } else {
      sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
      cmpdi(CCR0, tmp, mainLoop_stepping);
      blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
      mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
    }
    update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
  }

  srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
  andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
  mtctr(tmp2);

#ifdef VM_LITTLE_ENDIAN
  Register crc_rv = crc;
#else
  Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
                                                 // Occupies tmp, but frees up crc.
  load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
  tmp = crc;
#endif

  int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);

  align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
  BIND(L_mainLoop);
    update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
    update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
    bdnz(L_mainLoop);

#ifndef VM_LITTLE_ENDIAN
  load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
  tmp = crc_rv;                                  // Tmp uses its original register again.
#endif

  // Restore original table address for tailLoop.
  if (reconstructTableOffset != 0) {
    addi(table, table, -reconstructTableOffset);
  }

  // Process last few (<complexThreshold) bytes of buffer.
  BIND(L_tail);
  update_byteLoop_crc32(crc, buf, len, table, data, false, false);

  nand(crc, crc, crc);                           // ~c  (CRC32 post-conditioning)
  BLOCK_COMMENT("} kernel_crc32_2word");
}

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register pointing to CRC table
 *
 * uses R9..R12 as work register. Must be saved/restored by caller!
 */
// Processes the buffer one word (4 bytes) per main-loop iteration.
void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
                                        Register t0, Register t1, Register t2, Register t3,
                                        Register tc0, Register tc1, Register tc2, Register tc3) {
  assert_different_registers(crc, buf, len, table);

  Label L_mainLoop, L_tail;
  Register tmp  = t0;
  Register data = t0;
  Register tmp2 = t1;
  const int mainLoop_stepping  = 4;
  const int tailLoop_stepping  = 1;
  const int log_stepping       = exact_log2(mainLoop_stepping);
  const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
  const int complexThreshold   = 2*mainLoop_stepping;

  // Don't test for len <= 0 here. This pathological case should not occur anyway.
  // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
  // The situation itself is detected and handled correctly by the conditional branches
  // following  aghi(len, -stepping) and aghi(len, +stepping).
  assert(tailLoop_stepping == 1, "check tailLoop_stepping!");

  BLOCK_COMMENT("kernel_crc32_1word {");

  nand(crc, crc, crc);                           // ~c  (CRC32 pre-conditioning)

  // Check for short (<mainLoop_stepping) buffer.
  cmpdi(CCR0, len, complexThreshold);
  blt(CCR0, L_tail);

  // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
  // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
  {
    // Align buf addr to mainLoop_stepping boundary.
    neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
    rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.

    if (complexThreshold > mainLoop_stepping) {
      sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
    } else {
      sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
      cmpdi(CCR0, tmp, mainLoop_stepping);
      blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
      mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
    }
    update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
  }

  srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
  andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
  mtctr(tmp2);

#ifdef VM_LITTLE_ENDIAN
  Register crc_rv = crc;
#else
  Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
                                                 // Occupies tmp, but frees up crc.
  load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
  tmp = crc;
#endif

  int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);

  align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
  BIND(L_mainLoop);
    update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
    bdnz(L_mainLoop);

#ifndef VM_LITTLE_ENDIAN
  load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
  tmp = crc_rv;                                  // Tmp uses its original register again.
#endif

  // Restore original table address for tailLoop.
  if (reconstructTableOffset != 0) {
    addi(table, table, -reconstructTableOffset);
  }

  // Process last few (<complexThreshold) bytes of buffer.
  BIND(L_tail);
  update_byteLoop_crc32(crc, buf, len, table, data, false, false);

  nand(crc, crc, crc);                           // ~c  (CRC32 post-conditioning)
  BLOCK_COMMENT("} kernel_crc32_1word");
}

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register pointing to CRC table
 *
 * Uses R7_ARG5, R8_ARG6 as work registers.
 */
void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
                                        Register t0, Register t1, Register t2, Register t3) {
  assert_different_registers(crc, buf, len, table);

  Register data = t0;                            // Holds the current byte to be folded into crc.

  BLOCK_COMMENT("kernel_crc32_1byte {");

  // Process all bytes in a single-byte loop.
  update_byteLoop_crc32(crc, buf, len, table, data, true, true);

  BLOCK_COMMENT("} kernel_crc32_1byte");
}

// Folds exactly one byte (at *buf) into crc; 'len' is unused.
void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp) {
  assert_different_registers(crc, buf, /* len,  not used!! */ table, tmp);

  BLOCK_COMMENT("kernel_crc32_singleByte:");
  nand(crc, crc, crc);                           // ~c  (CRC32 pre-conditioning)

  lbz(tmp, 0, buf);                              // Byte from buffer, zero-extended.
  update_byte_crc32(crc, tmp, table);

  nand(crc, crc, crc);                           // ~c  (CRC32 post-conditioning)
}

// dest_lo += src1 + src2
// dest_hi += carry1 + carry2
// 128-bit accumulate helper for the BigInteger.multiplyToLen intrinsic. Clobbers R0.
void MacroAssembler::add2_with_carry(Register dest_hi,
                                     Register dest_lo,
                                     Register src1, Register src2) {
  li(R0, 0);
  addc(dest_lo, dest_lo, src1);  // addc/adde propagate the carry into the high word.
  adde(dest_hi, dest_hi, R0);
  addc(dest_lo, dest_lo, src2);
  adde(dest_hi, dest_hi, R0);
}

// Multiply 64 bit by 64 bit first loop.
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
                                           Register x_xstart,
                                           Register y, Register y_idx,
                                           Register z,
                                           Register carry,
                                           Register product_high, Register product,
                                           Register idx, Register kdx,
                                           Register tmp) {
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  addic_(xstart, xstart, -1);
  blt(CCR0, L_one_x);                    // Special case: length of x is 1.

  // Load next two integers of x.
  sldi(tmp, xstart, LogBytesPerInt);
  ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(x_xstart, x_xstart, 32, 0);     // Swap word halves: ints are stored big-endian-style within a jlong.
#endif

  align(32, 16);
  bind(L_first_loop);

  cmpdi(CCR0, idx, 1);
  blt(CCR0, L_first_loop_exit);
  addi(idx, idx, -2);
  beq(CCR0, L_one_y);

  // Load next two integers of y.
  sldi(tmp, idx, LogBytesPerInt);
  ldx(y_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(y_idx, y_idx, 32, 0);
#endif


  bind(L_multiply);
  multiply64(product_high, product, x_xstart, y_idx);

  li(tmp, 0);
  addc(product, product, carry);         // Add carry to result.
  adde(product_high, product_high, tmp); // Add carry of the last addition.
  addi(kdx, kdx, -2);

  // Store result.
#ifdef VM_LITTLE_ENDIAN
  rldicl(product, product, 32, 0);
#endif
  sldi(tmp, kdx, LogBytesPerInt);
  stdx(product, z, tmp);
  mr_if_needed(carry, product_high);
  b(L_first_loop);


  bind(L_one_y);                         // Load one 32 bit portion of y as (0,value).

  lwz(y_idx, 0, y);
  b(L_multiply);


  bind(L_one_x);                         // Load one 32 bit portion of x as (0,value).

  lwz(x_xstart, 0, x);
  b(L_first_loop);

  bind(L_first_loop_exit);
}

// Multiply 64 bit by 64 bit and add 128 bit.
void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
                                            Register z, Register yz_idx,
                                            Register idx, Register carry,
                                            Register product_high, Register product,
                                            Register tmp, int offset) {

  //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
  //  z[kdx] = (jlong)product;

  sldi(tmp, idx, LogBytesPerInt);
  if (offset) {
    addi(tmp, tmp, offset);
  }
  ldx(yz_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(yz_idx, yz_idx, 32, 0);
#endif

  multiply64(product_high, product, x_xstart, yz_idx);
  ldx(yz_idx, z, tmp);                   // Reload yz_idx with z[kdx] to accumulate into the product.
#ifdef VM_LITTLE_ENDIAN
  rldicl(yz_idx, yz_idx, 32, 0);
#endif

  add2_with_carry(product_high, product, carry, yz_idx);

  sldi(tmp, idx, LogBytesPerInt);        // Recompute store address (tmp was clobbered by the helper path).
  if (offset) {
    addi(tmp, tmp, offset);
  }
#ifdef VM_LITTLE_ENDIAN
  rldicl(product, product, 32, 0);
#endif
  stdx(product, z, tmp);
}

// Multiply 128 bit by 128 bit. Unrolled inner loop.
void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
                                             Register y, Register z,
                                             Register yz_idx, Register idx, Register carry,
                                             Register product_high, Register product,
                                             Register carry2, Register tmp) {

  //  jlong carry, x[], y[], z[];
  //  int kdx = ystart+1;
  //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
  //    z[kdx+idx+1] = (jlong)product;
  //    jlong carry2 = (jlong)(product >>> 64);
  //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
  //    z[kdx+idx] = (jlong)product;
  //    carry = (jlong)(product >>> 64);
  //  }
  //  idx += 2;
  //  if (idx > 0) {
  //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
  //    z[kdx+idx] = (jlong)product;
  //    carry = (jlong)(product >>> 64);
  //  }

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
  const Register jdx = R0;

  // Scale the index.
  srdi_(jdx, idx, 2);                    // jdx = number of 4-int (2x unrolled) iterations.
  beq(CCR0, L_third_loop_exit);
  mtctr(jdx);

  align(32, 16);
  bind(L_third_loop);

  addi(idx, idx, -4);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
  mr_if_needed(carry2, product_high);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
  mr_if_needed(carry, product_high);
  bdnz(L_third_loop);

  bind(L_third_loop_exit);               // Handle any left-over operand parts.

  andi_(idx, idx, 0x3);
  beq(CCR0, L_post_third_loop_done);

  Label L_check_1;

  addic_(idx, idx, -2);
  blt(CCR0, L_check_1);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
  mr_if_needed(carry, product_high);

  bind(L_check_1);

  addi(idx, idx, 0x2);
  andi_(idx, idx, 0x1);
  addic_(idx, idx, -1);
  blt(CCR0, L_post_third_loop_done);

  // One last 32-bit limb: multiply, accumulate, and split the 96-bit result
  // into the stored word and the new 64-bit carry.
  sldi(tmp, idx, LogBytesPerInt);
  lwzx(yz_idx, y, tmp);
  multiply64(product_high, product, x_xstart, yz_idx);
  lwzx(yz_idx, z, tmp);

  add2_with_carry(product_high, product, yz_idx, carry);

  sldi(tmp, idx, LogBytesPerInt);
  stwx(product, z, tmp);                 // Store low 32 bits.
  srdi(product, product, 32);

  sldi(product_high, product_high, 32);  // carry = (product_high << 32) | (product >> 32)
  orr(product, product, product_high);
  mr_if_needed(carry, product);

  bind(L_post_third_loop_done);
}   // multiply_128_x_128_loop

void MacroAssembler::multiply_to_len(Register x, Register xlen,
                                     Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2,
                                     Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6,
                                     Register tmp7, Register tmp8,
                                     Register tmp9, Register tmp10,
                                     Register tmp11, Register tmp12,
                                     Register tmp13) {

  ShortBranchVerifier sbv(this);

  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product = tmp6;
  const Register product_high = tmp7;
  const Register x_xstart = tmp8;
  const
Register tmp = tmp9; 4267 4268 // First Loop. 4269 // 4270 // final static long LONG_MASK = 0xffffffffL; 4271 // int xstart = xlen - 1; 4272 // int ystart = ylen - 1; 4273 // long carry = 0; 4274 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) { 4275 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry; 4276 // z[kdx] = (int)product; 4277 // carry = product >>> 32; 4278 // } 4279 // z[xstart] = (int)carry; 4280 4281 mr_if_needed(idx, ylen); // idx = ylen 4282 mr_if_needed(kdx, zlen); // kdx = xlen + ylen 4283 li(carry, 0); // carry = 0 4284 4285 Label L_done; 4286 4287 addic_(xstart, xlen, -1); 4288 blt(CCR0, L_done); 4289 4290 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, 4291 carry, product_high, product, idx, kdx, tmp); 4292 4293 Label L_second_loop; 4294 4295 cmpdi(CCR0, kdx, 0); 4296 beq(CCR0, L_second_loop); 4297 4298 Label L_carry; 4299 4300 addic_(kdx, kdx, -1); 4301 beq(CCR0, L_carry); 4302 4303 // Store lower 32 bits of carry. 4304 sldi(tmp, kdx, LogBytesPerInt); 4305 stwx(carry, z, tmp); 4306 srdi(carry, carry, 32); 4307 addi(kdx, kdx, -1); 4308 4309 4310 bind(L_carry); 4311 4312 // Store upper 32 bits of carry. 4313 sldi(tmp, kdx, LogBytesPerInt); 4314 stwx(carry, z, tmp); 4315 4316 // Second and third (nested) loops. 
4317 // 4318 // for (int i = xstart-1; i >= 0; i--) { // Second loop 4319 // carry = 0; 4320 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop 4321 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) + 4322 // (z[k] & LONG_MASK) + carry; 4323 // z[k] = (int)product; 4324 // carry = product >>> 32; 4325 // } 4326 // z[i] = (int)carry; 4327 // } 4328 // 4329 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx 4330 4331 bind(L_second_loop); 4332 4333 li(carry, 0); // carry = 0; 4334 4335 addic_(xstart, xstart, -1); // i = xstart-1; 4336 blt(CCR0, L_done); 4337 4338 Register zsave = tmp10; 4339 4340 mr(zsave, z); 4341 4342 4343 Label L_last_x; 4344 4345 sldi(tmp, xstart, LogBytesPerInt); 4346 add(z, z, tmp); // z = z + k - j 4347 addi(z, z, 4); 4348 addic_(xstart, xstart, -1); // i = xstart-1; 4349 blt(CCR0, L_last_x); 4350 4351 sldi(tmp, xstart, LogBytesPerInt); 4352 ldx(x_xstart, x, tmp); 4353 #ifdef VM_LITTLE_ENDIAN 4354 rldicl(x_xstart, x_xstart, 32, 0); 4355 #endif 4356 4357 4358 Label L_third_loop_prologue; 4359 4360 bind(L_third_loop_prologue); 4361 4362 Register xsave = tmp11; 4363 Register xlensave = tmp12; 4364 Register ylensave = tmp13; 4365 4366 mr(xsave, x); 4367 mr(xlensave, xstart); 4368 mr(ylensave, ylen); 4369 4370 4371 multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen, 4372 carry, product_high, product, x, tmp); 4373 4374 mr(z, zsave); 4375 mr(x, xsave); 4376 mr(xlen, xlensave); // This is the decrement of the loop counter! 4377 mr(ylen, ylensave); 4378 4379 addi(tmp3, xlen, 1); 4380 sldi(tmp, tmp3, LogBytesPerInt); 4381 stwx(carry, z, tmp); 4382 addic_(tmp3, tmp3, -1); 4383 blt(CCR0, L_done); 4384 4385 srdi(carry, carry, 32); 4386 sldi(tmp, tmp3, LogBytesPerInt); 4387 stwx(carry, z, tmp); 4388 b(L_second_loop); 4389 4390 // Next infrequent code is moved outside loops. 
4391 bind(L_last_x); 4392 4393 lwz(x_xstart, 0, x); 4394 b(L_third_loop_prologue); 4395 4396 bind(L_done); 4397 } // multiply_to_len 4398 4399 void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) { 4400 #ifdef ASSERT 4401 Label ok; 4402 if (check_equal) { 4403 beq(CCR0, ok); 4404 } else { 4405 bne(CCR0, ok); 4406 } 4407 stop(msg, id); 4408 bind(ok); 4409 #endif 4410 } 4411 4412 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset, 4413 Register mem_base, const char* msg, int id) { 4414 #ifdef ASSERT 4415 switch (size) { 4416 case 4: 4417 lwz(R0, mem_offset, mem_base); 4418 cmpwi(CCR0, R0, 0); 4419 break; 4420 case 8: 4421 ld(R0, mem_offset, mem_base); 4422 cmpdi(CCR0, R0, 0); 4423 break; 4424 default: 4425 ShouldNotReachHere(); 4426 } 4427 asm_assert(check_equal, msg, id); 4428 #endif // ASSERT 4429 } 4430 4431 void MacroAssembler::verify_thread() { 4432 if (VerifyThread) { 4433 unimplemented("'VerifyThread' currently not implemented on PPC"); 4434 } 4435 } 4436 4437 // READ: oop. KILL: R0. Volatile floats perhaps. 4438 void MacroAssembler::verify_oop(Register oop, const char* msg) { 4439 if (!VerifyOops) { 4440 return; 4441 } 4442 4443 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); 4444 const Register tmp = R11; // Will be preserved. 4445 const int nbytes_save = MacroAssembler::num_volatile_regs * 8; 4446 save_volatile_gprs(R1_SP, -nbytes_save); // except R0 4447 4448 mr_if_needed(R4_ARG2, oop); 4449 save_LR_CR(tmp); // save in old frame 4450 push_frame_reg_args(nbytes_save, tmp); 4451 // load FunctionDescriptor** / entry_address * 4452 load_const_optimized(tmp, fd, R0); 4453 // load FunctionDescriptor* / entry_address 4454 ld(tmp, 0, tmp); 4455 load_const_optimized(R3_ARG1, (address)msg, R0); 4456 // Call destination for its side effect. 
4457 call_c(tmp); 4458 4459 pop_frame(); 4460 restore_LR_CR(tmp); 4461 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 4462 } 4463 4464 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) { 4465 if (!VerifyOops) { 4466 return; 4467 } 4468 4469 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); 4470 const Register tmp = R11; // Will be preserved. 4471 const int nbytes_save = MacroAssembler::num_volatile_regs * 8; 4472 save_volatile_gprs(R1_SP, -nbytes_save); // except R0 4473 4474 ld(R4_ARG2, offs, base); 4475 save_LR_CR(tmp); // save in old frame 4476 push_frame_reg_args(nbytes_save, tmp); 4477 // load FunctionDescriptor** / entry_address * 4478 load_const_optimized(tmp, fd, R0); 4479 // load FunctionDescriptor* / entry_address 4480 ld(tmp, 0, tmp); 4481 load_const_optimized(R3_ARG1, (address)msg, R0); 4482 // Call destination for its side effect. 4483 call_c(tmp); 4484 4485 pop_frame(); 4486 restore_LR_CR(tmp); 4487 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 4488 } 4489 4490 const char* stop_types[] = { 4491 "stop", 4492 "untested", 4493 "unimplemented", 4494 "shouldnotreachhere" 4495 }; 4496 4497 static void stop_on_request(int tp, const char* msg) { 4498 tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg); 4499 guarantee(false, "PPC assembly code requires stop: %s", msg); 4500 } 4501 4502 // Call a C-function that prints output. 
// Emit code that calls stop_on_request(type, msg) and then traps; the
// 'id' word is emitted right after the illtrap so the crash site can be
// identified from the instruction stream.
void MacroAssembler::stop(int type, const char* msg, int id) {
#ifndef PRODUCT
  block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));
#else
  block_comment("stop {");
#endif

  // setup arguments
  load_const_optimized(R3_ARG1, type);
  load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
  illtrap();
  emit_int32(id);
  block_comment("} stop;");
}

#ifndef PRODUCT
// Write pattern 0x0101010101010101 in memory region [low-before, high+after].
// Val, addr are temp registers.
// If low == addr, addr is killed.
// High is preserved.
void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
  if (!ZapMemory) return;

  assert_different_registers(low, val);

  BLOCK_COMMENT("zap memory region {");
  load_const_optimized(val, 0x0101010101010101);
  int size = before + after;
  if (low == high && size < 5 && size > 0) {
    // Small fixed-size region relative to one register: unroll the stores.
    int offset = -before*BytesPerWord;
    for (int i = 0; i < size; ++i) {
      std(val, offset, low);
      offset += (1*BytesPerWord);
    }
  } else {
    // General case: emit a store loop from low-before up to high+after.
    addi(addr, low, -before*BytesPerWord);
    assert_different_registers(high, val);
    if (after) addi(high, high, after * BytesPerWord);
    Label loop;
    bind(loop);
    std(val, 0, addr);
    addi(addr, addr, 8);
    cmpd(CCR6, addr, high);
    ble(CCR6, loop);
    if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value.
  }
  BLOCK_COMMENT("} zap memory region");
}

#endif // !PRODUCT

// Emit a load of the 1-byte bool at flag_addr and a branch over the
// subsequently emitted code when it is zero; the destructor binds the
// branch target, closing the skipped region (RAII-style code guard).
SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
  int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
  assert(sizeof(bool) == 1, "PowerPC ABI");
  masm->lbz(temp, simm16_offset, temp);
  masm->cmpwi(CCR0, temp, 0);
  masm->beq(CCR0, _label);
}

SkipIfEqualZero::~SkipIfEqualZero() {
  _masm->bind(_label);
}