/*
 * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2017, SAP SE. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "compiler/disassembler.hpp"
#include "gc/shared/cardTableModRefBS.hpp"
#include "gc/shared/collectedHeap.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_ppc.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/os.hpp"
#include "runtime/safepoint.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/macros.hpp"
#if INCLUDE_ALL_GCS
#include "gc/g1/g1CollectedHeap.inline.hpp"
#include "gc/g1/g1SATBCardTableModRefBS.hpp"
#include "gc/g1/heapRegion.hpp"
#endif // INCLUDE_ALL_GCS
#ifdef COMPILER2
#include "opto/intrinsicnode.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) // nothing
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef ASSERT
// On RISC, there's no benefit to verifying instruction boundaries.
bool AbstractAssembler::pd_check_instruction_mark() { return false; }
#endif

void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  if (Assembler::is_simm(si31, 16)) {
    ld(d, si31, a);
    if (emit_filler_nop) nop();
  } else {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
    addis(d, a, hi);
    ld(d, lo, d);
  }
}

void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  assert_different_registers(d, a);
  ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
}

void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                      size_t size_in_bytes, bool is_signed) {
  switch (size_in_bytes) {
  case 8: ld(dst, offs, base); break;
  case 4: is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  case 2: is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  case 1: lbz(dst, offs, base); if (is_signed) extsb(dst, dst); break; // lba doesn't exist :(
  default: ShouldNotReachHere();
  }
}

void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                       size_t size_in_bytes) {
  switch (size_in_bytes) {
  case 8: std(dst, offs, base); break;
  case 4: stw(dst, offs, base); break;
  case 2: sth(dst, offs, base); break;
  case 1: stb(dst, offs, base); break;
  default: ShouldNotReachHere();
  }
}

void MacroAssembler::align(int modulus, int max, int rem) {
  int padding = (rem + modulus - (offset() % modulus)) % modulus;
  if (padding > max) return;
  for (int c = (padding >> 2); c > 0; --c) { nop(); }
}

// Issue instructions that calculate the given address from the global TOC.
void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
                                                       bool add_relocation, bool emit_dummy_addr) {
  int offset = -1;
  if (emit_dummy_addr) {
    offset = -128; // dummy address
  } else if (addr != (address)(intptr_t)-1) {
    offset = MacroAssembler::offset_to_global_toc(addr);
  }

  if (hi16) {
    addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
  }
  if (lo16) {
    if (add_relocation) {
      // Relocate at the addi to avoid confusion with a load from the method's TOC.
      relocate(internal_word_Relocation::spec(addr));
    }
    addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
  }
}

address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
  const int offset = MacroAssembler::offset_to_global_toc(addr);

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
  set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
  set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
  return inst1_addr;
}

address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // stop, found the addis which writes dst
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");

  int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
  // -1 is a special case
  if (offset == -1) {
    return (address)(intptr_t)-1;
  } else {
    return global_toc() + offset;
  }
}

#ifdef _LP64
// Patch compressed oops or klass constants.
// Assembler sequence is
// 1) compressed oops:
//    lis  rx = const.hi
//    ori  rx = rx | const.lo
// 2) compressed klass:
//    lis  rx = const.hi
//    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
//    ori  rx = rx | const.lo
// Clrldi will be passed by.
address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;
  while (inst1_addr >= bound) {
    inst1 = *(int *)inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  int xc = (data >> 16) & 0xffff;
  int xd = (data >>  0) & 0xffff;

  set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
  set_imm((int *)inst2_addr, (xd));        // unsigned int
  return inst1_addr;
}

// Get compressed oop or klass constant.
narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;

  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
  uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);

  return (int) (xl | xh);
}
#endif // _LP64

// Returns true if successful.
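// The load emitted by ld_largeoffset_unchecked() below is either a single ld
// from the TOC (the offset fits in 16 bits) or an addis/ld pair; e.g. a TOC
// offset of 0x12340 splits into addis(dst, toc, 0x1) + ld(dst, 0x2340, dst).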
bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
                                                Register toc, bool fixed_size) {
  int toc_offset = 0;
  // Use RelocationHolder::none for the constant pool entry, otherwise
  // we will end up with a failing NativeCall::verify(x) where x is
  // the address of the constant pool entry.
  // FIXME: We should insert relocation information for oops at the constant
  // pool entries instead of inserting it at the loads; patching of a constant
  // pool entry should be less expensive.
  address const_address = address_constant((address)a.value(), RelocationHolder::none);
  if (const_address == NULL) { return false; } // allocation failure
  // Relocate at the pc of the load.
  relocate(a.rspec());
  toc_offset = (int)(const_address - code()->consts()->start());
  ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
  return true;
}

bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  // The relocation points to the ld or the addis.
  return (is_ld(inst1)) ||
         (is_addis(inst1) && inv_ra_field(inst1) != 0);
}

int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
  assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");

  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  if (is_ld(inst1)) {
    return inv_d1_field(inst1);
  } else if (is_addis(inst1)) {
    const int dst = inv_rt_field(inst1);

    // Now, find the succeeding ld which reads and writes to dst.
    address inst2_addr = inst1_addr + BytesPerInstWord;
    int inst2 = 0;
    while (true) {
      inst2 = *(int *) inst2_addr;
      if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
        // Stop, found the ld which reads and writes dst.
        break;
      }
      inst2_addr += BytesPerInstWord;
    }
    return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
  }
  ShouldNotReachHere();
  return 0;
}

// Get the constant from a `load_const' sequence.
long MacroAssembler::get_const(address a) {
  assert(is_load_const_at(a), "not a load of a constant");
  const int *p = (const int*) a;
  unsigned long x = (((unsigned long) (get_imm(a, 0) & 0xffff)) << 48);
  if (is_ori(*(p+1))) {
    x |= (((unsigned long) (get_imm(a, 1) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a, 3) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a, 4) & 0xffff)));
  } else if (is_lis(*(p+1))) {
    x |= (((unsigned long) (get_imm(a, 2) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a, 1) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a, 3) & 0xffff)));
  } else {
    ShouldNotReachHere();
    return (long) 0;
  }
  return (long) x;
}

// Patch the 64 bit constant of a `load_const' sequence. This is a low
// level procedure. It neither flushes the instruction cache nor is it
// MT-safe.
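// Two load_const layouts are handled, distinguished by the second instruction:
// an ori-based sequence with the halfword immediates for bits 48, 32, 16 and 0
// in instruction slots 0, 1, 3 and 4 (slot 2 carries no immediate), and a
// lis-based sequence with the immediates for bits 48, 16, 32 and 0 in slots
// 0, 1, 2 and 3; see the decode in get_const() above.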
void MacroAssembler::patch_const(address a, long x) {
  assert(is_load_const_at(a), "not a load of a constant");
  int *p = (int*) a;
  if (is_ori(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(1 + p, (x >> 32) & 0xffff);
    set_imm(3 + p, (x >> 16) & 0xffff);
    set_imm(4 + p, x & 0xffff);
  } else if (is_lis(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(2 + p, (x >> 32) & 0xffff);
    set_imm(1 + p, (x >> 16) & 0xffff);
    set_imm(3 + p, x & 0xffff);
  } else {
    ShouldNotReachHere();
  }
}

AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->allocate_metadata_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->allocate_oop_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp, int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0) {
    return RegisterOrConstant(value + offset);
  }

  // Load indirectly to solve generation ordering problem.
  // static address, no relocation
  int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
  ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)

  if (offset != 0) {
    addi(tmp, tmp, offset);
  }

  return RegisterOrConstant(tmp);
}

#ifndef PRODUCT
void MacroAssembler::pd_print_patched_instruction(address branch) {
  Unimplemented(); // TODO: PPC port
}
#endif // ndef PRODUCT

// Conditional far branch for destinations encodable in 24+2 bits.
void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {

  // If requested by flag optimize, relocate the bc_far as a
  // runtime_call and prepare for optimizing it when the code gets
  // relocated.
  if (optimize == bc_far_optimize_on_relocate) {
    relocate(relocInfo::runtime_call_type);
  }

  // variant 2:
  //
  //    b!cxx SKIP
  //    bxx   DEST
  //  SKIP:
  //

  const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                opposite_bcond(inv_boint_bcond(boint)));

  // We emit two branches.
  // First, a conditional branch which jumps around the far branch.
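  // Both the condition and its static branch hint are inverted, so the
  // overall prediction for reaching DEST stays what the caller requested;
  // e.g. a predicted-taken "beq+ DEST" becomes "bne- SKIP; b DEST; SKIP:".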
  const address not_taken_pc = pc() + 2 * BytesPerInstWord;
  const address bc_pc = pc();
  bc(opposite_boint, biint, not_taken_pc);

  const int bc_instr = *(int*)bc_pc;
  assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
  assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
  assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
                                     opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
         "postcondition");
  assert(biint == inv_bi_field(bc_instr), "postcondition");

  // Second, an unconditional far branch which jumps to dest.
  // Note: target(dest) remembers the current pc (see CodeSection::target)
  // and returns the current pc if the label is not bound yet; when
  // the label gets bound, the unconditional far branch will be patched.
  const address target_pc = target(dest);
  const address b_pc = pc();
  b(target_pc);

  assert(not_taken_pc == pc(),              "postcondition");
  assert(dest.is_bound() || target_pc == b_pc, "postcondition");
}

// 1 or 2 instructions
void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
  if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
    bc(boint, biint, dest);
  } else {
    bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
  }
}

bool MacroAssembler::is_bc_far_at(address instruction_addr) {
  return is_bc_far_variant1_at(instruction_addr) ||
         is_bc_far_variant2_at(instruction_addr) ||
         is_bc_far_variant3_at(instruction_addr);
}

address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
  if (is_bc_far_variant1_at(instruction_addr)) {
    const address instruction_1_addr = instruction_addr;
    const int instruction_1 = *(int*)instruction_1_addr;
    return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
  } else if (is_bc_far_variant2_at(instruction_addr)) {
    const address instruction_2_addr = instruction_addr + 4;
    return bxx_destination(instruction_2_addr);
  } else if (is_bc_far_variant3_at(instruction_addr)) {
    return instruction_addr + 8;
  }
  // variant 4 ???
  ShouldNotReachHere();
  return NULL;
}

void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {

  if (is_bc_far_variant3_at(instruction_addr)) {
    // variant 3, far cond branch to the next instruction, already patched to nops:
    //
    //    nop
    //    endgroup
    //  SKIP/DEST:
    //
    return;
  }

  // first, extract boint and biint from the current branch
  int boint = 0;
  int biint = 0;

  ResourceMark rm;
  const int code_size = 2 * BytesPerInstWord;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
    // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
    masm.nop();
    masm.endgroup();
  } else {
    if (is_bc_far_variant1_at(instruction_addr)) {
      // variant 1, the 1st instruction contains the destination address:
      //
      //    bcxx  DEST
      //    nop
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = inv_bo_field(instruction_1);
      biint = inv_bi_field(instruction_1);
    } else if (is_bc_far_variant2_at(instruction_addr)) {
      // variant 2, the 2nd instruction contains the destination address:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
                                 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
      biint = inv_bi_field(instruction_1);
    } else {
      // variant 4???
      ShouldNotReachHere();
    }

    // second, set the new branch destination and optimize the code
    if (dest != instruction_addr + 4 && // the bc_far is still unbound!
        masm.is_within_range_of_bcxx(dest, instruction_addr)) {
      // variant 1:
      //
      //    bcxx  DEST
      //    nop
      //
      masm.bc(boint, biint, dest);
      masm.nop();
    } else {
      // variant 2:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                    opposite_bcond(inv_boint_bcond(boint)));
      const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
      masm.bc(opposite_boint, biint, not_taken_pc);
      masm.b(dest);
    }
  }
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Emit a NOT MT-safe patchable 64 bit absolute call/jump.
void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
  // get current pc
  uint64_t start_pc = (uint64_t) pc();

  const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
  const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first

  // relocate here
  if (rt != relocInfo::none) {
    relocate(rt);
  }

  if ( ReoptimizeCallSequences &&
       (( link && is_within_range_of_b(dest, pc_of_bl)) ||
        (!link && is_within_range_of_b(dest, pc_of_b)))) {
    // variant 2:
    // Emit an optimized, pc-relative call/jump.

    if (link) {
      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();

      // do the call
      assert(pc() == pc_of_bl, "just checking");
      bl(dest, relocInfo::none);
    } else {
      // do the jump
      assert(pc() == pc_of_b, "just checking");
      b(dest, relocInfo::none);

      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();
    }

    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
           "can't identify emitted call");
  } else {
    // variant 1:
    mr(R0, R11);  // spill R11 -> R0.

    // Load the destination address into CTR,
    // calculate destination relative to global toc.
    calculate_address_from_global_toc(R11, dest, true, true, false);

    mtctr(R11);
    mr(R11, R0);  // spill R11 <- R0.
    nop();

    // do the call/jump
    if (link) {
      bctrl();
    } else {
      bctr();
    }
    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
           "can't identify emitted call");
  }

  // Assert that we can identify the emitted call/jump.
  assert(is_bxx64_patchable_at((address)start_pc, link),
         "can't identify emitted call");
  assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
         "wrong encoding of dest address");
}

// Identify a bxx64_patchable instruction.
bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
  return is_bxx64_patchable_variant1b_at(instruction_addr, link)
    //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
      || is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Does the call64_patchable instruction use a pc-relative encoding of
// the call destination?
bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
  // variant 2 is pc-relative
  return is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Identify variant 1.
bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[5]) // mtctr
      && is_load_const_at(instruction_addr);
}

// Identify variant 1b: load destination relative to global toc.
bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[3]) // mtctr
      && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
}

// Identify variant 2.
bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  if (link) {
    return is_bl (instr[6])  // bl dest is last
        && is_nop(instr[0])  // nop
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5]); // nop
  } else {
    return is_b  (instr[0])  // b dest is first
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5])  // nop
        && is_nop(instr[6]); // nop
  }
}

// Set dest address of a bxx64_patchable instruction.
void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
  ResourceMark rm;
  int code_size = MacroAssembler::bxx64_patchable_size;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  masm.bxx64_patchable(dest, relocInfo::none, link);
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Get dest address of a bxx64_patchable instruction.
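// Variant 1 reads the constant back from its load_const sequence, variant 2
// decodes the pc-relative displacement of the single b/bl, and variant 1b
// re-derives the address from the global-TOC calculation.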
address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
  if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
    return (address) (unsigned long) get_const(instruction_addr);
  } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
    unsigned int* instr = (unsigned int*) instruction_addr;
    if (link) {
      const int instr_idx = 6; // bl is last
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    } else {
      const int instr_idx = 0; // b is first
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    }
  // Load dest relative to global toc.
  } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
    return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
                                                               instruction_addr);
  } else {
    ShouldNotReachHere();
    return NULL;
  }
}

// Uses ordering which corresponds to ABI:
//    _savegpr0_14:  std  r14,-144(r1)
//    _savegpr0_15:  std  r15,-136(r1)
//    _savegpr0_16:  std  r16,-128(r1)
void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
  std(R14, offset, dst);   offset += 8;
  std(R15, offset, dst);   offset += 8;
  std(R16, offset, dst);   offset += 8;
  std(R17, offset, dst);   offset += 8;
  std(R18, offset, dst);   offset += 8;
  std(R19, offset, dst);   offset += 8;
  std(R20, offset, dst);   offset += 8;
  std(R21, offset, dst);   offset += 8;
  std(R22, offset, dst);   offset += 8;
  std(R23, offset, dst);   offset += 8;
  std(R24, offset, dst);   offset += 8;
  std(R25, offset, dst);   offset += 8;
  std(R26, offset, dst);   offset += 8;
  std(R27, offset, dst);   offset += 8;
  std(R28, offset, dst);   offset += 8;
  std(R29, offset, dst);   offset += 8;
  std(R30, offset, dst);   offset += 8;
  std(R31, offset, dst);   offset += 8;

  stfd(F14, offset, dst);   offset += 8;
  stfd(F15, offset, dst);   offset += 8;
  stfd(F16, offset, dst);   offset += 8;
  stfd(F17, offset, dst);   offset += 8;
  stfd(F18, offset, dst);   offset += 8;
  stfd(F19, offset, dst);   offset += 8;
  stfd(F20, offset, dst);   offset += 8;
  stfd(F21, offset, dst);   offset += 8;
  stfd(F22, offset, dst);   offset += 8;
  stfd(F23, offset, dst);   offset += 8;
  stfd(F24, offset, dst);   offset += 8;
  stfd(F25, offset, dst);   offset += 8;
  stfd(F26, offset, dst);   offset += 8;
  stfd(F27, offset, dst);   offset += 8;
  stfd(F28, offset, dst);   offset += 8;
  stfd(F29, offset, dst);   offset += 8;
  stfd(F30, offset, dst);   offset += 8;
  stfd(F31, offset, dst);
}

// Uses ordering which corresponds to ABI:
//    _restgpr0_14:  ld   r14,-144(r1)
//    _restgpr0_15:  ld   r15,-136(r1)
//    _restgpr0_16:  ld   r16,-128(r1)
void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
  ld(R14, offset, src);   offset += 8;
  ld(R15, offset, src);   offset += 8;
  ld(R16, offset, src);   offset += 8;
  ld(R17, offset, src);   offset += 8;
  ld(R18, offset, src);   offset += 8;
  ld(R19, offset, src);   offset += 8;
  ld(R20, offset, src);   offset += 8;
  ld(R21, offset, src);   offset += 8;
  ld(R22, offset, src);   offset += 8;
  ld(R23, offset, src);   offset += 8;
  ld(R24, offset, src);   offset += 8;
  ld(R25, offset, src);   offset += 8;
  ld(R26, offset, src);   offset += 8;
  ld(R27, offset, src);   offset += 8;
  ld(R28, offset, src);   offset += 8;
  ld(R29, offset, src);   offset += 8;
  ld(R30, offset, src);   offset += 8;
  ld(R31, offset, src);   offset += 8;

  // FP registers
  lfd(F14, offset, src);   offset += 8;
  lfd(F15, offset, src);   offset += 8;
  lfd(F16, offset, src);   offset += 8;
  lfd(F17, offset, src);   offset += 8;
  lfd(F18, offset, src);   offset += 8;
  lfd(F19, offset, src);   offset += 8;
  lfd(F20, offset, src);   offset += 8;
  lfd(F21, offset, src);   offset += 8;
  lfd(F22, offset, src);   offset += 8;
  lfd(F23, offset, src);   offset += 8;
  lfd(F24, offset, src);   offset += 8;
  lfd(F25, offset, src);   offset += 8;
  lfd(F26, offset, src);   offset += 8;
  lfd(F27, offset, src);   offset += 8;
  lfd(F28, offset, src);   offset += 8;
  lfd(F29, offset, src);   offset += 8;
  lfd(F30, offset, src);   offset += 8;
  lfd(F31, offset, src);
}

// For verify_oops.
void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
  std(R2,  offset, dst);   offset += 8;
  std(R3,  offset, dst);   offset += 8;
  std(R4,  offset, dst);   offset += 8;
  std(R5,  offset, dst);   offset += 8;
  std(R6,  offset, dst);   offset += 8;
  std(R7,  offset, dst);   offset += 8;
  std(R8,  offset, dst);   offset += 8;
  std(R9,  offset, dst);   offset += 8;
  std(R10, offset, dst);   offset += 8;
  std(R11, offset, dst);   offset += 8;
  std(R12, offset, dst);   offset += 8;

  stfd(F0,  offset, dst);   offset += 8;
  stfd(F1,  offset, dst);   offset += 8;
  stfd(F2,  offset, dst);   offset += 8;
  stfd(F3,  offset, dst);   offset += 8;
  stfd(F4,  offset, dst);   offset += 8;
  stfd(F5,  offset, dst);   offset += 8;
  stfd(F6,  offset, dst);   offset += 8;
  stfd(F7,  offset, dst);   offset += 8;
  stfd(F8,  offset, dst);   offset += 8;
  stfd(F9,  offset, dst);   offset += 8;
  stfd(F10, offset, dst);   offset += 8;
  stfd(F11, offset, dst);   offset += 8;
  stfd(F12, offset, dst);   offset += 8;
  stfd(F13, offset, dst);
}

// For verify_oops.
void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
  ld(R2,  offset, src);   offset += 8;
  ld(R3,  offset, src);   offset += 8;
  ld(R4,  offset, src);   offset += 8;
  ld(R5,  offset, src);   offset += 8;
  ld(R6,  offset, src);   offset += 8;
  ld(R7,  offset, src);   offset += 8;
  ld(R8,  offset, src);   offset += 8;
  ld(R9,  offset, src);   offset += 8;
  ld(R10, offset, src);   offset += 8;
  ld(R11, offset, src);   offset += 8;
  ld(R12, offset, src);   offset += 8;

  lfd(F0,  offset, src);   offset += 8;
  lfd(F1,  offset, src);   offset += 8;
  lfd(F2,  offset, src);   offset += 8;
  lfd(F3,  offset, src);   offset += 8;
  lfd(F4,  offset, src);   offset += 8;
  lfd(F5,  offset, src);   offset += 8;
  lfd(F6,  offset, src);   offset += 8;
  lfd(F7,  offset, src);   offset += 8;
  lfd(F8,  offset, src);   offset += 8;
  lfd(F9,  offset, src);   offset += 8;
  lfd(F10, offset, src);   offset += 8;
  lfd(F11, offset, src);   offset += 8;
  lfd(F12, offset, src);   offset += 8;
  lfd(F13, offset, src);
}

void MacroAssembler::save_LR_CR(Register tmp) {
  mfcr(tmp);
  std(tmp, _abi(cr), R1_SP);
  mflr(tmp);
  std(tmp, _abi(lr), R1_SP);
  // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
}

void MacroAssembler::restore_LR_CR(Register tmp) {
  assert(tmp != R1_SP, "must be distinct");
  ld(tmp, _abi(lr), R1_SP);
  mtlr(tmp);
  ld(tmp, _abi(cr), R1_SP);
  mtcr(tmp);
}

address MacroAssembler::get_PC_trash_LR(Register result) {
  Label L;
  bl(L);
  bind(L);
  address lr_pc = pc();
  mflr(result);
  return lr_pc;
}

void MacroAssembler::resize_frame(Register offset, Register tmp) {
#ifdef ASSERT
  assert_different_registers(offset, tmp, R1_SP);
  andi_(tmp, offset, frame::alignment_in_bytes-1);
  asm_assert_eq("resize_frame: unaligned", 0x204);
#endif

  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdux(tmp, R1_SP, offset);
}

void MacroAssembler::resize_frame(int offset, Register tmp) {
  assert(is_simm(offset, 16), "too big an offset");
  assert_different_registers(tmp, R1_SP);
  assert((offset & (frame::alignment_in_bytes-1)) == 0, "resize_frame: unaligned");
  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdu(tmp, offset, R1_SP);
}

void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
  // (addr == tmp1) || (addr == tmp2) is allowed here!
  assert(tmp1 != tmp2, "must be distinct");

  // compute offset w.r.t. current stack pointer
  // tmp_1 <- addr - SP (!)
  subf(tmp1, R1_SP, addr);

  // atomically update SP keeping back link.
  resize_frame(tmp1 /* offset */, tmp2 /* tmp */);
}

void MacroAssembler::push_frame(Register bytes, Register tmp) {
#ifdef ASSERT
  assert(bytes != R0, "r0 not allowed here");
  andi_(R0, bytes, frame::alignment_in_bytes-1);
  asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
#endif
  neg(tmp, bytes);
  stdux(R1_SP, R1_SP, tmp);
}

// Push a frame of size `bytes'.
void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
  long offset = align_addr(bytes, frame::alignment_in_bytes);
  if (is_simm(-offset, 16)) {
    stdu(R1_SP, -offset, R1_SP);
  } else {
    load_const_optimized(tmp, -offset);
    stdux(R1_SP, R1_SP, tmp);
  }
}

// Push a frame of size `bytes' plus abi_reg_args on top.
void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size, tmp);
}

// Set up a new C frame with a spill area for non-volatile GPRs and
// additional space for local variables.
void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
                                                      Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
}

// Pop current C frame.
void MacroAssembler::pop_frame() {
  ld(R1_SP, _abi(callers_sp), R1_SP);
}

#if defined(ABI_ELFv2)
address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the time.
  if (R12 != r_function_entry) {
    mr(R12, r_function_entry);
  }
  mtctr(R12);
  // Do a call or a branch.
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C
// calling conventions. Updates and returns _last_calls_return_pc.
address MacroAssembler::call_c(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/true);
}

// For tail calls: only branch, don't link, so callee returns to caller of this function.
address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/false);
}

address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
  load_const(R12, function_entry, R0);
  return branch_to(R12, /*and_link=*/true);
}

#else
// Generic version of a call to C function via a function descriptor
// with variable support for C calling conventions (TOC, ENV, etc.).
// Updates and returns _last_calls_return_pc.
address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
                                  bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
  // we emit standard ptrgl glue code here
  assert((function_descriptor != R0), "function_descriptor cannot be R0");

  // retrieve necessary entries from the function descriptor
  ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
  mtctr(R0);

  if (load_toc_of_callee) {
    ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
  }
  if (load_env_of_callee) {
    ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
  } else if (load_toc_of_callee) {
    li(R11, 0);
  }

  // do a call or a branch
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C calling
// conventions.
// We don't use the TOC in generated code, so there is no need to save
// and restore its value.
address MacroAssembler::call_c(Register fd) {
  return branch_to(fd, /*and_link=*/true,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c_and_return_to_caller(Register fd) {
  return branch_to(fd, /*and_link=*/false,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
  if (rt != relocInfo::none) {
    // this call needs to be relocatable
    if (!ReoptimizeCallSequences
        || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
        || fd == NULL   // support code-size estimation
        || !fd->is_friend_function()
        || fd->entry() == NULL) {
      // it's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);

      bool has_env = (fd != NULL && fd->env() != NULL);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/has_env);
    } else {
      // It's a friend function. Load the entry point and don't care about
      // toc and env. Use an optimizable call instruction, but ensure the
Use an optimizable call instruction, but ensure the 1091 // same code-size as in the case of a non-friend function. 1092 nop(); 1093 nop(); 1094 nop(); 1095 bl64_patchable(fd->entry(), rt); 1096 _last_calls_return_pc = pc(); 1097 return _last_calls_return_pc; 1098 } 1099 } else { 1100 // This call does not need to be relocatable, do more aggressive 1101 // optimizations. 1102 if (!ReoptimizeCallSequences 1103 || !fd->is_friend_function()) { 1104 // It's not a friend function as defined by class FunctionDescriptor, 1105 // so do a full call-c here. 1106 load_const(R11, (address)fd, R0); 1107 return branch_to(R11, /*and_link=*/true, 1108 /*save toc=*/false, 1109 /*restore toc=*/false, 1110 /*load toc=*/true, 1111 /*load env=*/true); 1112 } else { 1113 // it's a friend function, load the entry point and don't care about 1114 // toc and env. 1115 address dest = fd->entry(); 1116 if (is_within_range_of_b(dest, pc())) { 1117 bl(dest); 1118 } else { 1119 bl64_patchable(dest, rt); 1120 } 1121 _last_calls_return_pc = pc(); 1122 return _last_calls_return_pc; 1123 } 1124 } 1125 } 1126 1127 // Call a C function. All constants needed reside in TOC. 1128 // 1129 // Read the address to call from the TOC. 1130 // Read env from TOC, if fd specifies an env. 1131 // Read new TOC from TOC. 1132 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd, 1133 relocInfo::relocType rt, Register toc) { 1134 if (!ReoptimizeCallSequences 1135 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none) 1136 || !fd->is_friend_function()) { 1137 // It's not a friend function as defined by class FunctionDescriptor, 1138 // so do a full call-c here. 1139 assert(fd->entry() != NULL, "function must be linked"); 1140 1141 AddressLiteral fd_entry(fd->entry()); 1142 bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true); 1143 mtctr(R11); 1144 if (fd->env() == NULL) { 1145 li(R11, 0); 1146 nop(); 1147 } else { 1148 AddressLiteral fd_env(fd->env()); 1149 success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true); 1150 } 1151 AddressLiteral fd_toc(fd->toc()); 1152 // Set R2_TOC (load from toc) 1153 success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true); 1154 bctrl(); 1155 _last_calls_return_pc = pc(); 1156 if (!success) { return NULL; } 1157 } else { 1158 // It's a friend function, load the entry point and don't care about 1159 // toc and env. Use an optimizable call instruction, but ensure the 1160 // same code-size as in the case of a non-friend function. 1161 nop(); 1162 bl64_patchable(fd->entry(), rt); 1163 _last_calls_return_pc = pc(); 1164 } 1165 return _last_calls_return_pc; 1166 } 1167 #endif // ABI_ELFv2 1168 1169 void MacroAssembler::call_VM_base(Register oop_result, 1170 Register last_java_sp, 1171 address entry_point, 1172 bool check_exceptions) { 1173 BLOCK_COMMENT("call_VM {"); 1174 // Determine last_java_sp register. 1175 if (!last_java_sp->is_valid()) { 1176 last_java_sp = R1_SP; 1177 } 1178 set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1); 1179 1180 // ARG1 must hold thread address. 1181 mr(R3_ARG1, R16_thread); 1182 #if defined(ABI_ELFv2) 1183 address return_pc = call_c(entry_point, relocInfo::none); 1184 #else 1185 address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none); 1186 #endif 1187 1188 reset_last_Java_frame(); 1189 1190 // Check for pending exceptions. 1191 if (check_exceptions) { 1192 // We don't check for exceptions here. 
    ShouldNotReachHere();
  }

  // Get oop result if there is one and reset the value in the thread.
  if (oop_result->is_valid()) {
    get_vm_result(oop_result);
  }

  _last_calls_return_pc = return_pc;
  BLOCK_COMMENT("} call_VM");
}

void MacroAssembler::call_VM_leaf_base(address entry_point) {
  BLOCK_COMMENT("call_VM_leaf {");
#if defined(ABI_ELFv2)
  call_c(entry_point, relocInfo::none);
#else
  call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
#endif
  BLOCK_COMMENT("} call_VM_leaf");
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
  call_VM_base(oop_result, noreg, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread.
  mr_if_needed(R4_ARG2, arg_1);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  mr_if_needed(R6_ARG4, arg_3);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM_leaf(address entry_point) {
  call_VM_leaf_base(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
  mr_if_needed(R3_ARG1, arg_1);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_3);
  call_VM_leaf(entry_point);
}

// Check whether instruction is a read access to the polling page
// which was emitted by load_from_polling_page(..).
bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
                                               address* polling_address_ptr) {
  if (!is_ld(instruction))
    return false; // It's not a ld. Fail.

  int rt = inv_rt_field(instruction);
  int ra = inv_ra_field(instruction);
  int ds = inv_ds_field(instruction);
  if (!(ds == 0 && ra != 0 && rt == 0)) {
    return false; // It's not a ld(r0, X, ra). Fail.
  }

  if (!ucontext) {
    // Set polling address.
    if (polling_address_ptr != NULL) {
      *polling_address_ptr = NULL;
    }
    return true; // No ucontext given. Can't check value of ra. Assume true.
  }

#ifdef LINUX
  // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
  ucontext_t* uc = (ucontext_t*) ucontext;
  // Set polling address.
  address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
  if (polling_address_ptr != NULL) {
    *polling_address_ptr = addr;
  }
  return os::is_poll_address(addr);
#else
  // Not on Linux, ucontext must be NULL.
  ShouldNotReachHere();
  return false;
#endif
}

bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;

  if (is_stwx(instruction) || is_stwux(instruction)) {
    int ra = inv_ra_field(instruction);
    int rb = inv_rb_field(instruction);

    // look up content of ra and rb in ucontext
    address ra_val = (address)uc->uc_mcontext.regs->gpr[ra];
    long    rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    return os::is_memory_serialize_page(thread, ra_val + rb_val);
  } else if (is_stw(instruction) || is_stwu(instruction)) {
    int ra = inv_ra_field(instruction);
    int d1 = inv_d1_field(instruction);

    // look up content of ra in ucontext
    address ra_val = (address)uc->uc_mcontext.regs->gpr[ra];
    return os::is_memory_serialize_page(thread, ra_val + d1);
  } else {
    return false;
  }
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return false;
#endif
}

void MacroAssembler::bang_stack_with_offset(int offset) {
  // When increasing the stack, the old stack pointer will be written
  // to the new top of stack according to the PPC64 abi.
  // Therefore, stack banging is not necessary when increasing
  // the stack by <= os::vm_page_size() bytes.
  // When increasing the stack by a larger amount, this method is
  // called repeatedly to bang the intermediate pages.

  // Stack grows down, caller passes positive offset.
  assert(offset > 0, "must bang with positive offset");

  long stdoffset = -offset;

  if (is_simm(stdoffset, 16)) {
    // Signed 16 bit offset, a simple std is ok.
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, (int)(signed short)stdoffset, R1_SP);
    } else {
      std(R0, (int)(signed short)stdoffset, R1_SP);
    }
  } else if (is_simm(stdoffset, 31)) {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);

    Register tmp = R11;
    addis(tmp, R1_SP, hi);
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, lo, tmp);
    } else {
      std(R0, lo, tmp);
    }
  } else {
    ShouldNotReachHere();
  }
}

// If instruction is a stack bang of the form
//    std    R0,    x(Ry),       (see bang_stack_with_offset())
//    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
// or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
// return the banged address. Otherwise, return 0.
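// E.g. a bang emitted by bang_stack_with_offset(0x1000) (with load-based
// banging disabled) is std R0,-4096(R1_SP), and the banged address
// reconstructed from the ucontext is gpr[1] - 4096.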
address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;
  int rs = inv_rs_field(instruction);
  int ra = inv_ra_field(instruction);
  if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
      || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
      || (is_stdu(instruction) && rs == 1)) {
    int ds = inv_ds_field(instruction);
    // return banged address
    return ds + (address)uc->uc_mcontext.regs->gpr[ra];
  } else if (is_stdux(instruction) && rs == 1) {
    int rb = inv_rb_field(instruction);
    address sp = (address)uc->uc_mcontext.regs->gpr[1];
    long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
                                  : sp + rb_val; // banged address
  }
  return NULL; // not a stack bang
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return NULL;
#endif
}

void MacroAssembler::reserved_stack_check(Register return_pc) {
  // Test if reserved zone needs to be enabled.
  Label no_reserved_zone_enabling;

  ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
  cmpld(CCR0, R1_SP, R0);
  blt_predict_taken(CCR0, no_reserved_zone_enabling);

  // Enable reserved zone again, throw stack overflow exception.
  push_frame_reg_args(0, R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
  pop_frame();
  mtlr(return_pc);
  load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
  mtctr(R0);
  bctr();

  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
                                bool cmpxchgx_hint) {
  Label retry;
  bind(retry);
  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  stdcx_(exchange_value, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
}

void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
                                Register tmp, bool cmpxchgx_hint) {
  Label retry;
  bind(retry);
  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  add(tmp, dest_current_value, inc_value);
  stdcx_(tmp, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
}

// Word/sub-word atomic helper functions

// Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
// Only signed types are supported with size < 4.
// Atomic add always kills tmp1.
void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
                                                   Register addr_base, Register tmp1, Register tmp2, Register tmp3,
                                                   bool cmpxchgx_hint, bool is_add, int size) {
  // Sub-word instructions are available since Power 8.
  // For older processors, instruction_type != size holds, and we
  // emulate the sub-word instructions by constructing a 4-byte value
  // that leaves the other bytes unchanged.
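  // The merge below relies on xor: with delta = (old ^ new) << shift_amount,
  // clipped to the operand width, val32 ^ delta replaces exactly the addressed
  // byte/halfword; e.g. exchanging byte 0x44 of word 0x11223344 with 0x55 at
  // shift 0 gives delta = 0x11 and 0x11223344 ^ 0x11 = 0x11223355.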
  const int instruction_type = VM_Version::has_lqarx() ? size : 4;

  Label retry;
  Register shift_amount = noreg,
           val32 = dest_current_value,
           modval = is_add ? tmp1 : exchange_value;

  if (instruction_type != size) {
    assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
    modval = tmp1;
    shift_amount = tmp2;
    val32 = tmp3;
    // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
#ifdef VM_LITTLE_ENDIAN
    rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
    clrrdi(addr_base, addr_base, 2);
#else
    xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
    clrrdi(addr_base, addr_base, 2);
    rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
#endif
  }

  // atomic emulation loop
  bind(retry);

  switch (instruction_type) {
    case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
    case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
    case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
    default: ShouldNotReachHere();
  }

  if (instruction_type != size) {
    srw(dest_current_value, val32, shift_amount);
  }

  if (is_add) { add(modval, dest_current_value, exchange_value); }

  if (instruction_type != size) {
    // Transform exchange value such that the replacement can be done by one xor instruction.
    xorr(modval, dest_current_value, is_add ? modval : exchange_value);
    clrldi(modval, modval, (size == 1) ? 56 : 48);
    slw(modval, modval, shift_amount);
    xorr(modval, val32, modval);
  }

  switch (instruction_type) {
    case 4: stwcx_(modval, addr_base); break;
    case 2: sthcx_(modval, addr_base); break;
    case 1: stbcx_(modval, addr_base); break;
    default: ShouldNotReachHere();
  }

  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }

  // l?arx zero-extends, but Java wants byte/short values sign-extended.
  if (size == 1) {
    extsb(dest_current_value, dest_current_value);
  } else if (size == 2) {
    extsh(dest_current_value, dest_current_value);
  }
}

// Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
// Only signed types are supported with size < 4.
void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
                                       Register compare_value, Register exchange_value,
                                       Register addr_base, Register tmp1, Register tmp2,
                                       Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
  // Sub-word instructions are available since Power 8.
  // For older processors, instruction_type != size holds, and we
  // emulate the sub-word instructions by constructing a 4-byte value
  // that leaves the other bytes unchanged.
  const int instruction_type = VM_Version::has_lqarx() ? size : 4;

  Register shift_amount = noreg,
           val32 = dest_current_value,
           modval = exchange_value;

  if (instruction_type != size) {
    assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
    shift_amount = tmp1;
    val32 = tmp2;
    modval = tmp2;
    // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
#ifdef VM_LITTLE_ENDIAN
    rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
    clrrdi(addr_base, addr_base, 2);
#else
    xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
    clrrdi(addr_base, addr_base, 2);
    rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
#endif
    // Transform exchange value such that the replacement can be done by one xor instruction.
    xorr(exchange_value, compare_value, exchange_value);
    clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
    slw(exchange_value, exchange_value, shift_amount);
  }

  // atomic emulation loop
  bind(retry);

  switch (instruction_type) {
    case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
    case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
    case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
    default: ShouldNotReachHere();
  }

  if (instruction_type != size) {
    srw(dest_current_value, val32, shift_amount);
  }
  if (size == 1) {
    extsb(dest_current_value, dest_current_value);
  } else if (size == 2) {
    extsh(dest_current_value, dest_current_value);
  }

  cmpw(flag, dest_current_value, compare_value);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(flag, failed);
  } else {
    bne(                  flag, failed);
  }
  // branch to done  => (flag == ne), (dest_current_value != compare_value)
  // fall through    => (flag == eq), (dest_current_value == compare_value)

  if (instruction_type != size) {
    xorr(modval, val32, exchange_value);
  }

  switch (instruction_type) {
    case 4: stwcx_(modval, addr_base); break;
    case 2: sthcx_(modval, addr_base); break;
    case 1: stbcx_(modval, addr_base); break;
    default: ShouldNotReachHere();
  }
}

// CmpxchgX sets condition register to cmpX(current, compare).
void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
                                     Register compare_value, Register exchange_value,
                                     Register addr_base, Register tmp1, Register tmp2,
                                     int semantics, bool cmpxchgx_hint,
                                     Register int_flag_success, bool contention_hint, bool weak, int size) {
  Label retry;
  Label failed;
  Label done;

  // Save one branch if result is returned via register and
  // result register is different from the other ones.
  bool use_result_reg    = (int_flag_success != noreg);
  bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
                            int_flag_success != exchange_value && int_flag_success != addr_base &&
                            int_flag_success != tmp1 && int_flag_success != tmp2);
  assert(!weak || flag == CCR0, "weak only supported with CCR0");
  assert(size == 1 || size == 2 || size == 4, "unsupported");

  if (use_result_reg && preset_result_reg) {
    li(int_flag_success, 0); // preset (assume cas failed)
  }

  // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
  if (contention_hint) { // Don't try to reserve if cmp fails.
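    // A plain load cannot acquire a reservation, so a value that already
    // differs fails fast here without bouncing the reserved cache line.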
1635 switch (size) { 1636 case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break; 1637 case 2: lha(dest_current_value, 0, addr_base); break; 1638 case 4: lwz(dest_current_value, 0, addr_base); break; 1639 default: ShouldNotReachHere(); 1640 } 1641 cmpw(flag, dest_current_value, compare_value); 1642 bne(flag, failed); 1643 } 1644 1645 // release/fence semantics 1646 if (semantics & MemBarRel) { 1647 release(); 1648 } 1649 1650 cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2, 1651 retry, failed, cmpxchgx_hint, size); 1652 if (!weak || use_result_reg) { 1653 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1654 bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 1655 } else { 1656 bne( CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 1657 } 1658 } 1659 // fall through => (flag == eq), (dest_current_value == compare_value), (swapped) 1660 1661 // Result in register (must do this at the end because int_flag_success can be the 1662 // same register as one above). 1663 if (use_result_reg) { 1664 li(int_flag_success, 1); 1665 } 1666 1667 if (semantics & MemBarFenceAfter) { 1668 fence(); 1669 } else if (semantics & MemBarAcq) { 1670 isync(); 1671 } 1672 1673 if (use_result_reg && !preset_result_reg) { 1674 b(done); 1675 } 1676 1677 bind(failed); 1678 if (use_result_reg && !preset_result_reg) { 1679 li(int_flag_success, 0); 1680 } 1681 1682 bind(done); 1683 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1684 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1685 } 1686 1687 // Performs an atomic compare-exchange: 1688 // if (compare_value == *addr_base) 1689 // *addr_base = exchange_value 1690 // int_flag_success = 1; 1691 // else 1692 // int_flag_success = 0; 1693 // 1694 // ConditionRegister flag = cmp(compare_value, *addr_base) 1695 // Register dest_current_value = *addr_base 1696 // Register compare_value Used to compare with value in memory 1697 // Register exchange_value Written to memory if compare_value == *addr_base 1698 // Register addr_base The memory location to compareXChange 1699 // Register int_flag_success Set to 1 if exchange_value was written to *addr_base 1700 // 1701 // To avoid the costly compare-exchange, the value is tested beforehand (contention hint). 1702 // Several special cases exist to avoid generating unnecessary code. 1703 // 1704 void MacroAssembler::cmpxchgd(ConditionRegister flag, 1705 Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value, 1706 Register addr_base, int semantics, bool cmpxchgx_hint, 1707 Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) { 1708 Label retry; 1709 Label failed_int; 1710 Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int; 1711 Label done; 1712 1713 // Save one branch if result is returned via register and result register is different from the other ones.
1714 bool use_result_reg = (int_flag_success!=noreg); 1715 bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() && 1716 int_flag_success!=exchange_value && int_flag_success!=addr_base); 1717 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1718 assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both"); 1719 1720 if (use_result_reg && preset_result_reg) { 1721 li(int_flag_success, 0); // preset (assume cas failed) 1722 } 1723 1724 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1725 if (contention_hint) { // Don't try to reserve if cmp fails. 1726 ld(dest_current_value, 0, addr_base); 1727 cmpd(flag, compare_value, dest_current_value); 1728 bne(flag, failed); 1729 } 1730 1731 // release/fence semantics 1732 if (semantics & MemBarRel) { 1733 release(); 1734 } 1735 1736 // atomic emulation loop 1737 bind(retry); 1738 1739 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1740 cmpd(flag, compare_value, dest_current_value); 1741 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1742 bne_predict_not_taken(flag, failed); 1743 } else { 1744 bne( flag, failed); 1745 } 1746 1747 stdcx_(exchange_value, addr_base); 1748 if (!weak || use_result_reg || failed_ext) { 1749 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1750 bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1751 } else { 1752 bne( CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1753 } 1754 } 1755 1756 // result in register (must do this at the end because int_flag_success can be the same register as one above) 1757 if (use_result_reg) { 1758 li(int_flag_success, 1); 1759 } 1760 1761 if (semantics & MemBarFenceAfter) { 1762 fence(); 1763 } else if (semantics & MemBarAcq) { 1764 isync(); 1765 } 1766 1767 if (use_result_reg && !preset_result_reg) { 1768 b(done); 1769 } 1770 1771 bind(failed_int); 1772 if (use_result_reg && !preset_result_reg) { 1773 li(int_flag_success, 0); 1774 } 1775 1776 bind(done); 1777 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1778 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1779 } 1780 1781 // Look up the method for a megamorphic invokeinterface call. 1782 // The target method is determined by <intf_klass, itable_index>. 1783 // The receiver klass is in recv_klass. 1784 // On success, the result will be in method_result, and execution falls through. 1785 // On failure, execution transfers to the given label. 1786 void MacroAssembler::lookup_interface_method(Register recv_klass, 1787 Register intf_klass, 1788 RegisterOrConstant itable_index, 1789 Register method_result, 1790 Register scan_temp, 1791 Register sethi_temp, 1792 Label& L_no_such_interface) { 1793 assert_different_registers(recv_klass, intf_klass, method_result, scan_temp); 1794 assert(itable_index.is_constant() || itable_index.as_register() == method_result, 1795 "caller must use same register for non-constant itable index as for method"); 1796 1797 // Compute start of first itableOffsetEntry (which is at the end of the vtable). 
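  // Illustrative sketch (hypothetical C, field names assumed) of the scan start
  // computed below, with the itableOffsetEntries following the embedded vtable:
  //   scan_temp = (address)recv_klass + vtable_start_offset
  //               + recv_klass->vtable_length * vtable_entry_size;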
1798 int vtable_base = in_bytes(Klass::vtable_start_offset()); 1799 int itentry_off = itableMethodEntry::method_offset_in_bytes(); 1800 int logMEsize = exact_log2(itableMethodEntry::size() * wordSize); 1801 int scan_step = itableOffsetEntry::size() * wordSize; 1802 int log_vte_size= exact_log2(vtableEntry::size_in_bytes()); 1803 1804 lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass); 1805 // %%% We should store the aligned, prescaled offset in the klassoop. 1806 // Then the next several instructions would fold away. 1807 1808 sldi(scan_temp, scan_temp, log_vte_size); 1809 addi(scan_temp, scan_temp, vtable_base); 1810 add(scan_temp, recv_klass, scan_temp); 1811 1812 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 1813 if (itable_index.is_register()) { 1814 Register itable_offset = itable_index.as_register(); 1815 sldi(itable_offset, itable_offset, logMEsize); 1816 if (itentry_off) addi(itable_offset, itable_offset, itentry_off); 1817 add(recv_klass, itable_offset, recv_klass); 1818 } else { 1819 long itable_offset = (long)itable_index.as_constant(); 1820 load_const_optimized(sethi_temp, (itable_offset<<logMEsize)+itentry_off); // static address, no relocation 1821 add(recv_klass, sethi_temp, recv_klass); 1822 } 1823 1824 // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) { 1825 // if (scan->interface() == intf) { 1826 // result = (klass + scan->offset() + itable_index); 1827 // } 1828 // } 1829 Label search, found_method; 1830 1831 for (int peel = 1; peel >= 0; peel--) { 1832 // %%%% Could load both offset and interface in one ldx, if they were 1833 // in the opposite order. This would save a load. 1834 ld(method_result, itableOffsetEntry::interface_offset_in_bytes(), scan_temp); 1835 1836 // Check that this entry is non-null. A null entry means that 1837 // the receiver class doesn't implement the interface, and wasn't the 1838 // same as when the caller was compiled. 1839 cmpd(CCR0, method_result, intf_klass); 1840 1841 if (peel) { 1842 beq(CCR0, found_method); 1843 } else { 1844 bne(CCR0, search); 1845 // (invert the test to fall through to found_method...) 1846 } 1847 1848 if (!peel) break; 1849 1850 bind(search); 1851 1852 cmpdi(CCR0, method_result, 0); 1853 beq(CCR0, L_no_such_interface); 1854 addi(scan_temp, scan_temp, scan_step); 1855 } 1856 1857 bind(found_method); 1858 1859 // Got a hit. 
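  // Roughly (illustrative C sketch; recv_klass was pre-adjusted above by the
  // scaled itable_index, so only the interface's offset remains to be added):
  //   method_result = *(Method**)((address)recv_klass + scan->offset());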
1860 int ito_offset = itableOffsetEntry::offset_offset_in_bytes(); 1861 lwz(scan_temp, ito_offset, scan_temp); 1862 ldx(method_result, scan_temp, recv_klass); 1863 } 1864 1865 // virtual method calling 1866 void MacroAssembler::lookup_virtual_method(Register recv_klass, 1867 RegisterOrConstant vtable_index, 1868 Register method_result) { 1869 1870 assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg()); 1871 1872 const int base = in_bytes(Klass::vtable_start_offset()); 1873 assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 1874 1875 if (vtable_index.is_register()) { 1876 sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord); 1877 add(recv_klass, vtable_index.as_register(), recv_klass); 1878 } else { 1879 addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord); 1880 } 1881 ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass); 1882 } 1883 1884 /////////////////////////////////////////// subtype checking //////////////////////////////////////////// 1885 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 1886 Register super_klass, 1887 Register temp1_reg, 1888 Register temp2_reg, 1889 Label* L_success, 1890 Label* L_failure, 1891 Label* L_slow_path, 1892 RegisterOrConstant super_check_offset) { 1893 1894 const Register check_cache_offset = temp1_reg; 1895 const Register cached_super = temp2_reg; 1896 1897 assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super); 1898 1899 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1900 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1901 1902 bool must_load_sco = (super_check_offset.constant_or_zero() == -1); 1903 bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset); 1904 1905 Label L_fallthrough; 1906 int label_nulls = 0; 1907 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 1908 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 1909 if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; } 1910 assert(label_nulls <= 1 || 1911 (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path), 1912 "at most one NULL in the batch, usually"); 1913 1914 // If the pointers are equal, we are done (e.g., String[] elements). 1915 // This self-check enables sharing of secondary supertype arrays among 1916 // non-primary types such as array-of-interface. Otherwise, each such 1917 // type would need its own customized SSA. 1918 // We move this check to the front of the fast path because many 1919 // type checks are in fact trivially successful in this manner, 1920 // so we get a nicely predicted branch right at the start of the check. 1921 cmpd(CCR0, sub_klass, super_klass); 1922 beq(CCR0, *L_success); 1923 1924 // Check the supertype display: 1925 if (must_load_sco) { 1926 // The super check offset is always positive... 1927 lwz(check_cache_offset, sco_offset, super_klass); 1928 super_check_offset = RegisterOrConstant(check_cache_offset); 1929 // super_check_offset is register. 1930 assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register()); 1931 } 1932 // The loaded value is the offset from KlassOopDesc. 1933 1934 ld(cached_super, super_check_offset, sub_klass); 1935 cmpd(CCR0, cached_super, super_klass); 1936 1937 // This check has worked decisively for primary supers. 
1938 // Secondary supers are sought in the super_cache ('super_cache_addr'). 1939 // (Secondary supers are interfaces and very deeply nested subtypes.) 1940 // This works in the same check above because of a tricky aliasing 1941 // between the super_cache and the primary super display elements. 1942 // (The 'super_check_addr' can address either, as the case requires.) 1943 // Note that the cache is updated below if it does not help us find 1944 // what we need immediately. 1945 // So if it was a primary super, we can just fail immediately. 1946 // Otherwise, it's the slow path for us (no success at this point). 1947 1948 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); } 1949 1950 if (super_check_offset.is_register()) { 1951 beq(CCR0, *L_success); 1952 cmpwi(CCR0, super_check_offset.as_register(), sc_offset); 1953 if (L_failure == &L_fallthrough) { 1954 beq(CCR0, *L_slow_path); 1955 } else { 1956 bne(CCR0, *L_failure); 1957 FINAL_JUMP(*L_slow_path); 1958 } 1959 } else { 1960 if (super_check_offset.as_constant() == sc_offset) { 1961 // Need a slow path; fast failure is impossible. 1962 if (L_slow_path == &L_fallthrough) { 1963 beq(CCR0, *L_success); 1964 } else { 1965 bne(CCR0, *L_slow_path); 1966 FINAL_JUMP(*L_success); 1967 } 1968 } else { 1969 // No slow path; it's a fast decision. 1970 if (L_failure == &L_fallthrough) { 1971 beq(CCR0, *L_success); 1972 } else { 1973 bne(CCR0, *L_failure); 1974 FINAL_JUMP(*L_success); 1975 } 1976 } 1977 } 1978 1979 bind(L_fallthrough); 1980 #undef FINAL_JUMP 1981 } 1982 1983 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 1984 Register super_klass, 1985 Register temp1_reg, 1986 Register temp2_reg, 1987 Label* L_success, 1988 Register result_reg) { 1989 const Register array_ptr = temp1_reg; // current value from cache array 1990 const Register temp = temp2_reg; 1991 1992 assert_different_registers(sub_klass, super_klass, array_ptr, temp); 1993 1994 int source_offset = in_bytes(Klass::secondary_supers_offset()); 1995 int target_offset = in_bytes(Klass::secondary_super_cache_offset()); 1996 1997 int length_offset = Array<Klass*>::length_offset_in_bytes(); 1998 int base_offset = Array<Klass*>::base_offset_in_bytes(); 1999 2000 Label hit, loop, failure, fallthru; 2001 2002 ld(array_ptr, source_offset, sub_klass); 2003 2004 // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated."); 2005 lwz(temp, length_offset, array_ptr); 2006 cmpwi(CCR0, temp, 0); 2007 beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0 2008 2009 mtctr(temp); // load ctr 2010 2011 bind(loop); 2012 // Oops in table are NO MORE compressed. 
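  // The loop below corresponds roughly to (illustrative C sketch):
  //   for (int i = 0; i < length; i++, array_ptr += BytesPerWord) {
  //     if (*(Klass**)((address)array_ptr + base_offset) == super_klass) goto hit;
  //   }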
2013 ld(temp, base_offset, array_ptr); 2014 cmpd(CCR0, temp, super_klass); 2015 beq(CCR0, hit); 2016 addi(array_ptr, array_ptr, BytesPerWord); 2017 bdnz(loop); 2018 2019 bind(failure); 2020 if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss) 2021 b(fallthru); 2022 2023 bind(hit); 2024 std(super_klass, target_offset, sub_klass); // save result to cache 2025 if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit) 2026 if (L_success != NULL) { b(*L_success); } 2027 else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided 2028 2029 bind(fallthru); 2030 } 2031 2032 // Try fast path, then go to slow one if not successful 2033 void MacroAssembler::check_klass_subtype(Register sub_klass, 2034 Register super_klass, 2035 Register temp1_reg, 2036 Register temp2_reg, 2037 Label& L_success) { 2038 Label L_failure; 2039 check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure); 2040 check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success); 2041 bind(L_failure); // Fallthru if not successful. 2042 } 2043 2044 void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg, 2045 Register temp_reg, 2046 Label& wrong_method_type) { 2047 assert_different_registers(mtype_reg, mh_reg, temp_reg); 2048 // Compare method type against that of the receiver. 2049 load_heap_oop_not_null(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg); 2050 cmpd(CCR0, temp_reg, mtype_reg); 2051 bne(CCR0, wrong_method_type); 2052 } 2053 2054 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot, 2055 Register temp_reg, 2056 int extra_slot_offset) { 2057 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 2058 int stackElementSize = Interpreter::stackElementSize; 2059 int offset = extra_slot_offset * stackElementSize; 2060 if (arg_slot.is_constant()) { 2061 offset += arg_slot.as_constant() * stackElementSize; 2062 return offset; 2063 } else { 2064 assert(temp_reg != noreg, "must specify"); 2065 sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize)); 2066 if (offset != 0) 2067 addi(temp_reg, temp_reg, offset); 2068 return temp_reg; 2069 } 2070 } 2071 2072 // Supports temp2_reg = R0. 2073 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg, 2074 Register mark_reg, Register temp_reg, 2075 Register temp2_reg, Label& done, Label* slow_case) { 2076 assert(UseBiasedLocking, "why call this otherwise?"); 2077 2078 #ifdef ASSERT 2079 assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg); 2080 #endif 2081 2082 Label cas_label; 2083 2084 // Branch to done if fast path fails and no slow_case provided. 2085 Label *slow_case_int = (slow_case != NULL) ? 
slow_case : &done; 2086 2087 // Biased locking 2088 // See whether the lock is currently biased toward our thread and 2089 // whether the epoch is still valid 2090 // Note that the runtime guarantees sufficient alignment of JavaThread 2091 // pointers to allow age to be placed into low bits 2092 assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, 2093 "biased locking makes assumptions about bit layout"); 2094 2095 if (PrintBiasedLockingStatistics) { 2096 load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg); 2097 lwzx(temp_reg, temp2_reg); 2098 addi(temp_reg, temp_reg, 1); 2099 stwx(temp_reg, temp2_reg); 2100 } 2101 2102 andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place); 2103 cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern); 2104 bne(cr_reg, cas_label); 2105 2106 load_klass(temp_reg, obj_reg); 2107 2108 load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place)); 2109 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 2110 orr(temp_reg, R16_thread, temp_reg); 2111 xorr(temp_reg, mark_reg, temp_reg); 2112 andr(temp_reg, temp_reg, temp2_reg); 2113 cmpdi(cr_reg, temp_reg, 0); 2114 if (PrintBiasedLockingStatistics) { 2115 Label l; 2116 bne(cr_reg, l); 2117 load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr()); 2118 lwzx(mark_reg, temp2_reg); 2119 addi(mark_reg, mark_reg, 1); 2120 stwx(mark_reg, temp2_reg); 2121 // restore mark_reg 2122 ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg); 2123 bind(l); 2124 } 2125 beq(cr_reg, done); 2126 2127 Label try_revoke_bias; 2128 Label try_rebias; 2129 2130 // At this point we know that the header has the bias pattern and 2131 // that we are not the bias owner in the current epoch. We need to 2132 // figure out more details about the state of the header in order to 2133 // know what operations can be legally performed on the object's 2134 // header. 2135 2136 // If the low three bits in the xor result aren't clear, that means 2137 // the prototype header is no longer biased and we have to revoke 2138 // the bias on this object. 2139 andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 2140 cmpwi(cr_reg, temp2_reg, 0); 2141 bne(cr_reg, try_revoke_bias); 2142 2143 // Biasing is still enabled for this data type. See whether the 2144 // epoch of the current bias is still valid, meaning that the epoch 2145 // bits of the mark word are equal to the epoch bits of the 2146 // prototype header. (Note that the prototype header's epoch bits 2147 // only change at a safepoint.) If not, attempt to rebias the object 2148 // toward the current thread. Note that we must be absolutely sure 2149 // that the current epoch is invalid in order to do this because 2150 // otherwise the manipulations it performs on the mark word are 2151 // illegal. 2152 2153 int shift_amount = 64 - markOopDesc::epoch_shift; 2154 // rotate epoch bits to right (little) end and set other bits to 0 2155 // [ big part | epoch | little part ] -> [ 0..0 | epoch ] 2156 rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits); 2157 // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented 2158 bne(CCR0, try_rebias); 2159 2160 // The epoch of the current bias is still valid but we know nothing 2161 // about the owner; it might be set or it might be clear. Try to 2162 // acquire the bias of the object using an atomic operation. 
If this 2163 // fails we will go into the runtime to revoke the object's bias. 2164 // Note that we first construct the presumed unbiased header so we 2165 // don't accidentally blow away another thread's valid bias. 2166 andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place | 2167 markOopDesc::age_mask_in_place | 2168 markOopDesc::epoch_mask_in_place)); 2169 orr(temp_reg, R16_thread, mark_reg); 2170 2171 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2172 2173 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg). 2174 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 2175 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 2176 /*where=*/obj_reg, 2177 MacroAssembler::MemBarAcq, 2178 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2179 noreg, slow_case_int); // bail out if failed 2180 2181 // If the biasing toward our thread failed, this means that 2182 // another thread succeeded in biasing it toward itself and we 2183 // need to revoke that bias. The revocation will occur in the 2184 // interpreter runtime in the slow case. 2185 if (PrintBiasedLockingStatistics) { 2186 load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg); 2187 lwzx(temp_reg, temp2_reg); 2188 addi(temp_reg, temp_reg, 1); 2189 stwx(temp_reg, temp2_reg); 2190 } 2191 b(done); 2192 2193 bind(try_rebias); 2194 // At this point we know the epoch has expired, meaning that the 2195 // current "bias owner", if any, is actually invalid. Under these 2196 // circumstances _only_, we are allowed to use the current header's 2197 // value as the comparison value when doing the cas to acquire the 2198 // bias in the current epoch. In other words, we allow transfer of 2199 // the bias from one thread to another directly in this situation. 2200 load_klass(temp_reg, obj_reg); 2201 andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place); 2202 orr(temp2_reg, R16_thread, temp2_reg); 2203 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 2204 orr(temp_reg, temp2_reg, temp_reg); 2205 2206 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2207 2208 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 2209 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 2210 /*where=*/obj_reg, 2211 MacroAssembler::MemBarAcq, 2212 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2213 noreg, slow_case_int); // bail out if failed 2214 2215 // If the biasing toward our thread failed, this means that 2216 // another thread succeeded in biasing it toward itself and we 2217 // need to revoke that bias. The revocation will occur in the 2218 // interpreter runtime in the slow case. 2219 if (PrintBiasedLockingStatistics) { 2220 load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg); 2221 lwzx(temp_reg, temp2_reg); 2222 addi(temp_reg, temp_reg, 1); 2223 stwx(temp_reg, temp2_reg); 2224 } 2225 b(done); 2226 2227 bind(try_revoke_bias); 2228 // The prototype mark in the klass doesn't have the bias bit set any 2229 // more, indicating that objects of this data type are not supposed 2230 // to be biased any more. We are going to try to reset the mark of 2231 // this object to the prototype value and fall through to the 2232 // CAS-based locking scheme. Note that if our CAS fails, it means 2233 // that another thread raced us for the privilege of revoking the 2234 // bias of this particular object, so it's okay to continue in the 2235 // normal locking code.
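  // Conceptually (illustrative sketch, not the emitted instructions):
  //   CAS(&obj->_mark, mark_reg, klass->prototype_header | (mark_reg & age_mask));
  // i.e. swing the mark word back to the unbiased prototype while preserving
  // the object's age bits.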
2236 load_klass(temp_reg, obj_reg); 2237 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 2238 andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place); 2239 orr(temp_reg, temp_reg, temp2_reg); 2240 2241 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2242 2243 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg). 2244 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 2245 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 2246 /*where=*/obj_reg, 2247 MacroAssembler::MemBarAcq, 2248 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2249 2250 // reload markOop in mark_reg before continuing with lightweight locking 2251 ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg); 2252 2253 // Fall through to the normal CAS-based lock, because no matter what 2254 // the result of the above CAS, some thread must have succeeded in 2255 // removing the bias bit from the object's header. 2256 if (PrintBiasedLockingStatistics) { 2257 Label l; 2258 bne(cr_reg, l); 2259 load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg); 2260 lwzx(temp_reg, temp2_reg); 2261 addi(temp_reg, temp_reg, 1); 2262 stwx(temp_reg, temp2_reg); 2263 bind(l); 2264 } 2265 2266 bind(cas_label); 2267 } 2268 2269 void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) { 2270 // Check for biased locking unlock case, which is a no-op 2271 // Note: we do not have to check the thread ID for two reasons. 2272 // First, the interpreter checks for IllegalMonitorStateException at 2273 // a higher level. Second, if the bias was revoked while we held the 2274 // lock, the object could not be rebiased toward another thread, so 2275 // the bias bit would be clear. 
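  // In effect (illustrative C sketch): the unlock is a no-op iff
  //   (obj->_mark & biased_lock_mask) == biased_lock_pattern   // low bits 101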
2276 2277 ld(temp_reg, 0, mark_addr); 2278 andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 2279 2280 cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern); 2281 beq(cr_reg, done); 2282 } 2283 2284 // allocation (for C1) 2285 void MacroAssembler::eden_allocate( 2286 Register obj, // result: pointer to object after successful allocation 2287 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2288 int con_size_in_bytes, // object size in bytes if known at compile time 2289 Register t1, // temp register 2290 Register t2, // temp register 2291 Label& slow_case // continuation point if fast allocation fails 2292 ) { 2293 b(slow_case); 2294 } 2295 2296 void MacroAssembler::tlab_allocate( 2297 Register obj, // result: pointer to object after successful allocation 2298 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2299 int con_size_in_bytes, // object size in bytes if known at compile time 2300 Register t1, // temp register 2301 Label& slow_case // continuation point if fast allocation fails 2302 ) { 2303 // make sure arguments make sense 2304 assert_different_registers(obj, var_size_in_bytes, t1); 2305 assert(0 <= con_size_in_bytes && is_simm13(con_size_in_bytes), "illegal object size"); 2306 assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment"); 2307 2308 const Register new_top = t1; 2309 //verify_tlab(); not implemented 2310 2311 ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2312 ld(R0, in_bytes(JavaThread::tlab_current_end_offset()), R16_thread); 2313 if (var_size_in_bytes == noreg) { 2314 addi(new_top, obj, con_size_in_bytes); 2315 } else { 2316 add(new_top, obj, var_size_in_bytes); 2317 } 2318 cmpld(CCR0, new_top, R0); 2319 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case); 2320 2321 #ifdef ASSERT 2322 // make sure new free pointer is properly aligned 2323 { 2324 Label L; 2325 andi_(R0, new_top, MinObjAlignmentInBytesMask); 2326 beq(CCR0, L); 2327 stop("updated TLAB free is not properly aligned", 0x934); 2328 bind(L); 2329 } 2330 #endif // ASSERT 2331 2332 // update the tlab top pointer 2333 std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2334 //verify_tlab(); not implemented 2335 } 2336 void MacroAssembler::tlab_refill(Label& retry_tlab, Label& try_eden, Label& slow_case) { 2337 unimplemented("tlab_refill"); 2338 } 2339 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) { 2340 unimplemented("incr_allocated_bytes"); 2341 } 2342 2343 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset, 2344 int insts_call_instruction_offset, Register Rtoc) { 2345 // Start the stub. 2346 address stub = start_a_stub(64); 2347 if (stub == NULL) { return NULL; } // CodeCache full: bail out 2348 2349 // Create a trampoline stub relocation which relates this trampoline stub 2350 // with the call instruction at insts_call_instruction_offset in the 2351 // instructions code-section. 2352 relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset)); 2353 const int stub_start_offset = offset(); 2354 2355 // For java_to_interp stubs we use R11_scratch1 as scratch register 2356 // and in call trampoline stubs we use R12_scratch2. This way we 2357 // can distinguish them (see is_NativeCallTrampolineStub_at()). 
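  // Schematically, the stub emitted below is (illustrative; the addis/addi pair
  // to materialize the TOC only appears if no TOC register was passed in):
  //   addis/addi R12, R29_TOC, <toc_displacement>
  //   ld         R12, <destination_toc_offset>(TOC)   // load call target
  //   mtctr      R12
  //   bctr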
2358 Register reg_scratch = R12_scratch2; 2359 2360 // Now, create the trampoline stub's code: 2361 // - load the TOC 2362 // - load the call target from the constant pool 2363 // - call 2364 if (Rtoc == noreg) { 2365 calculate_address_from_global_toc(reg_scratch, method_toc()); 2366 Rtoc = reg_scratch; 2367 } 2368 2369 ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false); 2370 mtctr(reg_scratch); 2371 bctr(); 2372 2373 const address stub_start_addr = addr_at(stub_start_offset); 2374 2375 // Assert that the encoded destination_toc_offset can be identified and that it is correct. 2376 assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(), 2377 "encoded offset into the constant pool must match"); 2378 // Trampoline_stub_size should be good. 2379 assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size"); 2380 assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline"); 2381 2382 // End the stub. 2383 end_a_stub(); 2384 return stub; 2385 } 2386 2387 // TM on PPC64. 2388 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) { 2389 Label retry; 2390 bind(retry); 2391 ldarx(result, addr, /*hint*/ false); 2392 addi(result, result, simm16); 2393 stdcx_(result, addr); 2394 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2395 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2396 } else { 2397 bne( CCR0, retry); // stXcx_ sets CCR0 2398 } 2399 } 2400 2401 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) { 2402 Label retry; 2403 bind(retry); 2404 lwarx(result, addr, /*hint*/ false); 2405 ori(result, result, uimm16); 2406 stwcx_(result, addr); 2407 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2408 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2409 } else { 2410 bne( CCR0, retry); // stXcx_ sets CCR0 2411 } 2412 } 2413 2414 #if INCLUDE_RTM_OPT 2415 2416 // Update rtm_counters based on abort status 2417 // input: abort_status 2418 // rtm_counters (RTMLockingCounters*) 2419 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) { 2420 // Mapping to keep PreciseRTMLockingStatistics similar to x86. 2421 // x86 ppc (! means inverted, ? means not the same) 2422 // 0 31 Set if abort caused by XABORT instruction. 2423 // 1 ! 7 If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set. 2424 // 2 13 Set if another logical processor conflicted with a memory address that was part of the transaction that aborted. 2425 // 3 10 Set if an internal buffer overflowed. 2426 // 4 ?12 Set if a debug breakpoint was hit. 2427 // 5 ?32 Set if an abort occurred during execution of a nested transaction. 2428 const int tm_failure_bit[] = {Assembler::tm_tabort, // Note: Seems like signal handler sets this, too. 2429 Assembler::tm_failure_persistent, // inverted: transient 2430 Assembler::tm_trans_cf, 2431 Assembler::tm_footprint_of, 2432 Assembler::tm_non_trans_cf, 2433 Assembler::tm_suspended}; 2434 const bool tm_failure_inv[] = {false, true, false, false, false, false}; 2435 assert(sizeof(tm_failure_bit)/sizeof(int) == RTMLockingCounters::ABORT_STATUS_LIMIT, "adapt mapping!"); 2436 2437 const Register addr_Reg = R0; 2438 // Keep track of offset to where rtm_counters_Reg had pointed to. 
2439 int counters_offs = RTMLockingCounters::abort_count_offset(); 2440 addi(addr_Reg, rtm_counters_Reg, counters_offs); 2441 const Register temp_Reg = rtm_counters_Reg; 2442 2443 //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically 2444 ldx(temp_Reg, addr_Reg); 2445 addi(temp_Reg, temp_Reg, 1); 2446 stdx(temp_Reg, addr_Reg); 2447 2448 if (PrintPreciseRTMLockingStatistics) { 2449 int counters_offs_delta = RTMLockingCounters::abortX_count_offset() - counters_offs; 2450 2451 //mftexasr(abort_status); done by caller 2452 for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) { 2453 counters_offs += counters_offs_delta; 2454 li(temp_Reg, counters_offs_delta); // can't use addi with R0 2455 add(addr_Reg, addr_Reg, temp_Reg); // point to next counter 2456 counters_offs_delta = sizeof(uintx); 2457 2458 Label check_abort; 2459 rldicr_(temp_Reg, abort_status, tm_failure_bit[i], 0); 2460 if (tm_failure_inv[i]) { 2461 bne(CCR0, check_abort); 2462 } else { 2463 beq(CCR0, check_abort); 2464 } 2465 //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically 2466 ldx(temp_Reg, addr_Reg); 2467 addi(temp_Reg, temp_Reg, 1); 2468 stdx(temp_Reg, addr_Reg); 2469 bind(check_abort); 2470 } 2471 } 2472 li(temp_Reg, -counters_offs); // can't use addi with R0 2473 add(rtm_counters_Reg, addr_Reg, temp_Reg); // restore 2474 } 2475 2476 // Branch if (random & (count-1) != 0), count is 2^n 2477 // tmp and CR0 are killed 2478 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) { 2479 mftb(tmp); 2480 andi_(tmp, tmp, count-1); 2481 bne(CCR0, brLabel); 2482 } 2483 2484 // Perform abort ratio calculation, set no_rtm bit if high ratio. 2485 // input: rtm_counters_Reg (RTMLockingCounters* address) - KILLED 2486 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg, 2487 RTMLockingCounters* rtm_counters, 2488 Metadata* method_data) { 2489 Label L_done, L_check_always_rtm1, L_check_always_rtm2; 2490 2491 if (RTMLockingCalculationDelay > 0) { 2492 // Delay calculation. 2493 ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr()); 2494 cmpdi(CCR0, rtm_counters_Reg, 0); 2495 beq(CCR0, L_done); 2496 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2497 } 2498 // Abort ratio calculation only if abort_count > RTMAbortThreshold. 2499 // Aborted transactions = abort_count * 100 2500 // All transactions = total_count * RTMTotalCountIncrRate 2501 // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio) 2502 ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg); 2503 if (is_simm(RTMAbortThreshold, 16)) { // cmpdi can handle 16bit immediate only. 2504 cmpdi(CCR0, R0, RTMAbortThreshold); 2505 blt(CCR0, L_check_always_rtm2); // reload of rtm_counters_Reg not necessary 2506 } else { 2507 load_const_optimized(rtm_counters_Reg, RTMAbortThreshold); 2508 cmpd(CCR0, R0, rtm_counters_Reg); 2509 blt(CCR0, L_check_always_rtm1); // reload of rtm_counters_Reg required 2510 } 2511 mulli(R0, R0, 100); 2512 2513 const Register tmpReg = rtm_counters_Reg; 2514 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2515 mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16 2516 mulli(tmpReg, tmpReg, RTMAbortRatio); // allowable range: int16 2517 cmpd(CCR0, R0, tmpReg); 2518 blt(CCR0, L_check_always_rtm1); // jump to reload 2519 if (method_data != NULL) { 2520 // Set rtm_state to "no rtm" in MDO. 
2521 // Not using a metadata relocation. Method and Class Loader are kept alive anyway. 2522 // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.) 2523 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2524 atomic_ori_int(R0, tmpReg, NoRTM); 2525 } 2526 b(L_done); 2527 2528 bind(L_check_always_rtm1); 2529 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2530 bind(L_check_always_rtm2); 2531 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2532 int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate; 2533 if (is_simm(thresholdValue, 16)) { // cmpdi can handle 16bit immediate only. 2534 cmpdi(CCR0, tmpReg, thresholdValue); 2535 } else { 2536 load_const_optimized(R0, thresholdValue); 2537 cmpd(CCR0, tmpReg, R0); 2538 } 2539 blt(CCR0, L_done); 2540 if (method_data != NULL) { 2541 // Set rtm_state to "always rtm" in MDO. 2542 // Not using a metadata relocation. See above. 2543 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2544 atomic_ori_int(R0, tmpReg, UseRTM); 2545 } 2546 bind(L_done); 2547 } 2548 2549 // Update counters and perform abort ratio calculation. 2550 // input: abort_status_Reg 2551 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg, 2552 RTMLockingCounters* rtm_counters, 2553 Metadata* method_data, 2554 bool profile_rtm) { 2555 2556 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2557 // Update rtm counters based on state at abort. 2558 // Reads abort_status_Reg, updates flags. 2559 assert_different_registers(abort_status_Reg, temp_Reg); 2560 load_const_optimized(temp_Reg, (address)rtm_counters, R0); 2561 rtm_counters_update(abort_status_Reg, temp_Reg); 2562 if (profile_rtm) { 2563 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2564 rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data); 2565 } 2566 } 2567 2568 // Retry on abort if abort's status indicates non-persistent failure. 2569 // inputs: retry_count_Reg 2570 // : abort_status_Reg 2571 // output: retry_count_Reg decremented by 1 2572 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, 2573 Label& retryLabel, Label* checkRetry) { 2574 Label doneRetry; 2575 rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0); 2576 bne(CCR0, doneRetry); 2577 if (checkRetry) { bind(*checkRetry); } 2578 addic_(retry_count_Reg, retry_count_Reg, -1); 2579 blt(CCR0, doneRetry); 2580 smt_yield(); // Can't use wait(). No permission (SIGILL). 2581 b(retryLabel); 2582 bind(doneRetry); 2583 } 2584 2585 // Spin and retry if lock is busy. 2586 // inputs: owner_addr_Reg (monitor address) 2587 // : retry_count_Reg 2588 // output: retry_count_Reg decremented by 1 2589 // CTR is killed 2590 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) { 2591 Label SpinLoop, doneRetry; 2592 addic_(retry_count_Reg, retry_count_Reg, -1); 2593 blt(CCR0, doneRetry); 2594 2595 if (RTMSpinLoopCount > 1) { 2596 li(R0, RTMSpinLoopCount); 2597 mtctr(R0); 2598 } 2599 2600 bind(SpinLoop); 2601 smt_yield(); // Can't use waitrsv(). No permission (SIGILL). 2602 2603 if (RTMSpinLoopCount > 1) { 2604 bdz(retryLabel); 2605 ld(R0, 0, owner_addr_Reg); 2606 cmpdi(CCR0, R0, 0); 2607 bne(CCR0, SpinLoop); 2608 } 2609 2610 b(retryLabel); 2611 2612 bind(doneRetry); 2613 } 2614 2615 // Use RTM for normal stack locks. 
2616 // Input: objReg (object to lock) 2617 void MacroAssembler::rtm_stack_locking(ConditionRegister flag, 2618 Register obj, Register mark_word, Register tmp, 2619 Register retry_on_abort_count_Reg, 2620 RTMLockingCounters* stack_rtm_counters, 2621 Metadata* method_data, bool profile_rtm, 2622 Label& DONE_LABEL, Label& IsInflated) { 2623 assert(UseRTMForStackLocks, "why call this otherwise?"); 2624 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); 2625 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2626 2627 if (RTMRetryCount > 0) { 2628 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 2629 bind(L_rtm_retry); 2630 } 2631 andi_(R0, mark_word, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased 2632 bne(CCR0, IsInflated); 2633 2634 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2635 Label L_noincrement; 2636 if (RTMTotalCountIncrRate > 1) { 2637 branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement); 2638 } 2639 assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM"); 2640 load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0); 2641 //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically 2642 ldx(mark_word, tmp); 2643 addi(mark_word, mark_word, 1); 2644 stdx(mark_word, tmp); 2645 bind(L_noincrement); 2646 } 2647 tbegin_(); 2648 beq(CCR0, L_on_abort); 2649 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // Reload in transaction, conflicts need to be tracked. 2650 andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits 2651 cmpwi(flag, R0, markOopDesc::unlocked_value); // bits = 001 unlocked 2652 beq(flag, DONE_LABEL); // all done if unlocked 2653 2654 if (UseRTMXendForLockBusy) { 2655 tend_(); 2656 b(L_decrement_retry); 2657 } else { 2658 tabort_(); 2659 } 2660 bind(L_on_abort); 2661 const Register abort_status_Reg = tmp; 2662 mftexasr(abort_status_Reg); 2663 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2664 rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm); 2665 } 2666 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload 2667 if (RTMRetryCount > 0) { 2668 // Retry on lock abort if abort status is not permanent. 2669 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry); 2670 } else { 2671 bind(L_decrement_retry); 2672 } 2673 } 2674 2675 // Use RTM for inflating locks 2676 // inputs: obj (object to lock) 2677 // mark_word (current header - KILLED) 2678 // boxReg (on-stack box address (displaced header location) - KILLED) 2679 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag, 2680 Register obj, Register mark_word, Register boxReg, 2681 Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg, 2682 RTMLockingCounters* rtm_counters, 2683 Metadata* method_data, bool profile_rtm, 2684 Label& DONE_LABEL) { 2685 assert(UseRTMLocking, "why call this otherwise?"); 2686 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2687 // Clean monitor_value bit to get valid pointer. 2688 int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value; 2689 2690 // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark(). 
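  // In effect (illustrative): box->_displaced_header = box. Any non-NULL value
  // works here; reusing boxReg merely avoids materializing unused_mark().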
2691 std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg); 2692 const Register tmpReg = boxReg; 2693 const Register owner_addr_Reg = mark_word; 2694 addi(owner_addr_Reg, mark_word, owner_offset); 2695 2696 if (RTMRetryCount > 0) { 2697 load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy. 2698 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort. 2699 bind(L_rtm_retry); 2700 } 2701 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2702 Label L_noincrement; 2703 if (RTMTotalCountIncrRate > 1) { 2704 branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement); 2705 } 2706 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2707 load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg); 2708 //atomic_inc_ptr(R0, tmpReg); We don't increment atomically 2709 ldx(tmpReg, R0); 2710 addi(tmpReg, tmpReg, 1); 2711 stdx(tmpReg, R0); 2712 bind(L_noincrement); 2713 } 2714 tbegin_(); 2715 beq(CCR0, L_on_abort); 2716 // We don't reload mark word. Will only be reset at safepoint. 2717 ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked. 2718 cmpdi(flag, R0, 0); 2719 beq(flag, DONE_LABEL); 2720 2721 if (UseRTMXendForLockBusy) { 2722 tend_(); 2723 b(L_decrement_retry); 2724 } else { 2725 tabort_(); 2726 } 2727 bind(L_on_abort); 2728 const Register abort_status_Reg = tmpReg; 2729 mftexasr(abort_status_Reg); 2730 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2731 rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm); 2732 // Restore owner_addr_Reg 2733 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); 2734 #ifdef ASSERT 2735 andi_(R0, mark_word, markOopDesc::monitor_value); 2736 asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint. 2737 #endif 2738 addi(owner_addr_Reg, mark_word, owner_offset); 2739 } 2740 if (RTMRetryCount > 0) { 2741 // Retry on lock abort if abort status is not permanent. 2742 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); 2743 } 2744 2745 // Appears unlocked - try to swing _owner from null to non-null. 2746 cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg, 2747 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2748 MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true); 2749 2750 if (RTMRetryCount > 0) { 2751 // success done else retry 2752 b(DONE_LABEL); 2753 bind(L_decrement_retry); 2754 // Spin and retry if lock is busy. 2755 rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry); 2756 } else { 2757 bind(L_decrement_retry); 2758 } 2759 } 2760 2761 #endif // INCLUDE_RTM_OPT 2762 2763 // "The box" is the space on the stack where we copy the object mark. 2764 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box, 2765 Register temp, Register displaced_header, Register current_header, 2766 bool try_bias, 2767 RTMLockingCounters* rtm_counters, 2768 RTMLockingCounters* stack_rtm_counters, 2769 Metadata* method_data, 2770 bool use_rtm, bool profile_rtm) { 2771 assert_different_registers(oop, box, temp, displaced_header, current_header); 2772 assert(flag != CCR0, "bad condition register"); 2773 Label cont; 2774 Label object_has_monitor; 2775 Label cas_failed; 2776 2777 // Load markOop from object into displaced_header. 
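  // Mark word low lock bits, for reference (see markOopDesc; illustrative):
  //   ..001 = unlocked, ..000 = stack-locked (mark is a BasicLock*),
  //   ..010 = inflated (mark is ObjectMonitor* | monitor_value), ..101 = biased.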
2778 ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop); 2779 2780 2781 // Always do locking in runtime. 2782 if (EmitSync & 0x01) { 2783 cmpdi(flag, oop, 0); // Oop can't be 0 here => always false. 2784 return; 2785 } 2786 2787 if (try_bias) { 2788 biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont); 2789 } 2790 2791 #if INCLUDE_RTM_OPT 2792 if (UseRTMForStackLocks && use_rtm) { 2793 rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header, 2794 stack_rtm_counters, method_data, profile_rtm, 2795 cont, object_has_monitor); 2796 } 2797 #endif // INCLUDE_RTM_OPT 2798 2799 // Handle existing monitor. 2800 if ((EmitSync & 0x02) == 0) { 2801 // The object has an existing monitor iff (mark & monitor_value) != 0. 2802 andi_(temp, displaced_header, markOopDesc::monitor_value); 2803 bne(CCR0, object_has_monitor); 2804 } 2805 2806 // Set displaced_header to be (markOop of object | UNLOCK_VALUE). 2807 ori(displaced_header, displaced_header, markOopDesc::unlocked_value); 2808 2809 // Load Compare Value application register. 2810 2811 // Initialize the box. (Must happen before we update the object mark!) 2812 std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2813 2814 // Must fence, otherwise, preceding store(s) may float below cmpxchg. 2815 // Compare object markOop with mark and if equal exchange scratch1 with object markOop. 2816 cmpxchgd(/*flag=*/flag, 2817 /*current_value=*/current_header, 2818 /*compare_value=*/displaced_header, 2819 /*exchange_value=*/box, 2820 /*where=*/oop, 2821 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2822 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2823 noreg, 2824 &cas_failed, 2825 /*check without membar and ldarx first*/true); 2826 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2827 2828 // If the compare-and-exchange succeeded, then we found an unlocked 2829 // object and we have now locked it. 2830 b(cont); 2831 2832 bind(cas_failed); 2833 // We did not see an unlocked object so try the fast recursive case. 2834 2835 // Check if the owner is self by comparing the value in the markOop of object 2836 // (current_header) with the stack pointer. 2837 sub(current_header, current_header, R1_SP); 2838 load_const_optimized(temp, ~(os::vm_page_size()-1) | markOopDesc::lock_mask_in_place); 2839 2840 and_(R0/*==0?*/, current_header, temp); 2841 // If condition is true we are cont and hence we can store 0 as the 2842 // displaced header in the box, which indicates that it is a recursive lock. 2843 mcrf(flag,CCR0); 2844 std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box); 2845 2846 // Handle existing monitor. 2847 if ((EmitSync & 0x02) == 0) { 2848 b(cont); 2849 2850 bind(object_has_monitor); 2851 // The object's monitor m is unlocked iff m->owner == NULL, 2852 // otherwise m->owner may contain a thread or a stack address. 2853 2854 #if INCLUDE_RTM_OPT 2855 // Use the same RTM locking code in 32- and 64-bit VM. 2856 if (use_rtm) { 2857 rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header, 2858 rtm_counters, method_data, profile_rtm, cont); 2859 } else { 2860 #endif // INCLUDE_RTM_OPT 2861 2862 // Try to CAS m->owner from NULL to current thread. 
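  // Illustrative C sketch of the CAS below:
  //   if (CAS(&monitor->_owner, NULL, R16_thread)) { /* locked, flag == EQ */ }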
2863 addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value); 2864 cmpxchgd(/*flag=*/flag, 2865 /*current_value=*/current_header, 2866 /*compare_value=*/(intptr_t)0, 2867 /*exchange_value=*/R16_thread, 2868 /*where=*/temp, 2869 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2870 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2871 2872 // Store a non-null value into the box. 2873 std(box, BasicLock::displaced_header_offset_in_bytes(), box); 2874 2875 # ifdef ASSERT 2876 bne(flag, cont); 2877 // We have acquired the monitor, check some invariants. 2878 addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes()); 2879 // Invariant 1: _recursions should be 0. 2880 //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size"); 2881 asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp, 2882 "monitor->_recursions should be 0", -1); 2883 # endif 2884 2885 #if INCLUDE_RTM_OPT 2886 } // use_rtm() 2887 #endif 2888 } 2889 2890 bind(cont); 2891 // flag == EQ indicates success 2892 // flag == NE indicates failure 2893 } 2894 2895 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box, 2896 Register temp, Register displaced_header, Register current_header, 2897 bool try_bias, bool use_rtm) { 2898 assert_different_registers(oop, box, temp, displaced_header, current_header); 2899 assert(flag != CCR0, "bad condition register"); 2900 Label cont; 2901 Label object_has_monitor; 2902 2903 // Always do locking in runtime. 2904 if (EmitSync & 0x01) { 2905 cmpdi(flag, oop, 0); // Oop can't be 0 here => always false. 2906 return; 2907 } 2908 2909 if (try_bias) { 2910 biased_locking_exit(flag, oop, current_header, cont); 2911 } 2912 2913 #if INCLUDE_RTM_OPT 2914 if (UseRTMForStackLocks && use_rtm) { 2915 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); 2916 Label L_regular_unlock; 2917 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); // fetch markword 2918 andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits 2919 cmpwi(flag, R0, markOopDesc::unlocked_value); // bits = 001 unlocked 2920 bne(flag, L_regular_unlock); // else RegularLock 2921 tend_(); // otherwise end... 2922 b(cont); // ... and we're done 2923 bind(L_regular_unlock); 2924 } 2925 #endif 2926 2927 // Find the lock address and load the displaced header from the stack. 2928 ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2929 2930 // If the displaced header is 0, we have a recursive unlock. 2931 cmpdi(flag, displaced_header, 0); 2932 beq(flag, cont); 2933 2934 // Handle existing monitor. 2935 if ((EmitSync & 0x02) == 0) { 2936 // The object has an existing monitor iff (mark & monitor_value) != 0. 2937 RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done 2938 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); 2939 andi_(R0, current_header, markOopDesc::monitor_value); 2940 bne(CCR0, object_has_monitor); 2941 } 2942 2943 // Check if it is still a lightweight lock; this is true if we see 2944 // the stack address of the basicLock in the markOop of the object. 2945 // Cmpxchg sets flag to cmpd(current_header, box).
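  // Illustrative sketch: the fast unlock succeeds iff
  //   CAS(&obj->_mark, box, displaced_header)
  // succeeds, i.e. the mark word still points at our stack box and the original
  // (displaced) header can be written back.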
2946 cmpxchgd(/*flag=*/flag, 2947 /*current_value=*/current_header, 2948 /*compare_value=*/box, 2949 /*exchange_value=*/displaced_header, 2950 /*where=*/oop, 2951 MacroAssembler::MemBarRel, 2952 MacroAssembler::cmpxchgx_hint_release_lock(), 2953 noreg, 2954 &cont); 2955 2956 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2957 2958 // Handle existing monitor. 2959 if ((EmitSync & 0x02) == 0) { 2960 b(cont); 2961 2962 bind(object_has_monitor); 2963 addi(current_header, current_header, -markOopDesc::monitor_value); // monitor 2964 ld(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); 2965 2966 // It's inflated. 2967 #if INCLUDE_RTM_OPT 2968 if (use_rtm) { 2969 Label L_regular_inflated_unlock; 2970 // Clean monitor_value bit to get valid pointer 2971 cmpdi(flag, temp, 0); 2972 bne(flag, L_regular_inflated_unlock); 2973 tend_(); 2974 b(cont); 2975 bind(L_regular_inflated_unlock); 2976 } 2977 #endif 2978 2979 ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header); 2980 xorr(temp, R16_thread, temp); // Will be 0 if we are the owner. 2981 orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions. 2982 cmpdi(flag, temp, 0); 2983 bne(flag, cont); 2984 2985 ld(temp, ObjectMonitor::EntryList_offset_in_bytes(), current_header); 2986 ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header); 2987 orr(temp, temp, displaced_header); // Will be 0 if both are 0. 2988 cmpdi(flag, temp, 0); 2989 bne(flag, cont); 2990 release(); 2991 std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); 2992 } 2993 2994 bind(cont); 2995 // flag == EQ indicates success 2996 // flag == NE indicates failure 2997 } 2998 2999 // Write serialization page so VM thread can do a pseudo remote membar. 3000 // We use the current thread pointer to calculate a thread specific 3001 // offset to write to within the page. This minimizes bus traffic 3002 // due to cache line collision. 3003 void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) { 3004 srdi(tmp2, thread, os::get_serialize_page_shift_count()); 3005 3006 int mask = os::vm_page_size() - sizeof(int); 3007 if (Assembler::is_simm(mask, 16)) { 3008 andi(tmp2, tmp2, mask); 3009 } else { 3010 lis(tmp1, (int)((signed short) (mask >> 16))); 3011 ori(tmp1, tmp1, mask & 0x0000ffff); 3012 andr(tmp2, tmp2, tmp1); 3013 } 3014 3015 load_const(tmp1, (long) os::get_memory_serialize_page()); 3016 release(); 3017 stwx(R0, tmp1, tmp2); 3018 } 3019 3020 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp_reg) { 3021 if (SafepointMechanism::uses_thread_local_poll()) { 3022 ld(temp_reg, in_bytes(Thread::polling_page_offset()), R16_thread); 3023 // Armed page has poll_bit set. 3024 andi_(temp_reg, temp_reg, SafepointMechanism::poll_bit()); 3025 } else { 3026 lwz(temp_reg, (RegisterOrConstant)(intptr_t)SafepointSynchronize::address_of_state()); 3027 cmpwi(CCR0, temp_reg, SafepointSynchronize::_not_synchronized); 3028 } 3029 bne(CCR0, slow_path); 3030 } 3031 3032 3033 // GC barrier helper macros 3034 3035 // Write the card table byte if needed. 
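// Illustrative C sketch of the post barrier emitted below (byte-sized cards,
// 0 == dirty; CMS additionally orders the preceding store with StoreStore):
//   byte_map_base[(uintptr_t)store_addr >> card_shift] = 0;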
3036 void MacroAssembler::card_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp) { 3037 CardTableModRefBS* bs = 3038 barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set()); 3039 assert(bs->kind() == BarrierSet::CardTableForRS || 3040 bs->kind() == BarrierSet::CardTableExtension, "wrong barrier"); 3041 #ifdef ASSERT 3042 cmpdi(CCR0, Rnew_val, 0); 3043 asm_assert_ne("null oop not allowed", 0x321); 3044 #endif 3045 card_table_write(bs->byte_map_base, Rtmp, Rstore_addr); 3046 } 3047 3048 // Write the card table byte. 3049 void MacroAssembler::card_table_write(jbyte* byte_map_base, Register Rtmp, Register Robj) { 3050 assert_different_registers(Robj, Rtmp, R0); 3051 load_const_optimized(Rtmp, (address)byte_map_base, R0); 3052 srdi(Robj, Robj, CardTableModRefBS::card_shift); 3053 li(R0, 0); // dirty 3054 if (UseConcMarkSweepGC) membar(Assembler::StoreStore); 3055 stbx(R0, Rtmp, Robj); 3056 } 3057 3058 // Kills R31 if value is a volatile register. 3059 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, bool needs_frame) { 3060 Label done; 3061 cmpdi(CCR0, value, 0); 3062 beq(CCR0, done); // Use NULL as-is. 3063 3064 clrrdi(tmp1, value, JNIHandles::weak_tag_size); 3065 #if INCLUDE_ALL_GCS 3066 if (UseG1GC) { andi_(tmp2, value, JNIHandles::weak_tag_mask); } 3067 #endif 3068 ld(value, 0, tmp1); // Resolve (untagged) jobject. 3069 3070 #if INCLUDE_ALL_GCS 3071 if (UseG1GC) { 3072 Label not_weak; 3073 beq(CCR0, not_weak); // Test for jweak tag. 3074 verify_oop(value); 3075 g1_write_barrier_pre(noreg, // obj 3076 noreg, // offset 3077 value, // pre_val 3078 tmp1, tmp2, needs_frame); 3079 bind(not_weak); 3080 } 3081 #endif // INCLUDE_ALL_GCS 3082 verify_oop(value); 3083 bind(done); 3084 } 3085 3086 #if INCLUDE_ALL_GCS 3087 // General G1 pre-barrier generator. 3088 // Goal: record the previous value if it is not null. 3089 void MacroAssembler::g1_write_barrier_pre(Register Robj, RegisterOrConstant offset, Register Rpre_val, 3090 Register Rtmp1, Register Rtmp2, bool needs_frame) { 3091 Label runtime, filtered; 3092 3093 // Is marking active? 3094 if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) { 3095 lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread); 3096 } else { 3097 guarantee(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption"); 3098 lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread); 3099 } 3100 cmpdi(CCR0, Rtmp1, 0); 3101 beq(CCR0, filtered); 3102 3103 // Do we need to load the previous value? 3104 if (Robj != noreg) { 3105 // Load the previous value... 3106 if (UseCompressedOops) { 3107 lwz(Rpre_val, offset, Robj); 3108 } else { 3109 ld(Rpre_val, offset, Robj); 3110 } 3111 // Previous value has been loaded into Rpre_val. 3112 } 3113 assert(Rpre_val != noreg, "must have a real register"); 3114 3115 // Is the previous value null? 3116 cmpdi(CCR0, Rpre_val, 0); 3117 beq(CCR0, filtered); 3118 3119 if (Robj != noreg && UseCompressedOops) { 3120 decode_heap_oop_not_null(Rpre_val); 3121 } 3122 3123 // OK, it's not filtered, so we'll need to call enqueue. In the normal 3124 // case, pre_val will be a scratch G-reg, but there are some cases in 3125 // which it's an O-reg. In the first case, do a normal call. In the 3126 // latter, do a save here and call the frameless version. 3127 3128 // Can we store original value in the thread's buffer? 3129 // Is index == 0? 
3130 // (The index field is typed as size_t.) 3131 const Register Rbuffer = Rtmp1, Rindex = Rtmp2; 3132 3133 ld(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread); 3134 cmpdi(CCR0, Rindex, 0); 3135 beq(CCR0, runtime); // If index == 0, goto runtime. 3136 ld(Rbuffer, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_buf()), R16_thread); 3137 3138 addi(Rindex, Rindex, -wordSize); // Decrement index. 3139 std(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread); 3140 3141 // Record the previous value. 3142 stdx(Rpre_val, Rbuffer, Rindex); 3143 b(filtered); 3144 3145 bind(runtime); 3146 3147 // May need to preserve LR. Also needed if current frame is not compatible with C calling convention. 3148 if (needs_frame) { 3149 save_LR_CR(Rtmp1); 3150 push_frame_reg_args(0, Rtmp2); 3151 } 3152 3153 if (Rpre_val->is_volatile() && Robj == noreg) mr(R31, Rpre_val); // Save pre_val across C call if it was preloaded. 3154 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), Rpre_val, R16_thread); 3155 if (Rpre_val->is_volatile() && Robj == noreg) mr(Rpre_val, R31); // restore 3156 3157 if (needs_frame) { 3158 pop_frame(); 3159 restore_LR_CR(Rtmp1); 3160 } 3161 3162 bind(filtered); 3163 } 3164 3165 // General G1 post-barrier generator 3166 // Store cross-region card. 3167 void MacroAssembler::g1_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp1, Register Rtmp2, Register Rtmp3, Label *filtered_ext) { 3168 Label runtime, filtered_int; 3169 Label& filtered = (filtered_ext != NULL) ? *filtered_ext : filtered_int; 3170 assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2); 3171 3172 G1SATBCardTableLoggingModRefBS* bs = 3173 barrier_set_cast<G1SATBCardTableLoggingModRefBS>(Universe::heap()->barrier_set()); 3174 3175 // Does store cross heap regions? 3176 if (G1RSBarrierRegionFilter) { 3177 xorr(Rtmp1, Rstore_addr, Rnew_val); 3178 srdi_(Rtmp1, Rtmp1, HeapRegion::LogOfHRGrainBytes); 3179 beq(CCR0, filtered); 3180 } 3181 3182 // Crosses regions, storing NULL? 3183 #ifdef ASSERT 3184 cmpdi(CCR0, Rnew_val, 0); 3185 asm_assert_ne("null oop not allowed (G1)", 0x322); // Checked by caller on PPC64, so following branch is obsolete: 3186 //beq(CCR0, filtered); 3187 #endif 3188 3189 // Storing region crossing non-NULL, is card already dirty? 3190 assert(sizeof(*bs->byte_map_base) == sizeof(jbyte), "adjust this code"); 3191 const Register Rcard_addr = Rtmp1; 3192 Register Rbase = Rtmp2; 3193 load_const_optimized(Rbase, (address)bs->byte_map_base, /*temp*/ Rtmp3); 3194 3195 srdi(Rcard_addr, Rstore_addr, CardTableModRefBS::card_shift); 3196 3197 // Get the address of the card. 3198 lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr); 3199 cmpwi(CCR0, Rtmp3, (int)G1SATBCardTableModRefBS::g1_young_card_val()); 3200 beq(CCR0, filtered); 3201 3202 membar(Assembler::StoreLoad); 3203 lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr); // Reload after membar. 3204 cmpwi(CCR0, Rtmp3 /* card value */, CardTableModRefBS::dirty_card_val()); 3205 beq(CCR0, filtered); 3206 3207 // Storing a region crossing, non-NULL oop, card is clean. 3208 // Dirty card and log. 3209 li(Rtmp3, CardTableModRefBS::dirty_card_val()); 3210 //release(); // G1: oops are allowed to get visible after dirty marking. 3211 stbx(Rtmp3, Rbase, Rcard_addr); 3212 3213 add(Rcard_addr, Rbase, Rcard_addr); // This is the address which needs to get enqueued. 
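  // Fast path of the dirty card queue enqueue that follows, sketched in C:
  //   if (index == 0) { g1_wb_post_runtime(card_addr, thread); }  // buffer full
  //   else            { index -= wordSize; *(buf + index) = card_addr; }
  // (g1_wb_post_runtime is illustrative shorthand for the call_VM_leaf below.)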
3214   Rbase = noreg; // end of lifetime
3215
3216   const Register Rqueue_index = Rtmp2,
3217                  Rqueue_buf   = Rtmp3;
3218   ld(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);
3219   cmpdi(CCR0, Rqueue_index, 0);
3220   beq(CCR0, runtime); // If index == 0, branch to runtime.
3221   ld(Rqueue_buf, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_buf()), R16_thread);
3222
3223   addi(Rqueue_index, Rqueue_index, -wordSize); // Decrement index.
3224   std(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);
3225
3226   stdx(Rcard_addr, Rqueue_buf, Rqueue_index); // Store card.
3227   b(filtered);
3228
3229   bind(runtime);
3230
3231   // Save the live input values.
3232   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), Rcard_addr, R16_thread);
3233
3234   bind(filtered_int);
3235 }
3236 #endif // INCLUDE_ALL_GCS
3237
3238 // Values for last_Java_pc and last_Java_sp must comply with the rules
3239 // in frame_ppc.hpp.
3240 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
3241   // Always set last_Java_pc and flags first because once last_Java_sp
3242   // is visible, has_last_Java_frame is true and users will look at the
3243   // rest of the fields. (Note: flags should always be zero before we
3244   // get here, so they don't need to be set.)
3245
3246   // Verify that last_Java_pc was zeroed on return to Java.
3247   asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
3248                           "last_Java_pc not zeroed before leaving Java", 0x200);
3249
3250   // When returning from a call-out from Java, the frame anchor's
3251   // last_Java_pc will always have been reset to NULL. It is set here so
3252   // that, if we are doing a call to native (not VM) code, we capture the
3253   // known pc and don't have to rely on the native call having standard
3254   // frame linkage from which the pc could be recovered.
3255   if (last_Java_pc != noreg)
3256     std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3257
3258   // Set last_Java_sp last.
3259   std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3260 }
3261
3262 void MacroAssembler::reset_last_Java_frame(void) {
3263   asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
3264                              R16_thread, "SP was not set, still zero", 0x202);
3265
3266   BLOCK_COMMENT("reset_last_Java_frame {");
3267   li(R0, 0);
3268
3269   // _last_Java_sp = 0
3270   std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3271
3272   // _last_Java_pc = 0
3273   std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3274   BLOCK_COMMENT("} reset_last_Java_frame");
3275 }
3276
3277 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
3278   assert_different_registers(sp, tmp1);
3279
3280   // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
3281   // TOP_IJAVA_FRAME_ABI.
3282   // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
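  // Note: the pc captured below is the current emission address, so
  // last_Java_pc will point right into this generated code.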
3283 address entry = pc(); 3284 load_const_optimized(tmp1, entry); 3285 3286 set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1); 3287 } 3288 3289 void MacroAssembler::get_vm_result(Register oop_result) { 3290 // Read: 3291 // R16_thread 3292 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3293 // 3294 // Updated: 3295 // oop_result 3296 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3297 3298 verify_thread(); 3299 3300 ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3301 li(R0, 0); 3302 std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3303 3304 verify_oop(oop_result); 3305 } 3306 3307 void MacroAssembler::get_vm_result_2(Register metadata_result) { 3308 // Read: 3309 // R16_thread 3310 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3311 // 3312 // Updated: 3313 // metadata_result 3314 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3315 3316 ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3317 li(R0, 0); 3318 std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3319 } 3320 3321 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) { 3322 Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided. 3323 if (Universe::narrow_klass_base() != 0) { 3324 // Use dst as temp if it is free. 3325 sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0); 3326 current = dst; 3327 } 3328 if (Universe::narrow_klass_shift() != 0) { 3329 srdi(dst, current, Universe::narrow_klass_shift()); 3330 current = dst; 3331 } 3332 return current; 3333 } 3334 3335 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) { 3336 if (UseCompressedClassPointers) { 3337 Register compressedKlass = encode_klass_not_null(ck, klass); 3338 stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop); 3339 } else { 3340 std(klass, oopDesc::klass_offset_in_bytes(), dst_oop); 3341 } 3342 } 3343 3344 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) { 3345 if (UseCompressedClassPointers) { 3346 if (val == noreg) { 3347 val = R0; 3348 li(val, 0); 3349 } 3350 stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed 3351 } 3352 } 3353 3354 int MacroAssembler::instr_size_for_decode_klass_not_null() { 3355 if (!UseCompressedClassPointers) return 0; 3356 int num_instrs = 1; // shift or move 3357 if (Universe::narrow_klass_base() != 0) num_instrs = 7; // shift + load const + add 3358 return num_instrs * BytesPerInstWord; 3359 } 3360 3361 void MacroAssembler::decode_klass_not_null(Register dst, Register src) { 3362 assert(dst != R0, "Dst reg may not be R0, as R0 is used here."); 3363 if (src == noreg) src = dst; 3364 Register shifted_src = src; 3365 if (Universe::narrow_klass_shift() != 0 || 3366 Universe::narrow_klass_base() == 0 && src != dst) { // Move required. 3367 shifted_src = dst; 3368 sldi(shifted_src, src, Universe::narrow_klass_shift()); 3369 } 3370 if (Universe::narrow_klass_base() != 0) { 3371 add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0); 3372 } 3373 } 3374 3375 void MacroAssembler::load_klass(Register dst, Register src) { 3376 if (UseCompressedClassPointers) { 3377 lwz(dst, oopDesc::klass_offset_in_bytes(), src); 3378 // Attention: no null check here! 
3379 decode_klass_not_null(dst, dst); 3380 } else { 3381 ld(dst, oopDesc::klass_offset_in_bytes(), src); 3382 } 3383 } 3384 3385 // ((OopHandle)result).resolve(); 3386 void MacroAssembler::resolve_oop_handle(Register result) { 3387 // OopHandle::resolve is an indirection. 3388 ld(result, 0, result); 3389 } 3390 3391 void MacroAssembler::load_mirror_from_const_method(Register mirror, Register const_method) { 3392 ld(mirror, in_bytes(ConstMethod::constants_offset()), const_method); 3393 ld(mirror, ConstantPool::pool_holder_offset_in_bytes(), mirror); 3394 ld(mirror, in_bytes(Klass::java_mirror_offset()), mirror); 3395 resolve_oop_handle(mirror); 3396 } 3397 3398 // Clear Array 3399 // For very short arrays. tmp == R0 is allowed. 3400 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) { 3401 if (cnt_dwords > 0) { li(tmp, 0); } 3402 for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); } 3403 } 3404 3405 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed. 3406 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) { 3407 if (cnt_dwords < 8) { 3408 clear_memory_unrolled(base_ptr, cnt_dwords, tmp); 3409 return; 3410 } 3411 3412 Label loop; 3413 const long loopcnt = cnt_dwords >> 1, 3414 remainder = cnt_dwords & 1; 3415 3416 li(tmp, loopcnt); 3417 mtctr(tmp); 3418 li(tmp, 0); 3419 bind(loop); 3420 std(tmp, 0, base_ptr); 3421 std(tmp, 8, base_ptr); 3422 addi(base_ptr, base_ptr, 16); 3423 bdnz(loop); 3424 if (remainder) { std(tmp, 0, base_ptr); } 3425 } 3426 3427 // Kills both input registers. tmp == R0 is allowed. 3428 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) { 3429 // Procedure for large arrays (uses data cache block zero instruction). 3430 Label startloop, fast, fastloop, small_rest, restloop, done; 3431 const int cl_size = VM_Version::L1_data_cache_line_size(), 3432 cl_dwords = cl_size >> 3, 3433 cl_dw_addr_bits = exact_log2(cl_dwords), 3434 dcbz_min = 1, // Min count of dcbz executions, needs to be >0. 3435 min_cnt = ((dcbz_min + 1) << cl_dw_addr_bits) - 1; 3436 3437 if (const_cnt >= 0) { 3438 // Constant case. 3439 if (const_cnt < min_cnt) { 3440 clear_memory_constlen(base_ptr, const_cnt, tmp); 3441 return; 3442 } 3443 load_const_optimized(cnt_dwords, const_cnt, tmp); 3444 } else { 3445 // cnt_dwords already loaded in register. Need to check size. 3446 cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included). 3447 blt(CCR1, small_rest); 3448 } 3449 rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line. 3450 beq(CCR0, fast); // Already 128byte aligned. 3451 3452 subfic(tmp, tmp, cl_dwords); 3453 mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords). 3454 subf(cnt_dwords, tmp, cnt_dwords); // rest. 3455 li(tmp, 0); 3456 3457 bind(startloop); // Clear at the beginning to reach 128byte boundary. 3458 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3459 addi(base_ptr, base_ptr, 8); 3460 bdnz(startloop); 3461 3462 bind(fast); // Clear 128byte blocks. 3463 srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0). 3464 andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords. 3465 mtctr(tmp); // Load counter. 3466 3467 bind(fastloop); 3468 dcbz(base_ptr); // Clear 128byte aligned block. 
3469 addi(base_ptr, base_ptr, cl_size); 3470 bdnz(fastloop); 3471 3472 bind(small_rest); 3473 cmpdi(CCR0, cnt_dwords, 0); // size 0? 3474 beq(CCR0, done); // rest == 0 3475 li(tmp, 0); 3476 mtctr(cnt_dwords); // Load counter. 3477 3478 bind(restloop); // Clear rest. 3479 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3480 addi(base_ptr, base_ptr, 8); 3481 bdnz(restloop); 3482 3483 bind(done); 3484 } 3485 3486 /////////////////////////////////////////// String intrinsics //////////////////////////////////////////// 3487 3488 #ifdef COMPILER2 3489 // Intrinsics for CompactStrings 3490 3491 // Compress char[] to byte[] by compressing 16 bytes at once. 3492 void MacroAssembler::string_compress_16(Register src, Register dst, Register cnt, 3493 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, 3494 Label& Lfailure) { 3495 3496 const Register tmp0 = R0; 3497 assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5); 3498 Label Lloop, Lslow; 3499 3500 // Check if cnt >= 8 (= 16 bytes) 3501 lis(tmp1, 0xFF); // tmp1 = 0x00FF00FF00FF00FF 3502 srwi_(tmp2, cnt, 3); 3503 beq(CCR0, Lslow); 3504 ori(tmp1, tmp1, 0xFF); 3505 rldimi(tmp1, tmp1, 32, 0); 3506 mtctr(tmp2); 3507 3508 // 2x unrolled loop 3509 bind(Lloop); 3510 ld(tmp2, 0, src); // _0_1_2_3 (Big Endian) 3511 ld(tmp4, 8, src); // _4_5_6_7 3512 3513 orr(tmp0, tmp2, tmp4); 3514 rldicl(tmp3, tmp2, 6*8, 64-24); // _____1_2 3515 rldimi(tmp2, tmp2, 2*8, 2*8); // _0_2_3_3 3516 rldicl(tmp5, tmp4, 6*8, 64-24); // _____5_6 3517 rldimi(tmp4, tmp4, 2*8, 2*8); // _4_6_7_7 3518 3519 andc_(tmp0, tmp0, tmp1); 3520 bne(CCR0, Lfailure); // Not latin1. 3521 addi(src, src, 16); 3522 3523 rlwimi(tmp3, tmp2, 0*8, 24, 31);// _____1_3 3524 srdi(tmp2, tmp2, 3*8); // ____0_2_ 3525 rlwimi(tmp5, tmp4, 0*8, 24, 31);// _____5_7 3526 srdi(tmp4, tmp4, 3*8); // ____4_6_ 3527 3528 orr(tmp2, tmp2, tmp3); // ____0123 3529 orr(tmp4, tmp4, tmp5); // ____4567 3530 3531 stw(tmp2, 0, dst); 3532 stw(tmp4, 4, dst); 3533 addi(dst, dst, 8); 3534 bdnz(Lloop); 3535 3536 bind(Lslow); // Fallback to slow version 3537 } 3538 3539 // Compress char[] to byte[]. cnt must be positive int. 3540 void MacroAssembler::string_compress(Register src, Register dst, Register cnt, Register tmp, Label& Lfailure) { 3541 Label Lloop; 3542 mtctr(cnt); 3543 3544 bind(Lloop); 3545 lhz(tmp, 0, src); 3546 cmplwi(CCR0, tmp, 0xff); 3547 bgt(CCR0, Lfailure); // Not latin1. 3548 addi(src, src, 2); 3549 stb(tmp, 0, dst); 3550 addi(dst, dst, 1); 3551 bdnz(Lloop); 3552 } 3553 3554 // Inflate byte[] to char[] by inflating 16 bytes at once. 
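// Conceptually, per 8 input bytes of the main loop (a C sketch, not the
// emitted code):
//   for (int i = 0; i < 8; ++i) { ((jchar*)dst)[i] = (jchar)(src[i] & 0xFF); }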
3555 void MacroAssembler::string_inflate_16(Register src, Register dst, Register cnt,
3556                                        Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
3557   const Register tmp0 = R0;
3558   assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
3559   Label Lloop, Lslow;
3560
3561   // Check if cnt >= 8
3562   srwi_(tmp2, cnt, 3);
3563   beq(CCR0, Lslow);
3564   lis(tmp1, 0xFF);                // tmp1 = 0x00FF00FF
3565   ori(tmp1, tmp1, 0xFF);
3566   mtctr(tmp2);
3567
3568   // 2x unrolled loop
3569   bind(Lloop);
3570   lwz(tmp2, 0, src);              // ____0123 (Big Endian)
3571   lwz(tmp4, 4, src);              // ____4567
3572   addi(src, src, 8);
3573
3574   rldicl(tmp3, tmp2, 7*8, 64-8);  // _______2
3575   rlwimi(tmp2, tmp2, 3*8, 16, 23);// ____0113
3576   rldicl(tmp5, tmp4, 7*8, 64-8);  // _______6
3577   rlwimi(tmp4, tmp4, 3*8, 16, 23);// ____4557
3578
3579   andc(tmp0, tmp2, tmp1);         // ____0_1_
3580   rlwimi(tmp2, tmp3, 2*8, 0, 23); // _____2_3
3581   andc(tmp3, tmp4, tmp1);         // ____4_5_
3582   rlwimi(tmp4, tmp5, 2*8, 0, 23); // _____6_7
3583
3584   rldimi(tmp2, tmp0, 3*8, 0*8);   // _0_1_2_3
3585   rldimi(tmp4, tmp3, 3*8, 0*8);   // _4_5_6_7
3586
3587   std(tmp2, 0, dst);
3588   std(tmp4, 8, dst);
3589   addi(dst, dst, 16);
3590   bdnz(Lloop);
3591
3592   bind(Lslow);                    // Fallback to slow version
3593 }
3594
3595 // Inflate byte[] to char[]. cnt must be positive int.
3596 void MacroAssembler::string_inflate(Register src, Register dst, Register cnt, Register tmp) {
3597   Label Lloop;
3598   mtctr(cnt);
3599
3600   bind(Lloop);
3601   lbz(tmp, 0, src);
3602   addi(src, src, 1);
3603   sth(tmp, 0, dst);
3604   addi(dst, dst, 2);
3605   bdnz(Lloop);
3606 }
3607
3608 void MacroAssembler::string_compare(Register str1, Register str2,
3609                                     Register cnt1, Register cnt2,
3610                                     Register tmp1, Register result, int ae) {
3611   const Register tmp0 = R0,
3612                  diff = tmp1;
3613
3614   assert_different_registers(str1, str2, cnt1, cnt2, tmp0, tmp1, result);
3615   Label Ldone, Lslow, Lloop, Lreturn_diff;
3616
3617   // Note: Making use of the fact that compareTo(a, b) == -compareTo(b, a),
3618   // we interchange str1 and str2 in the UL case and negate the result.
3619   // This way, str1 is always latin1 encoded, except for the UU case. In
3620   // addition, the counts need to be zero-extended (the sign bit is 0 anyway).
3621
3622   if (ae == StrIntrinsicNode::UU) {
3623     srwi(cnt1, cnt1, 1);
3624   } else {
3625     clrldi(cnt1, cnt1, 32);
3626   }
3627
3628   if (ae != StrIntrinsicNode::LL) {
3629     srwi(cnt2, cnt2, 1);
3630   } else {
3631     clrldi(cnt2, cnt2, 32);
3632   }
3633
3634   // See if the lengths are different, and calculate min in cnt1.
3635   // Save diff in case we need it for a tie-breaker.
3636   subf_(diff, cnt2, cnt1); // diff = cnt1 - cnt2
3637   // if (diff > 0) { cnt1 = cnt2; }
3638   if (VM_Version::has_isel()) {
3639     isel(cnt1, CCR0, Assembler::greater, /*invert*/ false, cnt2);
3640   } else {
3641     Label Lskip;
3642     blt(CCR0, Lskip);
3643     mr(cnt1, cnt2);
3644     bind(Lskip);
3645   }
3646
3647   // Rename registers
3648   Register chr1 = result;
3649   Register chr2 = tmp0;
3650
3651   // Compare multiple characters in fast loop (only implemented for same encoding).
3652   int stride1 = 8, stride2 = 8;
3653   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3654     int log2_chars_per_iter = (ae == StrIntrinsicNode::LL) ? 3 : 2;
3655     Label Lfastloop, Lskipfast;
3656
3657     srwi_(tmp0, cnt1, log2_chars_per_iter);
3658     beq(CCR0, Lskipfast);
3659     rldicl(cnt2, cnt1, 0, 64 - log2_chars_per_iter); // Remaining characters.
3660 li(cnt1, 1 << log2_chars_per_iter); // Initialize for failure case: Rescan characters from current iteration. 3661 mtctr(tmp0); 3662 3663 bind(Lfastloop); 3664 ld(chr1, 0, str1); 3665 ld(chr2, 0, str2); 3666 cmpd(CCR0, chr1, chr2); 3667 bne(CCR0, Lslow); 3668 addi(str1, str1, stride1); 3669 addi(str2, str2, stride2); 3670 bdnz(Lfastloop); 3671 mr(cnt1, cnt2); // Remaining characters. 3672 bind(Lskipfast); 3673 } 3674 3675 // Loop which searches the first difference character by character. 3676 cmpwi(CCR0, cnt1, 0); 3677 beq(CCR0, Lreturn_diff); 3678 bind(Lslow); 3679 mtctr(cnt1); 3680 3681 switch (ae) { 3682 case StrIntrinsicNode::LL: stride1 = 1; stride2 = 1; break; 3683 case StrIntrinsicNode::UL: // fallthru (see comment above) 3684 case StrIntrinsicNode::LU: stride1 = 1; stride2 = 2; break; 3685 case StrIntrinsicNode::UU: stride1 = 2; stride2 = 2; break; 3686 default: ShouldNotReachHere(); break; 3687 } 3688 3689 bind(Lloop); 3690 if (stride1 == 1) { lbz(chr1, 0, str1); } else { lhz(chr1, 0, str1); } 3691 if (stride2 == 1) { lbz(chr2, 0, str2); } else { lhz(chr2, 0, str2); } 3692 subf_(result, chr2, chr1); // result = chr1 - chr2 3693 bne(CCR0, Ldone); 3694 addi(str1, str1, stride1); 3695 addi(str2, str2, stride2); 3696 bdnz(Lloop); 3697 3698 // If strings are equal up to min length, return the length difference. 3699 bind(Lreturn_diff); 3700 mr(result, diff); 3701 3702 // Otherwise, return the difference between the first mismatched chars. 3703 bind(Ldone); 3704 if (ae == StrIntrinsicNode::UL) { 3705 neg(result, result); // Negate result (see note above). 3706 } 3707 } 3708 3709 void MacroAssembler::array_equals(bool is_array_equ, Register ary1, Register ary2, 3710 Register limit, Register tmp1, Register result, bool is_byte) { 3711 const Register tmp0 = R0; 3712 assert_different_registers(ary1, ary2, limit, tmp0, tmp1, result); 3713 Label Ldone, Lskiploop, Lloop, Lfastloop, Lskipfast; 3714 bool limit_needs_shift = false; 3715 3716 if (is_array_equ) { 3717 const int length_offset = arrayOopDesc::length_offset_in_bytes(); 3718 const int base_offset = arrayOopDesc::base_offset_in_bytes(is_byte ? T_BYTE : T_CHAR); 3719 3720 // Return true if the same array. 3721 cmpd(CCR0, ary1, ary2); 3722 beq(CCR0, Lskiploop); 3723 3724 // Return false if one of them is NULL. 3725 cmpdi(CCR0, ary1, 0); 3726 cmpdi(CCR1, ary2, 0); 3727 li(result, 0); 3728 cror(CCR0, Assembler::equal, CCR1, Assembler::equal); 3729 beq(CCR0, Ldone); 3730 3731 // Load the lengths of arrays. 3732 lwz(limit, length_offset, ary1); 3733 lwz(tmp0, length_offset, ary2); 3734 3735 // Return false if the two arrays are not equal length. 3736 cmpw(CCR0, limit, tmp0); 3737 bne(CCR0, Ldone); 3738 3739 // Load array addresses. 3740 addi(ary1, ary1, base_offset); 3741 addi(ary2, ary2, base_offset); 3742 } else { 3743 limit_needs_shift = !is_byte; 3744 li(result, 0); // Assume not equal. 3745 } 3746 3747 // Rename registers 3748 Register chr1 = tmp0; 3749 Register chr2 = tmp1; 3750 3751 // Compare 8 bytes per iteration in fast loop. 3752 const int log2_chars_per_iter = is_byte ? 3 : 2; 3753 3754 srwi_(tmp0, limit, log2_chars_per_iter + (limit_needs_shift ? 1 : 0)); 3755 beq(CCR0, Lskipfast); 3756 mtctr(tmp0); 3757 3758 bind(Lfastloop); 3759 ld(chr1, 0, ary1); 3760 ld(chr2, 0, ary2); 3761 addi(ary1, ary1, 8); 3762 addi(ary2, ary2, 8); 3763 cmpd(CCR0, chr1, chr2); 3764 bne(CCR0, Ldone); 3765 bdnz(Lfastloop); 3766 3767 bind(Lskipfast); 3768 rldicl_(limit, limit, limit_needs_shift ? 
64 - 1 : 0, 64 - log2_chars_per_iter); // Remaining characters. 3769 beq(CCR0, Lskiploop); 3770 mtctr(limit); 3771 3772 // Character by character. 3773 bind(Lloop); 3774 if (is_byte) { 3775 lbz(chr1, 0, ary1); 3776 lbz(chr2, 0, ary2); 3777 addi(ary1, ary1, 1); 3778 addi(ary2, ary2, 1); 3779 } else { 3780 lhz(chr1, 0, ary1); 3781 lhz(chr2, 0, ary2); 3782 addi(ary1, ary1, 2); 3783 addi(ary2, ary2, 2); 3784 } 3785 cmpw(CCR0, chr1, chr2); 3786 bne(CCR0, Ldone); 3787 bdnz(Lloop); 3788 3789 bind(Lskiploop); 3790 li(result, 1); // All characters are equal. 3791 bind(Ldone); 3792 } 3793 3794 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt, 3795 Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval, 3796 Register tmp1, Register tmp2, Register tmp3, Register tmp4, int ae) { 3797 3798 // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite! 3799 Label L_TooShort, L_Found, L_NotFound, L_End; 3800 Register last_addr = haycnt, // Kill haycnt at the beginning. 3801 addr = tmp1, 3802 n_start = tmp2, 3803 ch1 = tmp3, 3804 ch2 = R0; 3805 3806 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 3807 const int h_csize = (ae == StrIntrinsicNode::LL) ? 1 : 2; 3808 const int n_csize = (ae == StrIntrinsicNode::UU) ? 2 : 1; 3809 3810 // ************************************************************************************************** 3811 // Prepare for main loop: optimized for needle count >=2, bail out otherwise. 3812 // ************************************************************************************************** 3813 3814 // Compute last haystack addr to use if no match gets found. 3815 clrldi(haycnt, haycnt, 32); // Ensure positive int is valid as 64 bit value. 3816 addi(addr, haystack, -h_csize); // Accesses use pre-increment. 3817 if (needlecntval == 0) { // variable needlecnt 3818 cmpwi(CCR6, needlecnt, 2); 3819 clrldi(needlecnt, needlecnt, 32); // Ensure positive int is valid as 64 bit value. 3820 blt(CCR6, L_TooShort); // Variable needlecnt: handle short needle separately. 3821 } 3822 3823 if (n_csize == 2) { lwz(n_start, 0, needle); } else { lhz(n_start, 0, needle); } // Load first 2 characters of needle. 3824 3825 if (needlecntval == 0) { // variable needlecnt 3826 subf(ch1, needlecnt, haycnt); // Last character index to compare is haycnt-needlecnt. 3827 addi(needlecnt, needlecnt, -2); // Rest of needle. 3828 } else { // constant needlecnt 3829 guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately"); 3830 assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate"); 3831 addi(ch1, haycnt, -needlecntval); // Last character index to compare is haycnt-needlecnt. 3832 if (needlecntval > 3) { li(needlecnt, needlecntval - 2); } // Rest of needle. 3833 } 3834 3835 if (h_csize == 2) { slwi(ch1, ch1, 1); } // Scale to number of bytes. 3836 3837 if (ae ==StrIntrinsicNode::UL) { 3838 srwi(tmp4, n_start, 1*8); // ___0 3839 rlwimi(n_start, tmp4, 2*8, 0, 23); // _0_1 3840 } 3841 3842 add(last_addr, haystack, ch1); // Point to last address to compare (haystack+2*(haycnt-needlecnt)). 3843 3844 // Main Loop (now we have at least 2 characters). 3845 Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2; 3846 bind(L_OuterLoop); // Search for 1st 2 characters. 3847 Register addr_diff = tmp4; 3848 subf(addr_diff, addr, last_addr); // Difference between already checked address and last address to check. 3849 addi(addr, addr, h_csize); // This is the new address we want to use for comparing. 
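  // ch2 = addr_diff >> h_csize: iteration count for the 2x unrolled inner
  // loop below (each iteration inspects 2 characters).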
3850 srdi_(ch2, addr_diff, h_csize); 3851 beq(CCR0, L_FinalCheck); // 2 characters left? 3852 mtctr(ch2); // num of characters / 2 3853 bind(L_InnerLoop); // Main work horse (2x unrolled search loop) 3854 if (h_csize == 2) { // Load 2 characters of haystack (ignore alignment). 3855 lwz(ch1, 0, addr); 3856 lwz(ch2, 2, addr); 3857 } else { 3858 lhz(ch1, 0, addr); 3859 lhz(ch2, 1, addr); 3860 } 3861 cmpw(CCR0, ch1, n_start); // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop). 3862 cmpw(CCR1, ch2, n_start); 3863 beq(CCR0, L_Comp1); // Did we find the needle start? 3864 beq(CCR1, L_Comp2); 3865 addi(addr, addr, 2 * h_csize); 3866 bdnz(L_InnerLoop); 3867 bind(L_FinalCheck); 3868 andi_(addr_diff, addr_diff, h_csize); // Remaining characters not covered by InnerLoop: (num of characters) & 1. 3869 beq(CCR0, L_NotFound); 3870 if (h_csize == 2) { lwz(ch1, 0, addr); } else { lhz(ch1, 0, addr); } // One position left at which we have to compare. 3871 cmpw(CCR1, ch1, n_start); 3872 beq(CCR1, L_Comp1); 3873 bind(L_NotFound); 3874 li(result, -1); // not found 3875 b(L_End); 3876 3877 // ************************************************************************************************** 3878 // Special Case: unfortunately, the variable needle case can be called with needlecnt<2 3879 // ************************************************************************************************** 3880 if (needlecntval == 0) { // We have to handle these cases separately. 3881 Label L_OneCharLoop; 3882 bind(L_TooShort); 3883 mtctr(haycnt); 3884 if (n_csize == 2) { lhz(n_start, 0, needle); } else { lbz(n_start, 0, needle); } // First character of needle 3885 bind(L_OneCharLoop); 3886 if (h_csize == 2) { lhzu(ch1, 2, addr); } else { lbzu(ch1, 1, addr); } 3887 cmpw(CCR1, ch1, n_start); 3888 beq(CCR1, L_Found); // Did we find the one character needle? 3889 bdnz(L_OneCharLoop); 3890 li(result, -1); // Not found. 3891 b(L_End); 3892 } 3893 3894 // ************************************************************************************************** 3895 // Regular Case Part II: compare rest of needle (first 2 characters have been compared already) 3896 // ************************************************************************************************** 3897 3898 // Compare the rest 3899 bind(L_Comp2); 3900 addi(addr, addr, h_csize); // First comparison has failed, 2nd one hit. 3901 bind(L_Comp1); // Addr points to possible needle start. 3902 if (needlecntval != 2) { // Const needlecnt==2? 3903 if (needlecntval != 3) { 3904 if (needlecntval == 0) { beq(CCR6, L_Found); } // Variable needlecnt==2? 3905 Register n_ind = tmp4, 3906 h_ind = n_ind; 3907 li(n_ind, 2 * n_csize); // First 2 characters are already compared, use index 2. 3908 mtctr(needlecnt); // Decremented by 2, still > 0. 3909 Label L_CompLoop; 3910 bind(L_CompLoop); 3911 if (ae ==StrIntrinsicNode::UL) { 3912 h_ind = ch1; 3913 sldi(h_ind, n_ind, 1); 3914 } 3915 if (n_csize == 2) { lhzx(ch2, needle, n_ind); } else { lbzx(ch2, needle, n_ind); } 3916 if (h_csize == 2) { lhzx(ch1, addr, h_ind); } else { lbzx(ch1, addr, h_ind); } 3917 cmpw(CCR1, ch1, ch2); 3918 bne(CCR1, L_OuterLoop); 3919 addi(n_ind, n_ind, n_csize); 3920 bdnz(L_CompLoop); 3921 } else { // No loop required if there's only one needle character left. 
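        // Only one needle character is left to check. The first two characters
        // matched already, so compare the character at index 2 directly.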
3922 if (n_csize == 2) { lhz(ch2, 2 * 2, needle); } else { lbz(ch2, 2 * 1, needle); } 3923 if (h_csize == 2) { lhz(ch1, 2 * 2, addr); } else { lbz(ch1, 2 * 1, addr); } 3924 cmpw(CCR1, ch1, ch2); 3925 bne(CCR1, L_OuterLoop); 3926 } 3927 } 3928 // Return index ... 3929 bind(L_Found); 3930 subf(result, haystack, addr); // relative to haystack, ... 3931 if (h_csize == 2) { srdi(result, result, 1); } // in characters. 3932 bind(L_End); 3933 } // string_indexof 3934 3935 void MacroAssembler::string_indexof_char(Register result, Register haystack, Register haycnt, 3936 Register needle, jchar needleChar, Register tmp1, Register tmp2, bool is_byte) { 3937 assert_different_registers(haystack, haycnt, needle, tmp1, tmp2); 3938 3939 Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_NotFound, L_End; 3940 Register addr = tmp1, 3941 ch1 = tmp2, 3942 ch2 = R0; 3943 3944 const int h_csize = is_byte ? 1 : 2; 3945 3946 //4: 3947 srwi_(tmp2, haycnt, 1); // Shift right by exact_log2(UNROLL_FACTOR). 3948 mr(addr, haystack); 3949 beq(CCR0, L_FinalCheck); 3950 mtctr(tmp2); // Move to count register. 3951 //8: 3952 bind(L_InnerLoop); // Main work horse (2x unrolled search loop). 3953 if (!is_byte) { 3954 lhz(ch1, 0, addr); 3955 lhz(ch2, 2, addr); 3956 } else { 3957 lbz(ch1, 0, addr); 3958 lbz(ch2, 1, addr); 3959 } 3960 (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, (unsigned int)needleChar); 3961 (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, (unsigned int)needleChar); 3962 beq(CCR0, L_Found1); // Did we find the needle? 3963 beq(CCR1, L_Found2); 3964 addi(addr, addr, 2 * h_csize); 3965 bdnz(L_InnerLoop); 3966 //16: 3967 bind(L_FinalCheck); 3968 andi_(R0, haycnt, 1); 3969 beq(CCR0, L_NotFound); 3970 if (!is_byte) { lhz(ch1, 0, addr); } else { lbz(ch1, 0, addr); } // One position left at which we have to compare. 3971 (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, (unsigned int)needleChar); 3972 beq(CCR1, L_Found1); 3973 //21: 3974 bind(L_NotFound); 3975 li(result, -1); // Not found. 3976 b(L_End); 3977 3978 bind(L_Found2); 3979 addi(addr, addr, h_csize); 3980 //24: 3981 bind(L_Found1); // Return index ... 3982 subf(result, haystack, addr); // relative to haystack, ... 3983 if (!is_byte) { srdi(result, result, 1); } // in characters. 3984 bind(L_End); 3985 } // string_indexof_char 3986 3987 3988 void MacroAssembler::has_negatives(Register src, Register cnt, Register result, 3989 Register tmp1, Register tmp2) { 3990 const Register tmp0 = R0; 3991 assert_different_registers(src, result, cnt, tmp0, tmp1, tmp2); 3992 Label Lfastloop, Lslow, Lloop, Lnoneg, Ldone; 3993 3994 // Check if cnt >= 8 (= 16 bytes) 3995 lis(tmp1, (int)(short)0x8080); // tmp1 = 0x8080808080808080 3996 srwi_(tmp2, cnt, 4); 3997 li(result, 1); // Assume there's a negative byte. 3998 beq(CCR0, Lslow); 3999 ori(tmp1, tmp1, 0x8080); 4000 rldimi(tmp1, tmp1, 32, 0); 4001 mtctr(tmp2); 4002 4003 // 2x unrolled loop 4004 bind(Lfastloop); 4005 ld(tmp2, 0, src); 4006 ld(tmp0, 8, src); 4007 4008 orr(tmp0, tmp2, tmp0); 4009 4010 and_(tmp0, tmp0, tmp1); 4011 bne(CCR0, Ldone); // Found negative byte. 4012 addi(src, src, 16); 4013 4014 bdnz(Lfastloop); 4015 4016 bind(Lslow); // Fallback to slow version 4017 rldicl_(tmp0, cnt, 0, 64-4); 4018 beq(CCR0, Lnoneg); 4019 mtctr(tmp0); 4020 bind(Lloop); 4021 lbz(tmp0, 0, src); 4022 addi(src, src, 1); 4023 andi_(tmp0, tmp0, 0x80); 4024 bne(CCR0, Ldone); // Found negative byte. 
4025   bdnz(Lloop);
4026   bind(Lnoneg);
4027   li(result, 0);
4028
4029   bind(Ldone);
4030 }
4031
4032 #endif // COMPILER2
4033
4034 // Helpers for Intrinsic Emitters
4035 //
4036 // Reverse the byte order of a 32-bit value in a register.
4037 //   src: 0x44556677
4038 //   dst: 0x77665544
4039 // Three steps to obtain the result:
4040 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
4041 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
4042 //     This value initializes dst.
4043 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
4044 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
4045 //     This value is mask inserted into dst with a [0..23] mask of 1s.
4046 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
4047 //     This value is mask inserted into dst with a [8..15] mask of 1s.
4048 void MacroAssembler::load_reverse_32(Register dst, Register src) {
4049   assert_different_registers(dst, src);
4050
4051   rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left.
4052   rlwimi(dst, src, 3*8, 0, 23);  // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
4053   rlwimi(dst, src, 1*8, 8, 15);  // Insert byte 6 into position 5, leave the rest alone.
4054 }
4055
4056 // Calculate the column addresses of the crc32 lookup table into distinct registers.
4057 // This loop-invariant calculation is moved out of the loop body, reducing the loop
4058 // body size from 20 to 16 instructions.
4059 // Returns the offset that was used to calculate the address of column tc3.
4060 // Due to register shortage, setting tc3 may overwrite table. With the return offset
4061 // at hand, the original table address can be easily reconstructed.
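// Assumed table layout, as in zlib's crc32.c (a sketch; CRC32_COLUMN_SIZE is
// the number of 32-bit entries per column):
//   uint32_t crc_table[8][CRC32_COLUMN_SIZE]; // columns 0..3: LE, 4..7: BE
// Hence the column stride of 4*CRC32_COLUMN_SIZE bytes used for ix0..ix3 below.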
4062 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
4063
4064 #ifdef VM_LITTLE_ENDIAN
4065   // This is what we implement (the DOLIT4 part):
4066   // =========================================================================
4067   // #define DOLIT4 c ^= *buf4++; \
4068   //         c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
4069   //             crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
4070   // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
4071   // =========================================================================
4072   const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
4073   const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
4074   const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
4075   const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
4076 #else
4077   // This is what we implement (the DOBIG4 part):
4078   // =========================================================================
4079   // #define DOBIG4 c ^= *++buf4; \
4080   //         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
4081   //             crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
4082   // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
4083   // =========================================================================
4084   const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
4085   const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
4086   const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
4087   const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
4088 #endif
4089   assert_different_registers(table, tc0, tc1, tc2);
4090   assert(table == tc3, "must be!");
4091
4092   addi(tc0, table, ix0);
4093   addi(tc1, table, ix1);
4094   addi(tc2, table, ix2);
4095   if (ix3 != 0) addi(tc3, table, ix3);
4096
4097   return ix3;
4098 }
4099
4100 /**
4101  * uint32_t crc;
4102  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
4103  */
4104 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
4105   assert_different_registers(crc, table, tmp);
4106   assert_different_registers(val, table);
4107
4108   if (crc == val) {                  // Must rotate first to use the unmodified value.
4109     rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
4110                                      // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
4111     srwi(crc, crc, 8);               // Unsigned shift, clear leftmost 8 bits.
4112   } else {
4113     srwi(crc, crc, 8);               // Unsigned shift, clear leftmost 8 bits.
4114     rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
4115   }
4116   lwzx(tmp, table, tmp);
4117   xorr(crc, crc, tmp);
4118 }
4119
4120 /**
4121  * uint32_t crc;
4122  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
4123  */
4124 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
4125   fold_byte_crc32(crc, crc, table, tmp);
4126 }
4127
4128 /**
4129  * Emits code to update CRC-32 with a byte value according to constants in table.
4130  *
4131  * @param [in,out] crc   Register containing the crc.
4132  * @param [in]     val   Register containing the byte to fold into the CRC.
4133  * @param [in]     table Register containing the table of crc constants.
4134  *
4135  * uint32_t crc;
4136  * val = crc_table[(val ^ crc) & 0xFF];
4137  * crc = val ^ (crc >> 8);
4138  */
4139 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
4140   BLOCK_COMMENT("update_byte_crc32:");
4141   xorr(val, val, crc);
4142   fold_byte_crc32(crc, val, table, val);
4143 }
4144
4145 /**
4146  * @param crc   register containing existing CRC (32-bit)
4147  * @param buf   register pointing to input byte buffer (byte*)
4148  * @param len   register containing number of bytes
4149  * @param table register pointing to CRC table
4150  */
4151 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
4152                                            Register data, bool loopAlignment) {
4153   assert_different_registers(crc, buf, len, table, data);
4154
4155   Label L_mainLoop, L_done;
4156   const int mainLoop_stepping  = 1;
4157   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
4158
4159   // Process all bytes in a single-byte loop.
4160   clrldi_(len, len, 32);                         // Enforce 32 bit. Anything to do?
4161   beq(CCR0, L_done);
4162
4163   mtctr(len);
4164   align(mainLoop_alignment);
4165   BIND(L_mainLoop);
4166     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
4167     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
4168     update_byte_crc32(crc, data, table);
4169     bdnz(L_mainLoop);                            // Iterate.
4170
4171   bind(L_done);
4172 }
4173
4174 /**
4175  * Emits code to update CRC-32 with a 4-byte value according to constants in table.
4176  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c.
4177  */
4178 // A note on the lookup table address(es):
4179 // The lookup table consists of two sets of four columns each.
4180 // The columns {0..3} are used for little-endian machines.
4181 // The columns {4..7} are used for big-endian machines.
4182 // To save the effort of adding the column offset to the table address each time
4183 // a table element is looked up, it is possible to pass the pre-calculated
4184 // column addresses.
4185 // Uses R9..R12 as work registers. Must be saved/restored by caller, if necessary.
4186 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
4187                                         Register t0,  Register t1,  Register t2,  Register t3,
4188                                         Register tc0, Register tc1, Register tc2, Register tc3) {
4189   assert_different_registers(crc, t3);
4190
4191   // XOR crc with next four bytes of buffer.
4192   lwz(t3, bufDisp, buf);
4193   if (bufInc != 0) {
4194     addi(buf, buf, bufInc);
4195   }
4196   xorr(t3, t3, crc);
4197
4198   // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
4199   rlwinm(t0, t3,  2,        24-2, 31-2); // ((t1 >>  0) & 0xff) << 2
4200   rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t1 >>  8) & 0xff) << 2
4201   rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t1 >> 16) & 0xff) << 2
4202   rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t1 >> 24) & 0xff) << 2
4203
4204   // Use the pre-calculated column addresses.
4205   // Load pre-calculated table values.
4206   lwzx(t0, tc0, t0);
4207   lwzx(t1, tc1, t1);
4208   lwzx(t2, tc2, t2);
4209   lwzx(t3, tc3, t3);
4210
4211   // Calculate new crc from table values.
4212   xorr(t0, t0, t1);
4213   xorr(t2, t2, t3);
4214   xorr(crc, t0, t2); // Now crc contains the final checksum value.
4215 }
4216
4217 /**
4218  * @param crc   register containing existing CRC (32-bit)
4219  * @param buf   register pointing to input byte buffer (byte*)
4220  * @param len   register containing number of bytes
4221  * @param table register pointing to CRC table
4222  *
4223  * Uses R9..R12 as work registers. Must be saved/restored by caller!
4224  */
4225 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
4226                                         Register t0,  Register t1,  Register t2,  Register t3,
4227                                         Register tc0, Register tc1, Register tc2, Register tc3,
4228                                         bool invertCRC) {
4229   assert_different_registers(crc, buf, len, table);
4230
4231   Label L_mainLoop, L_tail;
4232   Register tmp  = t0;
4233   Register data = t0;
4234   Register tmp2 = t1;
4235   const int mainLoop_stepping  = 8;
4236   const int tailLoop_stepping  = 1;
4237   const int log_stepping       = exact_log2(mainLoop_stepping);
4238   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4239   const int complexThreshold   = 2*mainLoop_stepping;
4240
4241   // Don't test for len <= 0 here. This pathological case should not occur anyway.
4242   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
4243   // for all well-behaved cases. The situation itself is detected and handled correctly
4244   // within update_byteLoop_crc32.
4245   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4246
4247   BLOCK_COMMENT("kernel_crc32_2word {");
4248
4249   if (invertCRC) {
4250     nand(crc, crc, crc);                         // 1s complement of crc
4251   }
4252
4253   // Check for short (<mainLoop_stepping) buffer.
4254   cmpdi(CCR0, len, complexThreshold);
4255   blt(CCR0, L_tail);
4256
4257   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4258   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4259   {
4260     // Align buf addr to mainLoop_stepping boundary.
4261     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
4262     rldicl(tmp2, tmp2, 0, 64-log_stepping);      // tmp2 = (-buf) & (mainLoop_stepping-1): bytes up to the next aligned address.
4263
4264     if (complexThreshold > mainLoop_stepping) {
4265       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4266     } else {
4267       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
4268       cmpdi(CCR0, tmp, mainLoop_stepping);
4269       blt(CCR0, L_tail);                         // For less than one mainLoop_stepping left, do only tail processing.
4270       mr(len, tmp);                              // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4271     }
4272     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
4273   }
4274
4275   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
4276   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
4277   mtctr(tmp2);
4278
4279 #ifdef VM_LITTLE_ENDIAN
4280   Register crc_rv = crc;
4281 #else
4282   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
4283                                                  // Occupies tmp, but frees up crc.
4284   load_reverse_32(crc_rv, crc);                  // Reverse the byte order because we are dealing with big-endian data.
4285   tmp = crc;
4286 #endif
4287
4288   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4289
4290   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
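  // Main loop: two interleaved one-word (4-byte) lookup steps per iteration,
  // i.e. mainLoop_stepping (8) bytes of input per pass.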
4291   BIND(L_mainLoop);
4292     update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4293     update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4294     bdnz(L_mainLoop);
4295
4296 #ifndef VM_LITTLE_ENDIAN
4297   load_reverse_32(crc, crc_rv);                  // Reverse the byte order because we are dealing with big-endian data.
4298   tmp = crc_rv;                                  // tmp uses its original register again.
4299 #endif
4300
4301   // Restore original table address for tailLoop.
4302   if (reconstructTableOffset != 0) {
4303     addi(table, table, -reconstructTableOffset);
4304   }
4305
4306   // Process last few (<complexThreshold) bytes of buffer.
4307   BIND(L_tail);
4308   update_byteLoop_crc32(crc, buf, len, table, data, false);
4309
4310   if (invertCRC) {
4311     nand(crc, crc, crc);                         // 1s complement of crc
4312   }
4313   BLOCK_COMMENT("} kernel_crc32_2word");
4314 }
4315
4316 /**
4317  * @param crc   register containing existing CRC (32-bit)
4318  * @param buf   register pointing to input byte buffer (byte*)
4319  * @param len   register containing number of bytes
4320  * @param table register pointing to CRC table
4321  *
4322  * Uses R9..R12 as work registers. Must be saved/restored by caller!
4323  */
4324 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
4325                                         Register t0,  Register t1,  Register t2,  Register t3,
4326                                         Register tc0, Register tc1, Register tc2, Register tc3,
4327                                         bool invertCRC) {
4328   assert_different_registers(crc, buf, len, table);
4329
4330   Label L_mainLoop, L_tail;
4331   Register tmp  = t0;
4332   Register data = t0;
4333   Register tmp2 = t1;
4334   const int mainLoop_stepping  = 4;
4335   const int tailLoop_stepping  = 1;
4336   const int log_stepping       = exact_log2(mainLoop_stepping);
4337   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4338   const int complexThreshold   = 2*mainLoop_stepping;
4339
4340   // Don't test for len <= 0 here. This pathological case should not occur anyway.
4341   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
4342   // for all well-behaved cases. The situation itself is detected and handled correctly
4343   // within update_byteLoop_crc32.
4344   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4345
4346   BLOCK_COMMENT("kernel_crc32_1word {");
4347
4348   if (invertCRC) {
4349     nand(crc, crc, crc);                         // 1s complement of crc
4350   }
4351
4352   // Check for short (<mainLoop_stepping) buffer.
4353   cmpdi(CCR0, len, complexThreshold);
4354   blt(CCR0, L_tail);
4355
4356   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4357   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4358   {
4359     // Align buf addr to mainLoop_stepping boundary.
4360     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
4361     rldicl(tmp2, tmp2, 0, 64-log_stepping);      // tmp2 = (-buf) & (mainLoop_stepping-1): bytes up to the next aligned address.
4362
4363     if (complexThreshold > mainLoop_stepping) {
4364       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4365     } else {
4366       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
4367       cmpdi(CCR0, tmp, mainLoop_stepping);
4368       blt(CCR0, L_tail);                         // For less than one mainLoop_stepping left, do only tail processing.
4369       mr(len, tmp);                              // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4370     }
4371     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
4372   }
4373
4374   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
4375   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
4376   mtctr(tmp2);
4377
4378 #ifdef VM_LITTLE_ENDIAN
4379   Register crc_rv = crc;
4380 #else
4381   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
4382                                                  // Occupies tmp, but frees up crc.
4383   load_reverse_32(crc_rv, crc);                  // Reverse the byte order because we are dealing with big-endian data.
4384   tmp = crc;
4385 #endif
4386
4387   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4388
4389   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
4390   BIND(L_mainLoop);
4391     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4392     bdnz(L_mainLoop);
4393
4394 #ifndef VM_LITTLE_ENDIAN
4395   load_reverse_32(crc, crc_rv);                  // Reverse the byte order because we are dealing with big-endian data.
4396   tmp = crc_rv;                                  // tmp uses its original register again.
4397 #endif
4398
4399   // Restore original table address for tailLoop.
4400   if (reconstructTableOffset != 0) {
4401     addi(table, table, -reconstructTableOffset);
4402   }
4403
4404   // Process last few (<complexThreshold) bytes of buffer.
4405   BIND(L_tail);
4406   update_byteLoop_crc32(crc, buf, len, table, data, false);
4407
4408   if (invertCRC) {
4409     nand(crc, crc, crc);                         // 1s complement of crc
4410   }
4411   BLOCK_COMMENT("} kernel_crc32_1word");
4412 }
4413
4414 /**
4415  * @param crc   register containing existing CRC (32-bit)
4416  * @param buf   register pointing to input byte buffer (byte*)
4417  * @param len   register containing number of bytes
4418  * @param table register pointing to CRC table
4419  *
4420  * Uses R7_ARG5, R8_ARG6 as work registers.
4421  */
4422 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
4423                                         Register t0, Register t1, Register t2, Register t3,
4424                                         bool invertCRC) {
4425   assert_different_registers(crc, buf, len, table);
4426
4427   Register data = t0;                            // Holds the current byte to be folded into crc.
4428
4429   BLOCK_COMMENT("kernel_crc32_1byte {");
4430
4431   if (invertCRC) {
4432     nand(crc, crc, crc);                         // 1s complement of crc
4433   }
4434
4435   // Process all bytes in a single-byte loop.
4436   update_byteLoop_crc32(crc, buf, len, table, data, true);
4437
4438   if (invertCRC) {
4439     nand(crc, crc, crc);                         // 1s complement of crc
4440   }
4441   BLOCK_COMMENT("} kernel_crc32_1byte");
4442 }
4443
4444 /**
4445  * @param crc             register containing existing CRC (32-bit)
4446  * @param buf             register pointing to input byte buffer (byte*)
4447  * @param len             register containing number of bytes
4448  * @param table           register pointing to CRC table
4449  * @param constants       register pointing to CRC table for 128-bit aligned memory
4450  * @param barretConstants register pointing to table for barrett reduction
4451  * @param t0              volatile register
4452  * @param t1              volatile register
4453  * @param t2              volatile register
4454  * @param t3              volatile register
4455  */
4456 void MacroAssembler::kernel_crc32_1word_vpmsumd(Register crc, Register buf, Register len, Register table,
4457                                                 Register constants, Register barretConstants,
4458                                                 Register t0, Register t1, Register t2, Register t3, Register t4,
4459                                                 bool invertCRC) {
4460   assert_different_registers(crc, buf, len, table);
4461
4462   Label L_alignedHead, L_tail, L_alignTail, L_start, L_end;
4463
4464   Register prealign  = t0;
4465   Register postalign = t0;
4466
4467   BLOCK_COMMENT("kernel_crc32_1word_vpmsumd {");
4468
4469   // 1. Use kernel_crc32_1word for buffers shorter than 384 bytes.
4470   clrldi(len, len, 32);
4471   cmpdi(CCR0, len, 384);
4472   bge(CCR0, L_start);
4473
4474   Register tc0 = t4;
4475   Register tc1 = constants;
4476   Register tc2 = barretConstants;
4477   kernel_crc32_1word(crc, buf, len, table, t0, t1, t2, t3, tc0, tc1, tc2, table, invertCRC);
4478   b(L_end);
4479
4480   BIND(L_start);
4481
4482   // 2. ~c
4483   if (invertCRC) {
4484     nand(crc, crc, crc);                         // 1s complement of crc
4485   }
4486
4487   // 3. Calculate from 0 to the first 128-byte aligned address.
4488   clrldi_(prealign, buf, 57);
4489   beq(CCR0, L_alignedHead);
4490
4491   subfic(prealign, prealign, 128);
4492
4493   subf(len, prealign, len);
4494   update_byteLoop_crc32(crc, buf, prealign, table, t2, false);
4495
4496   // 4. Calculate from the first to the last 128-byte aligned address.
4497   BIND(L_alignedHead);
4498
4499   clrldi(postalign, len, 57);
4500   subf(len, postalign, len);
4501
4502   // len is now at least 256 bytes and a multiple of 128.
4503   kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t1, t2, t3);
4504
4505   // 5. Calculate the remaining bytes.
4506   cmpdi(CCR0, postalign, 0);
4507   beq(CCR0, L_tail);
4508
4509   update_byteLoop_crc32(crc, buf, postalign, table, t2, false);
4510
4511   BIND(L_tail);
4512
4513   // 6. ~c
4514   if (invertCRC) {
4515     nand(crc, crc, crc);                         // 1s complement of crc
4516   }
4517
4518   BIND(L_end);
4519
4520   BLOCK_COMMENT("} kernel_crc32_1word_vpmsumd");
4521 }
4522
4523 /**
4524  * @param crc             register containing existing CRC (32-bit)
4525  * @param buf             register pointing to input byte buffer (byte*)
4526  * @param len             register containing number of bytes
4527  * @param constants       register pointing to CRC table for 128-bit aligned memory
4528  * @param barretConstants register pointing to table for barrett reduction
4529  * @param t0              volatile register
4530  * @param t1              volatile register
4531  * @param t2              volatile register
4532  */
4533 void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
4534     Register constants, Register barretConstants, Register t0, Register t1, Register t2) {
4535   Label L_mainLoop, L_tail, L_alignTail, L_barrett_reduction, L_end, L_first_warm_up_done, L_first_cool_down, L_second_cool_down, L_XOR, L_test;
4536   Label L_lv0, L_lv1, L_lv2, L_lv3, L_lv4, L_lv5, L_lv6, L_lv7, L_lv8, L_lv9, L_lv10, L_lv11, L_lv12, L_lv13, L_lv14, L_lv15;
4537   Label L_1, L_2, L_3, L_4;
4538
4539   Register rLoaded      = t0;
4540   Register rTmp1        = t1;
4541   Register rTmp2        = t2;
4542   Register off16        = R22;
4543   Register off32        = R23;
4544   Register off48        = R24;
4545   Register off64        = R25;
4546   Register off80        = R26;
4547   Register off96        = R27;
4548   Register off112       = R28;
4549   Register rIdx         = R29;
4550   Register rMax         = R30;
4551   Register constantsPos = R31;
4552
4553   VectorRegister mask_32bit = VR24;
4554   VectorRegister mask_64bit = VR25;
4555   VectorRegister zeroes     = VR26;
4556   VectorRegister const1     = VR27;
4557   VectorRegister const2     = VR28;
4558
4559   // Save non-volatile vector registers (frameless).
4560   Register offset = t1; int offsetInt = 0;
4561   offsetInt -= 16; li(offset, -16); stvx(VR20, offset, R1_SP);
4562   offsetInt -= 16; addi(offset, offset, -16); stvx(VR21, offset, R1_SP);
4563   offsetInt -= 16; addi(offset, offset, -16); stvx(VR22, offset, R1_SP);
4564   offsetInt -= 16; addi(offset, offset, -16); stvx(VR23, offset, R1_SP);
4565   offsetInt -= 16; addi(offset, offset, -16); stvx(VR24, offset, R1_SP);
4566   offsetInt -= 16; addi(offset, offset, -16); stvx(VR25, offset, R1_SP);
4567   offsetInt -= 16; addi(offset, offset, -16); stvx(VR26, offset, R1_SP);
4568   offsetInt -= 16; addi(offset, offset, -16); stvx(VR27, offset, R1_SP);
4569   offsetInt -= 16; addi(offset, offset, -16); stvx(VR28, offset, R1_SP);
4570   offsetInt -= 8; std(R22, offsetInt, R1_SP);
4571   offsetInt -= 8; std(R23, offsetInt, R1_SP);
4572   offsetInt -= 8; std(R24, offsetInt, R1_SP);
4573   offsetInt -= 8; std(R25, offsetInt, R1_SP);
4574   offsetInt -= 8; std(R26, offsetInt, R1_SP);
4575   offsetInt -= 8; std(R27, offsetInt, R1_SP);
4576   offsetInt -= 8; std(R28, offsetInt, R1_SP);
4577   offsetInt -= 8; std(R29, offsetInt, R1_SP);
4578   offsetInt -= 8; std(R30, offsetInt, R1_SP);
4579   offsetInt -= 8; std(R31, offsetInt, R1_SP);
4580
4581   // Set constants
4582   li(off16, 16);
4583   li(off32, 32);
4584   li(off48, 48);
4585   li(off64, 64);
4586   li(off80, 80);
4587   li(off96, 96);
4588   li(off112, 112);
4589
4590   clrldi(crc, crc, 32);
4591
4592   vxor(zeroes, zeroes, zeroes);
4593   vspltisw(VR0, -1);
4594
4595   vsldoi(mask_32bit, zeroes, VR0, 4);
4596   vsldoi(mask_64bit, zeroes, VR0, 8);
4597
4598   // Get the initial value into VR8.
4599   vxor(VR8, VR8, VR8);
4600   mtvrd(VR8, crc);
4601   vsldoi(VR8, zeroes, VR8, 8);                   // Shift into bottom 32 bits.
4602
4603   li(rLoaded, 0);
4604
4605   rldicr(rIdx, len, 0, 56);                      // rIdx = len & ~127: bytes in whole 128-byte blocks.
4606
4607   {
4608     BIND(L_1);
4609
// Checksum in blocks of MAX_SIZE (32768) 4610 lis(rMax, 0); 4611 ori(rMax, rMax, 32768); 4612 mr(rTmp2, rMax); 4613 cmpd(CCR0, rIdx, rMax); 4614 bgt(CCR0, L_2); 4615 mr(rMax, rIdx); 4616 4617 BIND(L_2); 4618 subf(rIdx, rMax, rIdx); 4619 4620 // our main loop does 128 bytes at a time 4621 srdi(rMax, rMax, 7); 4622 4623 /* 4624 * Work out the offset into the constants table to start at. Each 4625 * constant is 16 bytes, and it is used against 128 bytes of input 4626 * data - 128 / 16 = 8 4627 */ 4628 sldi(rTmp1, rMax, 4); 4629 srdi(rTmp2, rTmp2, 3); 4630 subf(rTmp1, rTmp1, rTmp2); 4631 4632 // We reduce our final 128 bytes in a separate step 4633 addi(rMax, rMax, -1); 4634 mtctr(rMax); 4635 4636 // Find the start of our constants 4637 add(constantsPos, constants, rTmp1); 4638 4639 // zero VR0-v7 which will contain our checksums 4640 vxor(VR0, VR0, VR0); 4641 vxor(VR1, VR1, VR1); 4642 vxor(VR2, VR2, VR2); 4643 vxor(VR3, VR3, VR3); 4644 vxor(VR4, VR4, VR4); 4645 vxor(VR5, VR5, VR5); 4646 vxor(VR6, VR6, VR6); 4647 vxor(VR7, VR7, VR7); 4648 4649 lvx(const1, constantsPos); 4650 4651 /* 4652 * If we are looping back to consume more data we use the values 4653 * already in VR16-v23. 4654 */ 4655 cmpdi(CCR0, rLoaded, 1); 4656 beq(CCR0, L_3); 4657 { 4658 4659 // First warm up pass 4660 lvx(VR16, buf); 4661 lvx(VR17, off16, buf); 4662 lvx(VR18, off32, buf); 4663 lvx(VR19, off48, buf); 4664 lvx(VR20, off64, buf); 4665 lvx(VR21, off80, buf); 4666 lvx(VR22, off96, buf); 4667 lvx(VR23, off112, buf); 4668 addi(buf, buf, 8*16); 4669 4670 // xor in initial value 4671 vxor(VR16, VR16, VR8); 4672 } 4673 4674 BIND(L_3); 4675 bdz(L_first_warm_up_done); 4676 4677 addi(constantsPos, constantsPos, 16); 4678 lvx(const2, constantsPos); 4679 4680 // Second warm up pass 4681 vpmsumd(VR8, VR16, const1); 4682 lvx(VR16, buf); 4683 4684 vpmsumd(VR9, VR17, const1); 4685 lvx(VR17, off16, buf); 4686 4687 vpmsumd(VR10, VR18, const1); 4688 lvx(VR18, off32, buf); 4689 4690 vpmsumd(VR11, VR19, const1); 4691 lvx(VR19, off48, buf); 4692 4693 vpmsumd(VR12, VR20, const1); 4694 lvx(VR20, off64, buf); 4695 4696 vpmsumd(VR13, VR21, const1); 4697 lvx(VR21, off80, buf); 4698 4699 vpmsumd(VR14, VR22, const1); 4700 lvx(VR22, off96, buf); 4701 4702 vpmsumd(VR15, VR23, const1); 4703 lvx(VR23, off112, buf); 4704 4705 addi(buf, buf, 8 * 16); 4706 4707 bdz(L_first_cool_down); 4708 4709 /* 4710 * main loop. We modulo schedule it such that it takes three iterations 4711 * to complete - first iteration load, second iteration vpmsum, third 4712 * iteration xor. 
4713 */ 4714 { 4715 BIND(L_4); 4716 lvx(const1, constantsPos); addi(constantsPos, constantsPos, 16); 4717 4718 vxor(VR0, VR0, VR8); 4719 vpmsumd(VR8, VR16, const2); 4720 lvx(VR16, buf); 4721 4722 vxor(VR1, VR1, VR9); 4723 vpmsumd(VR9, VR17, const2); 4724 lvx(VR17, off16, buf); 4725 4726 vxor(VR2, VR2, VR10); 4727 vpmsumd(VR10, VR18, const2); 4728 lvx(VR18, off32, buf); 4729 4730 vxor(VR3, VR3, VR11); 4731 vpmsumd(VR11, VR19, const2); 4732 lvx(VR19, off48, buf); 4733 lvx(const2, constantsPos); 4734 4735 vxor(VR4, VR4, VR12); 4736 vpmsumd(VR12, VR20, const1); 4737 lvx(VR20, off64, buf); 4738 4739 vxor(VR5, VR5, VR13); 4740 vpmsumd(VR13, VR21, const1); 4741 lvx(VR21, off80, buf); 4742 4743 vxor(VR6, VR6, VR14); 4744 vpmsumd(VR14, VR22, const1); 4745 lvx(VR22, off96, buf); 4746 4747 vxor(VR7, VR7, VR15); 4748 vpmsumd(VR15, VR23, const1); 4749 lvx(VR23, off112, buf); 4750 4751 addi(buf, buf, 8 * 16); 4752 4753 bdnz(L_4); 4754 } 4755 4756 BIND(L_first_cool_down); 4757 4758 // First cool down pass 4759 lvx(const1, constantsPos); 4760 addi(constantsPos, constantsPos, 16); 4761 4762 vxor(VR0, VR0, VR8); 4763 vpmsumd(VR8, VR16, const1); 4764 4765 vxor(VR1, VR1, VR9); 4766 vpmsumd(VR9, VR17, const1); 4767 4768 vxor(VR2, VR2, VR10); 4769 vpmsumd(VR10, VR18, const1); 4770 4771 vxor(VR3, VR3, VR11); 4772 vpmsumd(VR11, VR19, const1); 4773 4774 vxor(VR4, VR4, VR12); 4775 vpmsumd(VR12, VR20, const1); 4776 4777 vxor(VR5, VR5, VR13); 4778 vpmsumd(VR13, VR21, const1); 4779 4780 vxor(VR6, VR6, VR14); 4781 vpmsumd(VR14, VR22, const1); 4782 4783 vxor(VR7, VR7, VR15); 4784 vpmsumd(VR15, VR23, const1); 4785 4786 BIND(L_second_cool_down); 4787 // Second cool down pass 4788 vxor(VR0, VR0, VR8); 4789 vxor(VR1, VR1, VR9); 4790 vxor(VR2, VR2, VR10); 4791 vxor(VR3, VR3, VR11); 4792 vxor(VR4, VR4, VR12); 4793 vxor(VR5, VR5, VR13); 4794 vxor(VR6, VR6, VR14); 4795 vxor(VR7, VR7, VR15); 4796 4797 /* 4798 * vpmsumd produces a 96 bit result in the least significant bits 4799 * of the register. Since we are bit reflected we have to shift it 4800 * left 32 bits so it occupies the least significant bits in the 4801 * bit reflected domain. 
*/
4803 vsldoi(VR0, VR0, zeroes, 4);
4804 vsldoi(VR1, VR1, zeroes, 4);
4805 vsldoi(VR2, VR2, zeroes, 4);
4806 vsldoi(VR3, VR3, zeroes, 4);
4807 vsldoi(VR4, VR4, zeroes, 4);
4808 vsldoi(VR5, VR5, zeroes, 4);
4809 vsldoi(VR6, VR6, zeroes, 4);
4810 vsldoi(VR7, VR7, zeroes, 4);
4811
4812 // xor with last 1024 bits
4813 lvx(VR8, buf);
4814 lvx(VR9, off16, buf);
4815 lvx(VR10, off32, buf);
4816 lvx(VR11, off48, buf);
4817 lvx(VR12, off64, buf);
4818 lvx(VR13, off80, buf);
4819 lvx(VR14, off96, buf);
4820 lvx(VR15, off112, buf);
4821 addi(buf, buf, 8 * 16);
4822
4823 vxor(VR16, VR0, VR8);
4824 vxor(VR17, VR1, VR9);
4825 vxor(VR18, VR2, VR10);
4826 vxor(VR19, VR3, VR11);
4827 vxor(VR20, VR4, VR12);
4828 vxor(VR21, VR5, VR13);
4829 vxor(VR22, VR6, VR14);
4830 vxor(VR23, VR7, VR15);
4831
4832 li(rLoaded, 1);
4833 cmpdi(CCR0, rIdx, 0);
4834 addi(rIdx, rIdx, 128);
4835 bne(CCR0, L_1);
4836 }
4837
4838 // Work out how many bytes we have left
4839 andi_(len, len, 127);
4840
4841 // Calculate where in the constant table we need to start
4842 subfic(rTmp1, len, 128);
4843 add(constantsPos, constantsPos, rTmp1);
4844
4845 // How many 16 byte chunks are in the tail
4846 srdi(rIdx, len, 4);
4847 mtctr(rIdx);
4848
4849 /*
4850 * Reduce the previously calculated 1024 bits to 64 bits, shifting
4851 * 32 bits to include the trailing 32 bits of zeros
4852 */
4853 lvx(VR0, constantsPos);
4854 lvx(VR1, off16, constantsPos);
4855 lvx(VR2, off32, constantsPos);
4856 lvx(VR3, off48, constantsPos);
4857 lvx(VR4, off64, constantsPos);
4858 lvx(VR5, off80, constantsPos);
4859 lvx(VR6, off96, constantsPos);
4860 lvx(VR7, off112, constantsPos);
4861 addi(constantsPos, constantsPos, 8 * 16);
4862
4863 vpmsumw(VR0, VR16, VR0);
4864 vpmsumw(VR1, VR17, VR1);
4865 vpmsumw(VR2, VR18, VR2);
4866 vpmsumw(VR3, VR19, VR3);
4867 vpmsumw(VR4, VR20, VR4);
4868 vpmsumw(VR5, VR21, VR5);
4869 vpmsumw(VR6, VR22, VR6);
4870 vpmsumw(VR7, VR23, VR7);
4871
4872 // Now reduce the tail (0 - 112 bytes)
4873 cmpdi(CCR0, rIdx, 0);
4874 beq(CCR0, L_XOR);
4875
4876 lvx(VR16, buf); addi(buf, buf, 16);
4877 lvx(VR17, constantsPos);
4878 vpmsumw(VR16, VR16, VR17);
4879 vxor(VR0, VR0, VR16);
4880 bdz(L_XOR);
4881
4882 lvx(VR16, buf); addi(buf, buf, 16);
4883 lvx(VR17, off16, constantsPos);
4884 vpmsumw(VR16, VR16, VR17);
4885 vxor(VR0, VR0, VR16);
4886 bdz(L_XOR);
4887
4888 lvx(VR16, buf); addi(buf, buf, 16);
4889 lvx(VR17, off32, constantsPos);
4890 vpmsumw(VR16, VR16, VR17);
4891 vxor(VR0, VR0, VR16);
4892 bdz(L_XOR);
4893
4894 lvx(VR16, buf); addi(buf, buf, 16);
4895 lvx(VR17, off48, constantsPos);
4896 vpmsumw(VR16, VR16, VR17);
4897 vxor(VR0, VR0, VR16);
4898 bdz(L_XOR);
4899
4900 lvx(VR16, buf); addi(buf, buf, 16);
4901 lvx(VR17, off64, constantsPos);
4902 vpmsumw(VR16, VR16, VR17);
4903 vxor(VR0, VR0, VR16);
4904 bdz(L_XOR);
4905
4906 lvx(VR16, buf); addi(buf, buf, 16);
4907 lvx(VR17, off80, constantsPos);
4908 vpmsumw(VR16, VR16, VR17);
4909 vxor(VR0, VR0, VR16);
4910 bdz(L_XOR);
4911
4912 lvx(VR16, buf); addi(buf, buf, 16);
4913 lvx(VR17, off96, constantsPos);
4914 vpmsumw(VR16, VR16, VR17);
4915 vxor(VR0, VR0, VR16);
4916
4917 // Now xor all the parallel chunks together
4918 BIND(L_XOR);
4919 vxor(VR0, VR0, VR1);
4920 vxor(VR2, VR2, VR3);
4921 vxor(VR4, VR4, VR5);
4922 vxor(VR6, VR6, VR7);
4923
4924 vxor(VR0, VR0, VR2);
4925 vxor(VR4, VR4, VR6);
4926
4927 vxor(VR0, VR0, VR4);
4928
4929 b(L_barrett_reduction);
4930
4931 BIND(L_first_warm_up_done);
4932 lvx(const1, constantsPos);
4933 addi(constantsPos, constantsPos, 16);
4934 vpmsumd(VR8, VR16, const1);
4935 vpmsumd(VR9, VR17, const1);
4936 vpmsumd(VR10, VR18, const1);
4937 vpmsumd(VR11, VR19, const1);
4938 vpmsumd(VR12, VR20, const1);
4939 vpmsumd(VR13, VR21, const1);
4940 vpmsumd(VR14, VR22, const1);
4941 vpmsumd(VR15, VR23, const1);
4942 b(L_second_cool_down);
4943
4944 BIND(L_barrett_reduction);
4945
4946 lvx(const1, barretConstants);
4947 addi(barretConstants, barretConstants, 16);
4948 lvx(const2, barretConstants);
4949
4950 vsldoi(VR1, VR0, VR0, 8);
4951 vxor(VR0, VR0, VR1); // xor two 64 bit results together
4952
4953 // shift left one bit
4954 vspltisb(VR1, 1);
4955 vsl(VR0, VR0, VR1);
4956
4957 vand(VR0, VR0, mask_64bit);
4958
4959 /*
4960 * The reflected version of Barrett reduction. Instead of bit
4961 * reflecting our data (which is expensive to do), we bit reflect our
4962 * constants and our algorithm, which means the intermediate data in
4963 * our vector registers goes from 0-63 instead of 63-0. We can reflect
4964 * the algorithm because we don't carry in mod 2 arithmetic.
4965 */
4966 vand(VR1, VR0, mask_32bit); // bottom 32 bits of a
4967 vpmsumd(VR1, VR1, const1); // ma
4968 vand(VR1, VR1, mask_32bit); // bottom 32 bits of ma
4969 vpmsumd(VR1, VR1, const2); // qn
4970 vxor(VR0, VR0, VR1); // a - qn, subtraction is xor in GF(2)
4971
4972 /*
4973 * Since we are bit reflected, the result (i.e. the low 32 bits) is in
4974 * the high 32 bits. We just need to shift it left 4 bytes
4975 * V0 [ 0 1 X 3 ]
4976 * V0 [ 0 X 2 3 ]
4977 */
4978 vsldoi(VR0, VR0, zeroes, 4); // shift result into top 64 bits
4979
4980 // Get it into the crc register
4981 mfvrd(crc, VR0);
4982
4983 BIND(L_end);
4984
4985 offsetInt = 0;
4986 // Restore non-volatile vector registers (frameless).
4987 offsetInt -= 16; li(offset, -16); lvx(VR20, offset, R1_SP);
4988 offsetInt -= 16; addi(offset, offset, -16); lvx(VR21, offset, R1_SP);
4989 offsetInt -= 16; addi(offset, offset, -16); lvx(VR22, offset, R1_SP);
4990 offsetInt -= 16; addi(offset, offset, -16); lvx(VR23, offset, R1_SP);
4991 offsetInt -= 16; addi(offset, offset, -16); lvx(VR24, offset, R1_SP);
4992 offsetInt -= 16; addi(offset, offset, -16); lvx(VR25, offset, R1_SP);
4993 offsetInt -= 16; addi(offset, offset, -16); lvx(VR26, offset, R1_SP);
4994 offsetInt -= 16; addi(offset, offset, -16); lvx(VR27, offset, R1_SP);
4995 offsetInt -= 16; addi(offset, offset, -16); lvx(VR28, offset, R1_SP);
4996 offsetInt -= 8; ld(R22, offsetInt, R1_SP);
4997 offsetInt -= 8; ld(R23, offsetInt, R1_SP);
4998 offsetInt -= 8; ld(R24, offsetInt, R1_SP);
4999 offsetInt -= 8; ld(R25, offsetInt, R1_SP);
5000 offsetInt -= 8; ld(R26, offsetInt, R1_SP);
5001 offsetInt -= 8; ld(R27, offsetInt, R1_SP);
5002 offsetInt -= 8; ld(R28, offsetInt, R1_SP);
5003 offsetInt -= 8; ld(R29, offsetInt, R1_SP);
5004 offsetInt -= 8; ld(R30, offsetInt, R1_SP);
5005 offsetInt -= 8; ld(R31, offsetInt, R1_SP);
5006 }
5007
5008 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp, bool invertCRC) {
5009 assert_different_registers(crc, buf, /* len, not used!! */ table, tmp);
5010
5011 BLOCK_COMMENT("kernel_crc32_singleByte:");
5012 if (invertCRC) {
5013 nand(crc, crc, crc); // 1s complement of crc
5014 }
5015
5016 lbz(tmp, 0, buf); // Byte from buffer, zero-extended.
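// update_byte_crc32 below applies the standard one-byte table step; in C
// terms (reflected CRC-32, a sketch of the intended semantics, not the
// emitted instruction sequence):
//
//   crc = (crc >> 8) ^ table[(crc ^ byte) & 0xff];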
5017 update_byte_crc32(crc, tmp, table);
5018
5019 if (invertCRC) {
5020 nand(crc, crc, crc); // 1s complement of crc
5021 }
5022 }
5023
5024 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
5025 assert_different_registers(crc, val, table);
5026
5027 BLOCK_COMMENT("kernel_crc32_singleByteReg:");
5028 if (invertCRC) {
5029 nand(crc, crc, crc); // 1s complement of crc
5030 }
5031
5032 update_byte_crc32(crc, val, table);
5033
5034 if (invertCRC) {
5035 nand(crc, crc, crc); // 1s complement of crc
5036 }
5037 }
5038
5039 // dest_lo += src1 + src2
5040 // dest_hi += carry from (dest_lo + src1) + carry from (dest_lo + src2)
5041 void MacroAssembler::add2_with_carry(Register dest_hi,
5042 Register dest_lo,
5043 Register src1, Register src2) {
5044 li(R0, 0);
5045 addc(dest_lo, dest_lo, src1);
5046 adde(dest_hi, dest_hi, R0);
5047 addc(dest_lo, dest_lo, src2);
5048 adde(dest_hi, dest_hi, R0);
5049 }
5050
5051 // Multiply 64 bit by 64 bit first loop.
5052 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
5053 Register x_xstart,
5054 Register y, Register y_idx,
5055 Register z,
5056 Register carry,
5057 Register product_high, Register product,
5058 Register idx, Register kdx,
5059 Register tmp) {
5060 // jlong carry, x[], y[], z[];
5061 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
5062 // huge_128 product = y[idx] * x[xstart] + carry;
5063 // z[kdx] = (jlong)product;
5064 // carry = (jlong)(product >>> 64);
5065 // }
5066 // z[xstart] = carry;
5067
5068 Label L_first_loop, L_first_loop_exit;
5069 Label L_one_x, L_one_y, L_multiply;
5070
5071 addic_(xstart, xstart, -1);
5072 blt(CCR0, L_one_x); // Special case: length of x is 1.
5073
5074 // Load next two integers of x.
5075 sldi(tmp, xstart, LogBytesPerInt);
5076 ldx(x_xstart, x, tmp);
5077 #ifdef VM_LITTLE_ENDIAN
5078 rldicl(x_xstart, x_xstart, 32, 0);
5079 #endif
5080
5081 align(32, 16);
5082 bind(L_first_loop);
5083
5084 cmpdi(CCR0, idx, 1);
5085 blt(CCR0, L_first_loop_exit);
5086 addi(idx, idx, -2);
5087 beq(CCR0, L_one_y);
5088
5089 // Load next two integers of y.
5090 sldi(tmp, idx, LogBytesPerInt);
5091 ldx(y_idx, y, tmp);
5092 #ifdef VM_LITTLE_ENDIAN
5093 rldicl(y_idx, y_idx, 32, 0);
5094 #endif
5095
5096
5097 bind(L_multiply);
5098 multiply64(product_high, product, x_xstart, y_idx);
5099
5100 li(tmp, 0);
5101 addc(product, product, carry); // Add carry to result.
5102 adde(product_high, product_high, tmp); // Add carry of the last addition.
5103 addi(kdx, kdx, -2);
5104
5105 // Store result.
5106 #ifdef VM_LITTLE_ENDIAN
5107 rldicl(product, product, 32, 0);
5108 #endif
5109 sldi(tmp, kdx, LogBytesPerInt);
5110 stdx(product, z, tmp);
5111 mr_if_needed(carry, product_high);
5112 b(L_first_loop);
5113
5114
5115 bind(L_one_y); // Load one 32 bit portion of y as (0,value).
5116
5117 lwz(y_idx, 0, y);
5118 b(L_multiply);
5119
5120
5121 bind(L_one_x); // Load one 32 bit portion of x as (0,value).
5122
5123 lwz(x_xstart, 0, x);
5124 b(L_first_loop);
5125
5126 bind(L_first_loop_exit);
5127 }
5128
5129 // Multiply 64 bit by 64 bit and add 128 bit.
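// In C terms, the routine below performs one 64x64+128 column step of the
// schoolbook multiplication (a sketch assuming a 128-bit integer type is
// available; 'offset' selects the 4-byte-shifted variant):
//
//   unsigned __int128 p = (unsigned __int128)x_xstart * y[idx]
//                       + z[idx] + carry;
//   z[idx]       = (uint64_t)p;          // low 64 bits go back to memory
//   product_high = (uint64_t)(p >> 64);  // high 64 bits become the next carry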
5130 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, 5131 Register z, Register yz_idx, 5132 Register idx, Register carry, 5133 Register product_high, Register product, 5134 Register tmp, int offset) { 5135 5136 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry; 5137 // z[kdx] = (jlong)product; 5138 5139 sldi(tmp, idx, LogBytesPerInt); 5140 if (offset) { 5141 addi(tmp, tmp, offset); 5142 } 5143 ldx(yz_idx, y, tmp); 5144 #ifdef VM_LITTLE_ENDIAN 5145 rldicl(yz_idx, yz_idx, 32, 0); 5146 #endif 5147 5148 multiply64(product_high, product, x_xstart, yz_idx); 5149 ldx(yz_idx, z, tmp); 5150 #ifdef VM_LITTLE_ENDIAN 5151 rldicl(yz_idx, yz_idx, 32, 0); 5152 #endif 5153 5154 add2_with_carry(product_high, product, carry, yz_idx); 5155 5156 sldi(tmp, idx, LogBytesPerInt); 5157 if (offset) { 5158 addi(tmp, tmp, offset); 5159 } 5160 #ifdef VM_LITTLE_ENDIAN 5161 rldicl(product, product, 32, 0); 5162 #endif 5163 stdx(product, z, tmp); 5164 } 5165 5166 // Multiply 128 bit by 128 bit. Unrolled inner loop. 5167 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, 5168 Register y, Register z, 5169 Register yz_idx, Register idx, Register carry, 5170 Register product_high, Register product, 5171 Register carry2, Register tmp) { 5172 5173 // jlong carry, x[], y[], z[]; 5174 // int kdx = ystart+1; 5175 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 5176 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry; 5177 // z[kdx+idx+1] = (jlong)product; 5178 // jlong carry2 = (jlong)(product >>> 64); 5179 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2; 5180 // z[kdx+idx] = (jlong)product; 5181 // carry = (jlong)(product >>> 64); 5182 // } 5183 // idx += 2; 5184 // if (idx > 0) { 5185 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry; 5186 // z[kdx+idx] = (jlong)product; 5187 // carry = (jlong)(product >>> 64); 5188 // } 5189 5190 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 5191 const Register jdx = R0; 5192 5193 // Scale the index. 5194 srdi_(jdx, idx, 2); 5195 beq(CCR0, L_third_loop_exit); 5196 mtctr(jdx); 5197 5198 align(32, 16); 5199 bind(L_third_loop); 5200 5201 addi(idx, idx, -4); 5202 5203 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8); 5204 mr_if_needed(carry2, product_high); 5205 5206 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0); 5207 mr_if_needed(carry, product_high); 5208 bdnz(L_third_loop); 5209 5210 bind(L_third_loop_exit); // Handle any left-over operand parts. 
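// The left-over handling below corresponds to this C sketch: at most three
// 32-bit words of y remain; a pair is consumed by one more 128-bit step, a
// single word by a 32x64 multiply-add whose result is split as
//
//   unsigned __int128 p = (unsigned __int128)y[idx] * x_xstart + z[idx] + carry;
//   z[idx] = (uint32_t)p;            // low 32 bits
//   carry  = (uint64_t)(p >> 32);    // remaining bits carried onward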
5211 5212 andi_(idx, idx, 0x3); 5213 beq(CCR0, L_post_third_loop_done); 5214 5215 Label L_check_1; 5216 5217 addic_(idx, idx, -2); 5218 blt(CCR0, L_check_1); 5219 5220 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0); 5221 mr_if_needed(carry, product_high); 5222 5223 bind(L_check_1); 5224 5225 addi(idx, idx, 0x2); 5226 andi_(idx, idx, 0x1); 5227 addic_(idx, idx, -1); 5228 blt(CCR0, L_post_third_loop_done); 5229 5230 sldi(tmp, idx, LogBytesPerInt); 5231 lwzx(yz_idx, y, tmp); 5232 multiply64(product_high, product, x_xstart, yz_idx); 5233 lwzx(yz_idx, z, tmp); 5234 5235 add2_with_carry(product_high, product, yz_idx, carry); 5236 5237 sldi(tmp, idx, LogBytesPerInt); 5238 stwx(product, z, tmp); 5239 srdi(product, product, 32); 5240 5241 sldi(product_high, product_high, 32); 5242 orr(product, product, product_high); 5243 mr_if_needed(carry, product); 5244 5245 bind(L_post_third_loop_done); 5246 } // multiply_128_x_128_loop 5247 5248 void MacroAssembler::muladd(Register out, Register in, 5249 Register offset, Register len, Register k, 5250 Register tmp1, Register tmp2, Register carry) { 5251 5252 // Labels 5253 Label LOOP, SKIP; 5254 5255 // Make sure length is positive. 5256 cmpdi (CCR0, len, 0); 5257 5258 // Prepare variables 5259 subi (offset, offset, 4); 5260 li (carry, 0); 5261 ble (CCR0, SKIP); 5262 5263 mtctr (len); 5264 subi (len, len, 1 ); 5265 sldi (len, len, 2 ); 5266 5267 // Main loop 5268 bind(LOOP); 5269 lwzx (tmp1, len, in ); 5270 lwzx (tmp2, offset, out ); 5271 mulld (tmp1, tmp1, k ); 5272 add (tmp2, carry, tmp2 ); 5273 add (tmp2, tmp1, tmp2 ); 5274 stwx (tmp2, offset, out ); 5275 srdi (carry, tmp2, 32 ); 5276 subi (offset, offset, 4 ); 5277 subi (len, len, 4 ); 5278 bdnz (LOOP); 5279 bind(SKIP); 5280 } 5281 5282 void MacroAssembler::multiply_to_len(Register x, Register xlen, 5283 Register y, Register ylen, 5284 Register z, Register zlen, 5285 Register tmp1, Register tmp2, 5286 Register tmp3, Register tmp4, 5287 Register tmp5, Register tmp6, 5288 Register tmp7, Register tmp8, 5289 Register tmp9, Register tmp10, 5290 Register tmp11, Register tmp12, 5291 Register tmp13) { 5292 5293 ShortBranchVerifier sbv(this); 5294 5295 assert_different_registers(x, xlen, y, ylen, z, zlen, 5296 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6); 5297 assert_different_registers(x, xlen, y, ylen, z, zlen, 5298 tmp1, tmp2, tmp3, tmp4, tmp5, tmp7); 5299 assert_different_registers(x, xlen, y, ylen, z, zlen, 5300 tmp1, tmp2, tmp3, tmp4, tmp5, tmp8); 5301 5302 const Register idx = tmp1; 5303 const Register kdx = tmp2; 5304 const Register xstart = tmp3; 5305 5306 const Register y_idx = tmp4; 5307 const Register carry = tmp5; 5308 const Register product = tmp6; 5309 const Register product_high = tmp7; 5310 const Register x_xstart = tmp8; 5311 const Register tmp = tmp9; 5312 5313 // First Loop. 
//
5315 // final static long LONG_MASK = 0xffffffffL;
5316 // int xstart = xlen - 1;
5317 // int ystart = ylen - 1;
5318 // long carry = 0;
5319 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
5320 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
5321 // z[kdx] = (int)product;
5322 // carry = product >>> 32;
5323 // }
5324 // z[xstart] = (int)carry;
5325
5326 mr_if_needed(idx, ylen); // idx = ylen
5327 mr_if_needed(kdx, zlen); // kdx = xlen + ylen
5328 li(carry, 0); // carry = 0
5329
5330 Label L_done;
5331
5332 addic_(xstart, xlen, -1);
5333 blt(CCR0, L_done);
5334
5335 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
5336 carry, product_high, product, idx, kdx, tmp);
5337
5338 Label L_second_loop;
5339
5340 cmpdi(CCR0, kdx, 0);
5341 beq(CCR0, L_second_loop);
5342
5343 Label L_carry;
5344
5345 addic_(kdx, kdx, -1);
5346 beq(CCR0, L_carry);
5347
5348 // Store lower 32 bits of carry.
5349 sldi(tmp, kdx, LogBytesPerInt);
5350 stwx(carry, z, tmp);
5351 srdi(carry, carry, 32);
5352 addi(kdx, kdx, -1);
5353
5354
5355 bind(L_carry);
5356
5357 // Store upper 32 bits of carry.
5358 sldi(tmp, kdx, LogBytesPerInt);
5359 stwx(carry, z, tmp);
5360
5361 // Second and third (nested) loops.
5362 //
5363 // for (int i = xstart-1; i >= 0; i--) { // Second loop
5364 // carry = 0;
5365 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
5366 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
5367 // (z[k] & LONG_MASK) + carry;
5368 // z[k] = (int)product;
5369 // carry = product >>> 32;
5370 // }
5371 // z[i] = (int)carry;
5372 // }
5373 //
5374 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = x_xstart
5375
5376 bind(L_second_loop);
5377
5378 li(carry, 0); // carry = 0;
5379
5380 addic_(xstart, xstart, -1); // i = xstart-1;
5381 blt(CCR0, L_done);
5382
5383 Register zsave = tmp10;
5384
5385 mr(zsave, z);
5386
5387
5388 Label L_last_x;
5389
5390 sldi(tmp, xstart, LogBytesPerInt);
5391 add(z, z, tmp); // z = z + k - j
5392 addi(z, z, 4);
5393 addic_(xstart, xstart, -1); // i = xstart-1;
5394 blt(CCR0, L_last_x);
5395
5396 sldi(tmp, xstart, LogBytesPerInt);
5397 ldx(x_xstart, x, tmp);
5398 #ifdef VM_LITTLE_ENDIAN
5399 rldicl(x_xstart, x_xstart, 32, 0);
5400 #endif
5401
5402
5403 Label L_third_loop_prologue;
5404
5405 bind(L_third_loop_prologue);
5406
5407 Register xsave = tmp11;
5408 Register xlensave = tmp12;
5409 Register ylensave = tmp13;
5410
5411 mr(xsave, x);
5412 mr(xlensave, xstart);
5413 mr(ylensave, ylen);
5414
5415
5416 multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
5417 carry, product_high, product, x, tmp);
5418
5419 mr(z, zsave);
5420 mr(x, xsave);
5421 mr(xlen, xlensave); // This is the decrement of the loop counter!
5422 mr(ylen, ylensave);
5423
5424 addi(tmp3, xlen, 1);
5425 sldi(tmp, tmp3, LogBytesPerInt);
5426 stwx(carry, z, tmp);
5427 addic_(tmp3, tmp3, -1);
5428 blt(CCR0, L_done);
5429
5430 srdi(carry, carry, 32);
5431 sldi(tmp, tmp3, LogBytesPerInt);
5432 stwx(carry, z, tmp);
5433 b(L_second_loop);
5434
5435 // Next infrequent code is moved outside loops.
5436 bind(L_last_x);
5437
5438 lwz(x_xstart, 0, x);
5439 b(L_third_loop_prologue);
5440
5441 bind(L_done);
5442 } // multiply_to_len
5443
5444 void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
5445 #ifdef ASSERT
5446 Label ok;
5447 if (check_equal) {
5448 beq(CCR0, ok);
5449 } else {
5450 bne(CCR0, ok);
5451 }
5452 stop(msg, id);
5453 bind(ok);
5454 #endif
5455 }
5456
5457 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
5458 Register mem_base, const char* msg, int id) {
5459 #ifdef ASSERT
5460 switch (size) {
5461 case 4:
5462 lwz(R0, mem_offset, mem_base);
5463 cmpwi(CCR0, R0, 0);
5464 break;
5465 case 8:
5466 ld(R0, mem_offset, mem_base);
5467 cmpdi(CCR0, R0, 0);
5468 break;
5469 default:
5470 ShouldNotReachHere();
5471 }
5472 asm_assert(check_equal, msg, id);
5473 #endif // ASSERT
5474 }
5475
5476 void MacroAssembler::verify_thread() {
5477 if (VerifyThread) {
5478 unimplemented("'VerifyThread' currently not implemented on PPC");
5479 }
5480 }
5481
5482 // Reads oop; kills R0 and possibly the volatile float registers.
5483 void MacroAssembler::verify_oop(Register oop, const char* msg) {
5484 if (!VerifyOops) {
5485 return;
5486 }
5487
5488 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
5489 const Register tmp = R11; // Will be preserved.
5490 const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
5491 save_volatile_gprs(R1_SP, -nbytes_save); // except R0
5492
5493 mr_if_needed(R4_ARG2, oop);
5494 save_LR_CR(tmp); // save in old frame
5495 push_frame_reg_args(nbytes_save, tmp);
5496 // load FunctionDescriptor** / entry_address *
5497 load_const_optimized(tmp, fd, R0);
5498 // load FunctionDescriptor* / entry_address
5499 ld(tmp, 0, tmp);
5500 load_const_optimized(R3_ARG1, (address)msg, R0);
5501 // Call destination for its side effect.
5502 call_c(tmp);
5503
5504 pop_frame();
5505 restore_LR_CR(tmp);
5506 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
5507 }
5508
5509 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
5510 if (!VerifyOops) {
5511 return;
5512 }
5513
5514 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
5515 const Register tmp = R11; // Will be preserved.
5516 const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
5517 save_volatile_gprs(R1_SP, -nbytes_save); // except R0
5518
5519 ld(R4_ARG2, offs, base);
5520 save_LR_CR(tmp); // save in old frame
5521 push_frame_reg_args(nbytes_save, tmp);
5522 // load FunctionDescriptor** / entry_address *
5523 load_const_optimized(tmp, fd, R0);
5524 // load FunctionDescriptor* / entry_address
5525 ld(tmp, 0, tmp);
5526 load_const_optimized(R3_ARG1, (address)msg, R0);
5527 // Call destination for its side effect.
5528 call_c(tmp);
5529
5530 pop_frame();
5531 restore_LR_CR(tmp);
5532 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
5533 }
5534
5535 const char* stop_types[] = {
5536 "stop",
5537 "untested",
5538 "unimplemented",
5539 "shouldnotreachhere"
5540 };
5541
5542 static void stop_on_request(int tp, const char* msg) {
5543 tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg);
5544 guarantee(false, "PPC assembly code requires stop: %s", msg);
5545 }
5546
5547 // Call a C-function that prints output.
5548 void MacroAssembler::stop(int type, const char* msg, int id) { 5549 #ifndef PRODUCT 5550 block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg)); 5551 #else 5552 block_comment("stop {"); 5553 #endif 5554 5555 // setup arguments 5556 load_const_optimized(R3_ARG1, type); 5557 load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0); 5558 call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2); 5559 illtrap(); 5560 emit_int32(id); 5561 block_comment("} stop;"); 5562 } 5563 5564 #ifndef PRODUCT 5565 // Write pattern 0x0101010101010101 in memory region [low-before, high+after]. 5566 // Val, addr are temp registers. 5567 // If low == addr, addr is killed. 5568 // High is preserved. 5569 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) { 5570 if (!ZapMemory) return; 5571 5572 assert_different_registers(low, val); 5573 5574 BLOCK_COMMENT("zap memory region {"); 5575 load_const_optimized(val, 0x0101010101010101); 5576 int size = before + after; 5577 if (low == high && size < 5 && size > 0) { 5578 int offset = -before*BytesPerWord; 5579 for (int i = 0; i < size; ++i) { 5580 std(val, offset, low); 5581 offset += (1*BytesPerWord); 5582 } 5583 } else { 5584 addi(addr, low, -before*BytesPerWord); 5585 assert_different_registers(high, val); 5586 if (after) addi(high, high, after * BytesPerWord); 5587 Label loop; 5588 bind(loop); 5589 std(val, 0, addr); 5590 addi(addr, addr, 8); 5591 cmpd(CCR6, addr, high); 5592 ble(CCR6, loop); 5593 if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value. 5594 } 5595 BLOCK_COMMENT("} zap memory region"); 5596 } 5597 5598 #endif // !PRODUCT 5599 5600 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp, 5601 const bool* flag_addr, Label& label) { 5602 int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true); 5603 assert(sizeof(bool) == 1, "PowerPC ABI"); 5604 masm->lbz(temp, simm16_offset, temp); 5605 masm->cmpwi(CCR0, temp, 0); 5606 masm->beq(CCR0, label); 5607 } 5608 5609 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() { 5610 skip_to_label_if_equal_zero(masm, temp, flag_addr, _label); 5611 } 5612 5613 SkipIfEqualZero::~SkipIfEqualZero() { 5614 _masm->bind(_label); 5615 }
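// Typical use of SkipIfEqualZero (illustrative sketch; SomeBoolFlag is a
// hypothetical VM flag): the constructor emits a branch that is taken when
// *flag_addr is zero, and the destructor binds the branch target, so the
// enclosed code is executed at runtime only if the flag is set.
//
//   {
//     SkipIfEqualZero skip(masm, R11_scratch1, &SomeBoolFlag);
//     // ... code emitted here runs only when SomeBoolFlag is true ...
//   } // ~SkipIfEqualZero binds the skip label here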